TUT HEVC Encoder
src/strategies/avx2/reg_sad_pow2_widths-avx2.h
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/
#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_
#define REG_SAD_POW2_WIDTHS_AVX2_H_

#include "kvazaar.h"

#if KVZ_BIT_DEPTH == 8

#include "strategies/sse41/reg_sad_pow2_widths-sse41.h"
static INLINE uint32_t reg_sad_w32(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m256i avx_inc = _mm256_setzero_si256();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
    __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
    __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1));
    __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2));
    __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 2) * stride1));
    __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 2) * stride2));
    __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 3) * stride1));
    __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 3) * stride2));

    __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
    __m256i curr_sads_cd = _mm256_sad_epu8(c, d);
    __m256i curr_sads_ef = _mm256_sad_epu8(e, f);
    __m256i curr_sads_gh = _mm256_sad_epu8(g, h);

    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));

      __m256i curr_sads = _mm256_sad_epu8(a, b);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
    }
  }

  __m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i inclo = _mm256_castsi256_si128  (avx_inc);

  __m128i sum_1 = _mm_add_epi64(inclo, inchi);
  __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad   = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
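
/* For reference, reg_sad_w32 above is a vectorized form of the plain scalar
 * SAD sketched below. The sketch is illustrative only (the name
 * reg_sad_w32_scalar is hypothetical, not part of Kvazaar): each
 * _mm256_sad_epu8 call above replaces 32 iterations of the inner loop, and
 * avx_inc carries four 64-bit partial sums that are folded together at the
 * end. */
static uint32_t reg_sad_w32_scalar(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  uint32_t sum = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < 32; x++) {
      // Absolute difference of one pixel pair
      int32_t diff = (int32_t)data1[y * stride1 + x] - (int32_t)data2[y * stride2 + x];
      sum += (uint32_t)(diff < 0 ? -diff : diff);
    }
  }
  return sum;
}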
static INLINE uint32_t reg_sad_w64(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m256i avx_inc = _mm256_setzero_si256();
  int32_t y;

  const int32_t height_twoline_groups = height & ~1;
  const int32_t height_residual_lines = height &  1;

  for (y = 0; y < height_twoline_groups; y += 2) {
    __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
    __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
    __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32));
    __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32));

    __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1));
    __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2));
    __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1 + 32));
    __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2 + 32));

    __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
    __m256i curr_sads_cd = _mm256_sad_epu8(c, d);
    __m256i curr_sads_ef = _mm256_sad_epu8(e, f);
    __m256i curr_sads_gh = _mm256_sad_epu8(g, h);

    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
      __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32));
      __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32));

      __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
      __m256i curr_sads_cd = _mm256_sad_epu8(c, d);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    }
  }

  __m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i inclo = _mm256_castsi256_si128  (avx_inc);

  __m128i sum_1 = _mm_add_epi64(inclo, inchi);
  __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad   = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
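
/* Both functions above (and hor_sad_avx2_w32 below) end with the same
 * horizontal reduction: after the loop, avx_inc holds four 64-bit partial
 * SADs, one per 64-bit element of the 256-bit accumulator. A hypothetical
 * helper spelling out that epilogue step by step (the name hsum_avx_inc is
 * illustrative, not part of the original file): */
static INLINE uint32_t hsum_avx_inc(__m256i avx_inc)
{
  __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); // upper two partial sums
  __m128i inclo = _mm256_castsi256_si128  (avx_inc);    // lower two partial sums

  __m128i sum_1 = _mm_add_epi64(inclo, inchi);          // two 64-bit sums remain
  __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); // swap the 64-bit halves
  __m128i sad   = _mm_add_epi64(sum_1, sum_2);          // grand total in both halves

  // The SAD of an 8-bit block fits comfortably in 32 bits, so the low
  // 32 bits of the 64-bit total are returned.
  return _mm_cvtsi128_si32(sad);
}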
static uint32_t hor_sad_avx2_w32(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 const uint32_t left, const uint32_t right)
{
  __m256i avx_inc = _mm256_setzero_si256();

  const size_t block_width      = 32;
  const size_t block_width_log2 = 5;
  const size_t lane_width       = 16;

  const int32_t left_eq_wid   = left >> block_width_log2;
  const int32_t left_clamped  = left - left_eq_wid;
  const int32_t right_eq_wid  = right >> block_width_log2;
  const int32_t right_clamped = right - right_eq_wid;

  const __m256i zero        = _mm256_setzero_si256();
  const __m256i lane_widths = _mm256_set1_epi8((uint8_t)lane_width);
  const __m256i lefts       = _mm256_set1_epi8((uint8_t)left_clamped);
  const __m256i rights      = _mm256_set1_epi8((uint8_t)right_clamped);
  const __m256i unsign_mask = _mm256_set1_epi8(0x7f);
  const __m256i ns          = _mm256_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
                                               16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

  const __m256i rightmost_good_idx = _mm256_set1_epi8((uint8_t)(block_width - right - 1));

  const __m256i shufmask1_l = _mm256_sub_epi8(ns, lefts);
  const __m256i shufmask1_r = _mm256_add_epi8(shufmask1_l, rights);
  const __m256i shufmask1   = _mm256_and_si256(shufmask1_r, unsign_mask);

  const __m256i epol_mask_r = _mm256_min_epi8(ns, rightmost_good_idx);
  const __m256i epol_mask   = _mm256_max_epi8(lefts, epol_mask_r);

  const __m256i mlo2hi_mask_l  = _mm256_cmpgt_epi8(lefts, ns);
  const __m256i mlo2hi_imask_r = _mm256_cmpgt_epi8(lane_widths, shufmask1);
  const __m256i mlo2hi_mask_r  = _mm256_cmpeq_epi8(mlo2hi_imask_r, zero);

  // For left != 0, use low lane of mlo2hi_mask_l as blend mask for high lane.
  // For right != 0, use low lane of mlo2hi_mask_r as blend mask for low lane.
  const __m256i xchg_mask1 = _mm256_permute2x128_si256(mlo2hi_mask_l, mlo2hi_mask_r, 0x02);

  // If left != 0 (ie. right == 0), the xchg should only affect the high lane;
  // if right != 0 (ie. left == 0), the low lane. Set bits on the lane that
  // the xchg should affect. left == right == 0 should never happen; this will
  // break if it does.
  const __m256i lanes_llo_rhi  = _mm256_blend_epi32(lefts, rights, 0xf0);
  const __m256i xchg_lane_mask = _mm256_cmpeq_epi32(lanes_llo_rhi, zero);

  const __m256i xchg_data_mask = _mm256_and_si256(xchg_mask1, xchg_lane_mask);

  // If we're straddling the left border, start from the left border instead,
  // and if the right border, end on the border.
  const int32_t ld_offset = left - right;

  int32_t y;
  for (y = 0; y < height; y++) {
    __m256i a = _mm256_loadu_si256((__m256i *)(pic_data + (y + 0) * pic_stride + 0));
    __m256i b = _mm256_loadu_si256((__m256i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset));

    __m256i b_shifted         = _mm256_shuffle_epi8(b, shufmask1);
    __m256i b_lanes_reversed  = _mm256_permute4x64_epi64(b_shifted, _MM_SHUFFLE(1, 0, 3, 2));
    __m256i b_data_transfered = _mm256_blendv_epi8(b_shifted, b_lanes_reversed, xchg_data_mask);
    __m256i b_epoled          = _mm256_shuffle_epi8(b_data_transfered, epol_mask);

    __m256i curr_sads_ab = _mm256_sad_epu8(a, b_epoled);

    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
  }
  __m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i inclo = _mm256_castsi256_si128  (avx_inc);

  __m128i sum_1 = _mm_add_epi64(inclo, inchi);
  __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad   = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
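
/* A scalar model of what hor_sad_avx2_w32 appears to compute, inferred from
 * the mask setup above (illustrative sketch with a hypothetical name, not
 * part of the original file). left and right give how many leftmost or
 * rightmost reference columns fall outside the frame; at most one of them is
 * expected to be nonzero. Each out-of-frame column is replaced by the nearest
 * valid border column, i.e. the reference x-coordinate is clamped to
 * [left, 31 - right]; the ld_offset shift and the byte shuffles above cancel
 * out to exactly this clamping. */
static uint32_t hor_sad_w32_scalar_model(const uint8_t *pic_data, const uint8_t *ref_data,
                                         int32_t height, uint32_t pic_stride,
                                         uint32_t ref_stride,
                                         const uint32_t left, const uint32_t right)
{
  uint32_t sum = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < 32; x++) {
      // Clamp the reference column into the valid range [left, 31 - right].
      int32_t xx = x;
      if (xx < (int32_t)left)       xx = (int32_t)left;
      if (xx > 31 - (int32_t)right) xx = 31 - (int32_t)right;

      int32_t diff = (int32_t)pic_data[y * (int32_t)pic_stride + x]
                   - (int32_t)ref_data[y * (int32_t)ref_stride + xx];
      sum += (uint32_t)(diff < 0 ? -diff : diff);
    }
  }
  return sum;
}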
#endif // KVZ_BIT_DEPTH == 8

#endif