FFmpeg  4.0
hpeldsp_altivec.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
29 
30 #include "libavcodec/hpeldsp.h"
31 
32 #include "hpeldsp_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 /* next one assumes that ((line_size % 16) == 0) */
36 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
37 {
38  register vector unsigned char pixelsv1;
39  register vector unsigned char pixelsv1B;
40  register vector unsigned char pixelsv1C;
41  register vector unsigned char pixelsv1D;
42 
43  int i;
44  register ptrdiff_t line_size_2 = line_size << 1;
45  register ptrdiff_t line_size_3 = line_size + line_size_2;
46  register ptrdiff_t line_size_4 = line_size << 2;
47 
48 // hand-unrolling the loop by 4 gains about 15%
49 // mininum execution time goes from 74 to 60 cycles
50 // it's faster than -funroll-loops, but using
51 // -funroll-loops w/ this is bad - 74 cycles again.
52 // all this is on a 7450, tuning for the 7450
53  for (i = 0; i < h; i += 4) {
54  pixelsv1 = unaligned_load( 0, pixels);
55  pixelsv1B = unaligned_load(line_size, pixels);
56  pixelsv1C = unaligned_load(line_size_2, pixels);
57  pixelsv1D = unaligned_load(line_size_3, pixels);
58  VEC_ST(pixelsv1, 0, (unsigned char*)block);
59  VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60  VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61  VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
62  pixels+=line_size_4;
63  block +=line_size_4;
64  }
65 }
66 
67 /* next one assumes that ((line_size % 16) == 0) */
68 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
69 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
70 {
71  register vector unsigned char pixelsv, blockv;
72 
73  int i;
74  for (i = 0; i < h; i++) {
75  blockv = vec_ld(0, block);
76  pixelsv = VEC_LD( 0, pixels);
77  blockv = vec_avg(blockv,pixelsv);
78  vec_st(blockv, 0, (unsigned char*)block);
79  pixels+=line_size;
80  block +=line_size;
81  }
82 }
83 
84 /* next one assumes that ((line_size % 8) == 0) */
85 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
86 {
87  register vector unsigned char pixelsv, blockv;
88  int i;
89 
90  for (i = 0; i < h; i++) {
91  /* block is 8 bytes-aligned, so we're either in the
92  left block (16 bytes-aligned) or in the right block (not) */
93  int rightside = ((unsigned long)block & 0x0000000F);
94 
95  blockv = vec_ld(0, block);
96  pixelsv = VEC_LD( 0, pixels);
97 
98  if (rightside) {
99  pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
100  } else {
101  pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
102  }
103 
104  blockv = vec_avg(blockv, pixelsv);
105 
106  vec_st(blockv, 0, block);
107 
108  pixels += line_size;
109  block += line_size;
110  }
111 }
112 
113 /* next one assumes that ((line_size % 8) == 0) */
114 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
115 {
116  register int i;
117  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
118  register vector unsigned char blockv;
119  register vector unsigned short pixelssum1, pixelssum2, temp3;
120  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
121  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
122 
123  pixelsv1 = VEC_LD(0, pixels);
124  pixelsv2 = VEC_LD(1, pixels);
125  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
126  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
127 
128  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
129  (vector unsigned short)pixelsv2);
130  pixelssum1 = vec_add(pixelssum1, vctwo);
131 
132  for (i = 0; i < h ; i++) {
133  int rightside = ((unsigned long)block & 0x0000000F);
134  blockv = vec_ld(0, block);
135 
136  pixelsv1 = unaligned_load(line_size, pixels);
137  pixelsv2 = unaligned_load(line_size+1, pixels);
138  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
139  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
140  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
141  (vector unsigned short)pixelsv2);
142  temp3 = vec_add(pixelssum1, pixelssum2);
143  temp3 = vec_sra(temp3, vctwo);
144  pixelssum1 = vec_add(pixelssum2, vctwo);
145  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
146 
147  if (rightside) {
148  blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
149  } else {
150  blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
151  }
152 
153  vec_st(blockv, 0, block);
154 
155  block += line_size;
156  pixels += line_size;
157  }
158 }
159 
160 /* next one assumes that ((line_size % 8) == 0) */
161 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
162 {
163  register int i;
164  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
165  register vector unsigned char blockv;
166  register vector unsigned short pixelssum1, pixelssum2, temp3;
167  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
168  register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
169  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
170 
171  pixelsv1 = VEC_LD(0, pixels);
172  pixelsv2 = VEC_LD(1, pixels);
173  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
174  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
175  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
176  (vector unsigned short)pixelsv2);
177  pixelssum1 = vec_add(pixelssum1, vcone);
178 
179  for (i = 0; i < h ; i++) {
180  int rightside = ((unsigned long)block & 0x0000000F);
181  blockv = vec_ld(0, block);
182 
183  pixelsv1 = unaligned_load(line_size, pixels);
184  pixelsv2 = unaligned_load(line_size+1, pixels);
185  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
186  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
187  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
188  (vector unsigned short)pixelsv2);
189  temp3 = vec_add(pixelssum1, pixelssum2);
190  temp3 = vec_sra(temp3, vctwo);
191  pixelssum1 = vec_add(pixelssum2, vcone);
192  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
193 
194  if (rightside) {
195  blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
196  } else {
197  blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
198  }
199 
200  vec_st(blockv, 0, block);
201 
202  block += line_size;
203  pixels += line_size;
204  }
205 }
206 
207 /* next one assumes that ((line_size % 16) == 0) */
208 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
209 {
210  register int i;
211  register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
212  register vector unsigned char blockv;
213  register vector unsigned short temp3, temp4,
214  pixelssum1, pixelssum2, pixelssum3, pixelssum4;
215  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
216  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
217 
218  pixelsv1 = VEC_LD(0, pixels);
219  pixelsv2 = VEC_LD(1, pixels);
220  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
221  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
222  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
223  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
224  pixelssum3 = vec_add((vector unsigned short)pixelsv3,
225  (vector unsigned short)pixelsv4);
226  pixelssum3 = vec_add(pixelssum3, vctwo);
227  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
228  (vector unsigned short)pixelsv2);
229  pixelssum1 = vec_add(pixelssum1, vctwo);
230 
231  for (i = 0; i < h ; i++) {
232  blockv = vec_ld(0, block);
233 
234  pixelsv1 = unaligned_load(line_size, pixels);
235  pixelsv2 = unaligned_load(line_size+1, pixels);
236 
237  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
238  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
239  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
240  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
241  pixelssum4 = vec_add((vector unsigned short)pixelsv3,
242  (vector unsigned short)pixelsv4);
243  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
244  (vector unsigned short)pixelsv2);
245  temp4 = vec_add(pixelssum3, pixelssum4);
246  temp4 = vec_sra(temp4, vctwo);
247  temp3 = vec_add(pixelssum1, pixelssum2);
248  temp3 = vec_sra(temp3, vctwo);
249 
250  pixelssum3 = vec_add(pixelssum4, vctwo);
251  pixelssum1 = vec_add(pixelssum2, vctwo);
252 
253  blockv = vec_packsu(temp3, temp4);
254 
255  vec_st(blockv, 0, block);
256 
257  block += line_size;
258  pixels += line_size;
259  }
260 }
261 
262 /* next one assumes that ((line_size % 16) == 0) */
263 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
264 {
265  register int i;
266  register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
267  register vector unsigned char blockv;
268  register vector unsigned short temp3, temp4,
269  pixelssum1, pixelssum2, pixelssum3, pixelssum4;
270  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
271  register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
272  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
273 
274  pixelsv1 = VEC_LD(0, pixels);
275  pixelsv2 = VEC_LD(1, pixels);
276  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
277  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
278  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
279  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
280  pixelssum3 = vec_add((vector unsigned short)pixelsv3,
281  (vector unsigned short)pixelsv4);
282  pixelssum3 = vec_add(pixelssum3, vcone);
283  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
284  (vector unsigned short)pixelsv2);
285  pixelssum1 = vec_add(pixelssum1, vcone);
286 
287  for (i = 0; i < h ; i++) {
288  pixelsv1 = unaligned_load(line_size, pixels);
289  pixelsv2 = unaligned_load(line_size+1, pixels);
290 
291  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
292  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
293  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
294  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
295  pixelssum4 = vec_add((vector unsigned short)pixelsv3,
296  (vector unsigned short)pixelsv4);
297  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
298  (vector unsigned short)pixelsv2);
299  temp4 = vec_add(pixelssum3, pixelssum4);
300  temp4 = vec_sra(temp4, vctwo);
301  temp3 = vec_add(pixelssum1, pixelssum2);
302  temp3 = vec_sra(temp3, vctwo);
303 
304  pixelssum3 = vec_add(pixelssum4, vcone);
305  pixelssum1 = vec_add(pixelssum2, vcone);
306 
307  blockv = vec_packsu(temp3, temp4);
308 
309  VEC_ST(blockv, 0, block);
310 
311  block += line_size;
312  pixels += line_size;
313  }
314 }
315 
316 /* next one assumes that ((line_size % 8) == 0) */
317 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
318 {
319  register int i;
320  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
321  register vector unsigned char blockv, blocktemp;
322  register vector unsigned short pixelssum1, pixelssum2, temp3;
323 
324  register const vector unsigned char vczero = (const vector unsigned char)
325  vec_splat_u8(0);
326  register const vector unsigned short vctwo = (const vector unsigned short)
327  vec_splat_u16(2);
328 
329  pixelsv1 = VEC_LD(0, pixels);
330  pixelsv2 = VEC_LD(1, pixels);
331  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
332  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
333  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
334  (vector unsigned short)pixelsv2);
335  pixelssum1 = vec_add(pixelssum1, vctwo);
336 
337  for (i = 0; i < h ; i++) {
338  int rightside = ((unsigned long)block & 0x0000000F);
339  blockv = vec_ld(0, block);
340 
341  pixelsv1 = unaligned_load(line_size, pixels);
342  pixelsv2 = unaligned_load(line_size+1, pixels);
343 
344  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
345  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
346  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
347  (vector unsigned short)pixelsv2);
348  temp3 = vec_add(pixelssum1, pixelssum2);
349  temp3 = vec_sra(temp3, vctwo);
350  pixelssum1 = vec_add(pixelssum2, vctwo);
351  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
352 
353  if (rightside) {
354  blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
355  } else {
356  blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
357  }
358 
359  blockv = vec_avg(blocktemp, blockv);
360  vec_st(blockv, 0, block);
361 
362  block += line_size;
363  pixels += line_size;
364  }
365 }
366 #endif /* HAVE_ALTIVEC */
367 
/*
 * Body of the routine that installs the AltiVec functions defined above into
 * the function-pointer tables of c.
 *
 * NOTE(review): this listing looks incomplete. The embedded numbering jumps
 * over 368, 371, 374, 378 and 382, so the function header, whatever guarded
 * the `return;` below, and the first statement of each assignment group
 * appear to have been lost in extraction. As shown, `return;` is
 * unconditional and none of the assignments after it can execute. Restore
 * the missing lines from the original file before relying on this block.
 */
369 {
370 #if HAVE_ALTIVEC
/* NOTE(review): presumably an early exit taken only under some condition
 * that is no longer visible here - confirm against the original file. */
372  return;
373 
/* Averaging variants. */
375  c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
376  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
377 
/* Plain (rounding) xy2 variants. */
379  c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
380  c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
381 
/* No-rounding xy2 variants. */
383  c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
384  c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
385 #endif /* HAVE_ALTIVEC */
386 }
op_pixels_func avg_pixels_tab[4][4]
Halfpel motion compensation with rounding (a+b+1)>>1.
Definition: hpeldsp.h:68
Macro definitions for various function/variable attributes.
static int16_t block[64]
Definition: dct.c:115
uint8_t
#define av_cold
Definition: attributes.h:82
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
static int flags
Definition: log.c:55
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
#define s0
Definition: regdef.h:37
Half-pel DSP context.
Definition: hpeldsp.h:45
Half-pel DSP functions.
op_pixels_func put_pixels_tab[4][4]
Halfpel motion compensation with rounding (a+b+1)>>1.
Definition: hpeldsp.h:56
op_pixels_func put_no_rnd_pixels_tab[4][4]
Halfpel motion compensation with no rounding (a+b)>>1.
Definition: hpeldsp.h:82
#define s1
Definition: regdef.h:38
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
Contains misc utility macros and inline functions.
static double c[64]
int pixels
Definition: avisynth_c.h:429
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)