/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/me_cmp.h"
#include "asm.h"

int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);

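/* Byte-wise average of two quadwords, rounded up: per byte lane this
 * computes (a + b + 1) >> 1, using the identity
 * (a | b) - ((a ^ b) >> 1) == (a + b + 1) >> 1. The 0xfe mask clears
 * the low bit of each lane before the shift so no bit crosses into
 * the neighbouring byte. */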
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

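/* Byte-wise average of four quadwords with rounding: per byte lane this
 * computes (l1 + l2 + l3 + l4 + 2) >> 2. r1 sums the top six bits of
 * each lane (shifted down first so the partial sums cannot overflow
 * into the next lane); r2 sums the low two bits plus the rounding
 * constant. Example for one lane with all four inputs equal to 3:
 * r1 = 0 and r2 = ((4 * 3 + 2) >> 2) & 3 = 3, matching (12 + 2) >> 2. */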
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

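/* Sum of absolute differences for an 8-pixel-wide block. perr() maps to
 * the MVI PERR instruction, which sums the absolute differences of the
 * eight byte pairs in two quadwords, so each iteration handles one full
 * row. pix1 is assumed 8-byte aligned; pix2 may be unaligned, in which
 * case uldq() performs the unaligned load. */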
static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}

#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

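/* SAD against the horizontally half-pel interpolated ("x2") reference:
 * each reference pixel is averaged with its right-hand neighbour before
 * comparison, so 17 bytes per row are needed. Three cases: an aligned
 * pix2, where the shifted row is built with plain shifts; a
 * misalignment of 7, where the row shifted by one byte is itself
 * aligned; and the general case, which extracts both byte offsets with
 * extql()/extqh(). */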
static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}

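/* SAD against the vertically half-pel interpolated ("y2") reference:
 * each reference row is averaged with the row below it. The current
 * row is carried over in p2_l/p2_r so every reference quadword is
 * loaded only once; unaligned rows are assembled from three ldq_u()
 * loads combined with extql()/extqh(). */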
static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}

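/* SAD against the reference interpolated half-pel in both directions
 * ("xy2"): each reference pixel is the rounded avg4() of its 2x2
 * neighbourhood. p2_x carries the 17th byte of a row in the top byte
 * of a quadword so the rightmost lane has a horizontal neighbour. */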
static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}

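/* Runtime selection: install the MVI routines only if the CPU has the
 * motion-video extension. pix_abs[0][*] are the 16-pixel-wide variants
 * (full-pel, x2, y2, xy2); pix_abs[1][0] is the 8-pixel-wide full-pel
 * variant, and the full-pel routines double as the SAD functions. */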
av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
{
    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->sad[0]        = pix_abs16x16_mvi_asm;
        c->sad[1]        = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }
}