/* Prototype-declaration helper: for a given CPU-suffix token, declare the
 * two external assembly implementations of the Hadamard-transform difference
 * metric — ff_hadamard8_diff_<cpu> (one 8x8 block) and
 * ff_hadamard8_diff16_<cpu> (16-wide variant).  Both follow the me_cmp_func
 * signature (context, two pixel pointers, line stride, block height).
 * NOTE(review): this chunk is a lossy extraction — original line numbers are
 * embedded in the text and many interior lines are missing; the trailing
 * "return score1 + FFABS(score2) * 8" below is a stray fragment of an
 * nsse-style scoring function whose surrounding lines were elided. */
81 #define hadamard_func(cpu) \ 82 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 83 uint8_t *src2, ptrdiff_t stride, int h); \ 84 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 85 uint8_t *src2, ptrdiff_t stride, int h); 108 return score1 +
FFABS(score2) * 8;
114 int score1 =
ff_sse8_mmx(c, pix1, pix2, stride, h);
121 return score1 +
FFABS(score2) * 8;
129 ptrdiff_t stride,
int h)
136 #define SUM(in0, in1, out0, out1) \ 137 "movq (%0), %%mm2\n" \ 138 "movq 8(%0), %%mm3\n" \ 140 "movq %%mm2, " #out0 "\n" \ 141 "movq %%mm3, " #out1 "\n" \ 142 "psubusb " #in0 ", %%mm2\n" \ 143 "psubusb " #in1 ", %%mm3\n" \ 144 "psubusb " #out0 ", " #in0 "\n" \ 145 "psubusb " #out1 ", " #in1 "\n" \ 146 "por %%mm2, " #in0 "\n" \ 147 "por %%mm3, " #in1 "\n" \ 148 "movq " #in0 ", %%mm2\n" \ 149 "movq " #in1 ", %%mm3\n" \ 150 "punpcklbw %%mm7, " #in0 "\n" \ 151 "punpcklbw %%mm7, " #in1 "\n" \ 152 "punpckhbw %%mm7, %%mm2\n" \ 153 "punpckhbw %%mm7, %%mm3\n" \ 154 "paddw " #in1 ", " #in0 "\n" \ 155 "paddw %%mm3, %%mm2\n" \ 156 "paddw %%mm2, " #in0 "\n" \ 157 "paddw " #in0 ", %%mm6\n" 162 "pxor %%mm6, %%mm6\n" 163 "pxor %%mm7, %%mm7\n" 165 "movq 8(%0), %%mm1\n" 170 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
172 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
177 "movq %%mm6, %%mm0\n" 179 "paddw %%mm6, %%mm0\n" 180 "movq %%mm0, %%mm6\n" 182 "paddw %%mm6, %%mm0\n" 184 :
"+r" (pix),
"=r" (tmp)
193 ptrdiff_t stride,
int h)
201 #define SUM(in0, in1, out0, out1) \ 202 "movq (%0), %%mm2\n" \ 203 "movq (%1), " #out0 "\n" \ 204 "movq 8(%0), %%mm3\n" \ 205 "movq 8(%1), " #out1 "\n" \ 208 "psubb " #out0 ", %%mm2\n" \ 209 "psubb " #out1 ", %%mm3\n" \ 210 "pxor %%mm7, %%mm2\n" \ 211 "pxor %%mm7, %%mm3\n" \ 212 "movq %%mm2, " #out0 "\n" \ 213 "movq %%mm3, " #out1 "\n" \ 214 "psubusb " #in0 ", %%mm2\n" \ 215 "psubusb " #in1 ", %%mm3\n" \ 216 "psubusb " #out0 ", " #in0 "\n" \ 217 "psubusb " #out1 ", " #in1 "\n" \ 218 "por %%mm2, " #in0 "\n" \ 219 "por %%mm3, " #in1 "\n" \ 220 "movq " #in0 ", %%mm2\n" \ 221 "movq " #in1 ", %%mm3\n" \ 222 "punpcklbw %%mm7, " #in0 "\n" \ 223 "punpcklbw %%mm7, " #in1 "\n" \ 224 "punpckhbw %%mm7, %%mm2\n" \ 225 "punpckhbw %%mm7, %%mm3\n" \ 226 "paddw " #in1 ", " #in0 "\n" \ 227 "paddw %%mm3, %%mm2\n" \ 228 "paddw %%mm2, " #in0 "\n" \ 229 "paddw " #in0 ", %%mm6\n" 234 "pxor %%mm6, %%mm6\n" 235 "pcmpeqw %%mm7, %%mm7\n" 237 "packsswb %%mm7, %%mm7\n" 240 "movq 8(%0), %%mm1\n" 241 "movq 8(%1), %%mm3\n" 244 "psubb %%mm2, %%mm0\n" 245 "psubb %%mm3, %%mm1\n" 246 "pxor %%mm7, %%mm0\n" 247 "pxor %%mm7, %%mm1\n" 251 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
253 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
258 "movq %%mm6, %%mm0\n" 260 "paddw %%mm6, %%mm0\n" 261 "movq %%mm0, %%mm6\n" 263 "paddw %%mm6, %%mm0\n" 265 :
"+r" (pix1),
"+r" (pix2),
"=r" (
tmp)
266 :
"r" (stride),
"m" (
h)
274 0x0000000000000000ULL,
275 0x0001000100010001ULL,
276 0x0002000200020002ULL,
280 ptrdiff_t stride,
int h)
286 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 287 "movq (%2, %%"FF_REG_a
"), %%mm2 \n\t" 288 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t" 289 "add %3, %%"FF_REG_a
" \n\t" 290 "psubusb %%mm0, %%mm2 \n\t" 291 "psubusb %%mm4, %%mm0 \n\t" 292 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t" 293 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t" 294 "movq (%2, %%"FF_REG_a
"), %%mm5 \n\t" 295 "psubusb %%mm1, %%mm3 \n\t" 296 "psubusb %%mm5, %%mm1 \n\t" 297 "por %%mm2, %%mm0 \n\t" 298 "por %%mm1, %%mm3 \n\t" 299 "movq %%mm0, %%mm1 \n\t" 300 "movq %%mm3, %%mm2 \n\t" 301 "punpcklbw %%mm7, %%mm0 \n\t" 302 "punpckhbw %%mm7, %%mm1 \n\t" 303 "punpcklbw %%mm7, %%mm3 \n\t" 304 "punpckhbw %%mm7, %%mm2 \n\t" 305 "paddw %%mm1, %%mm0 \n\t" 306 "paddw %%mm3, %%mm2 \n\t" 307 "paddw %%mm2, %%mm0 \n\t" 308 "paddw %%mm0, %%mm6 \n\t" 309 "add %3, %%"FF_REG_a
" \n\t" 312 :
"r" (blk1 - len),
"r" (blk2 -
len),
"r" (stride));
316 ptrdiff_t stride,
int h)
322 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 323 "movq (%2, %%"FF_REG_a
"), %%mm1 \n\t" 324 "movq (%1, %%"FF_REG_a
"), %%mm2 \n\t" 325 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t" 326 "punpcklbw %%mm7, %%mm0 \n\t" 327 "punpcklbw %%mm7, %%mm1 \n\t" 328 "punpckhbw %%mm7, %%mm2 \n\t" 329 "punpckhbw %%mm7, %%mm3 \n\t" 330 "paddw %%mm0, %%mm1 \n\t" 331 "paddw %%mm2, %%mm3 \n\t" 332 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t" 333 "movq (%3, %%"FF_REG_a
"), %%mm2 \n\t" 334 "paddw %%mm5, %%mm1 \n\t" 335 "paddw %%mm5, %%mm3 \n\t" 336 "psrlw $1, %%mm1 \n\t" 337 "psrlw $1, %%mm3 \n\t" 338 "packuswb %%mm3, %%mm1 \n\t" 339 "psubusb %%mm1, %%mm4 \n\t" 340 "psubusb %%mm2, %%mm1 \n\t" 341 "por %%mm4, %%mm1 \n\t" 342 "movq %%mm1, %%mm0 \n\t" 343 "punpcklbw %%mm7, %%mm0 \n\t" 344 "punpckhbw %%mm7, %%mm1 \n\t" 345 "paddw %%mm1, %%mm0 \n\t" 346 "paddw %%mm0, %%mm6 \n\t" 347 "add %4, %%"FF_REG_a
" \n\t" 350 :
"r" (blk1a - len),
"r" (blk1b -
len),
"r" (blk2 - len),
355 ptrdiff_t stride,
int h)
359 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t" 360 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t" 361 "movq %%mm0, %%mm1 \n\t" 362 "movq %%mm2, %%mm3 \n\t" 363 "punpcklbw %%mm7, %%mm0 \n\t" 364 "punpckhbw %%mm7, %%mm1 \n\t" 365 "punpcklbw %%mm7, %%mm2 \n\t" 366 "punpckhbw %%mm7, %%mm3 \n\t" 367 "paddw %%mm2, %%mm0 \n\t" 368 "paddw %%mm3, %%mm1 \n\t" 371 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t" 372 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t" 373 "movq %%mm2, %%mm3 \n\t" 374 "movq %%mm4, %%mm5 \n\t" 375 "punpcklbw %%mm7, %%mm2 \n\t" 376 "punpckhbw %%mm7, %%mm3 \n\t" 377 "punpcklbw %%mm7, %%mm4 \n\t" 378 "punpckhbw %%mm7, %%mm5 \n\t" 379 "paddw %%mm4, %%mm2 \n\t" 380 "paddw %%mm5, %%mm3 \n\t" 381 "movq %5, %%mm5 \n\t" 382 "paddw %%mm2, %%mm0 \n\t" 383 "paddw %%mm3, %%mm1 \n\t" 384 "paddw %%mm5, %%mm0 \n\t" 385 "paddw %%mm5, %%mm1 \n\t" 386 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t" 387 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t" 388 "psrlw $2, %%mm0 \n\t" 389 "psrlw $2, %%mm1 \n\t" 390 "packuswb %%mm1, %%mm0 \n\t" 391 "psubusb %%mm0, %%mm4 \n\t" 392 "psubusb %%mm5, %%mm0 \n\t" 393 "por %%mm4, %%mm0 \n\t" 394 "movq %%mm0, %%mm4 \n\t" 395 "punpcklbw %%mm7, %%mm0 \n\t" 396 "punpckhbw %%mm7, %%mm4 \n\t" 397 "paddw %%mm0, %%mm6 \n\t" 398 "paddw %%mm4, %%mm6 \n\t" 399 "movq %%mm2, %%mm0 \n\t" 400 "movq %%mm3, %%mm1 \n\t" 401 "add %4, %%"FF_REG_a
" \n\t" 404 :
"r" (blk1 - len),
"r" (blk1 - len +
stride),
"r" (blk2 - len),
405 "r" (
stride),
"m" (round_tab[2]));
408 static inline int sum_mmx(
void)
412 "movq %%mm6, %%mm0 \n\t" 413 "psrlq $32, %%mm6 \n\t" 414 "paddw %%mm0, %%mm6 \n\t" 415 "movq %%mm6, %%mm0 \n\t" 416 "psrlq $16, %%mm6 \n\t" 417 "paddw %%mm0, %%mm6 \n\t" 418 "movd %%mm6, %0 \n\t" 424 ptrdiff_t stride,
int h)
426 sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
430 ptrdiff_t stride,
int h)
432 sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
435 #define PIX_SAD(suf) \ 436 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 437 uint8_t *blk1, ptrdiff_t stride, int h) \ 439 av_assert2(h == 8); \ 441 "pxor %%mm7, %%mm7 \n\t" \ 442 "pxor %%mm6, %%mm6 \n\t" \ 445 sad8_1_ ## suf(blk1, blk2, stride, 8); \ 447 return sum_ ## suf(); \ 450 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 451 uint8_t *blk1, ptrdiff_t stride, int h) \ 453 av_assert2(h == 8); \ 455 "pxor %%mm7, %%mm7 \n\t" \ 456 "pxor %%mm6, %%mm6 \n\t" \ 457 "movq %0, %%mm5 \n\t" \ 458 :: "m" (round_tab[1])); \ 460 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ 462 return sum_ ## suf(); \ 465 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 466 uint8_t *blk1, ptrdiff_t stride, int h) \ 468 av_assert2(h == 8); \ 470 "pxor %%mm7, %%mm7 \n\t" \ 471 "pxor %%mm6, %%mm6 \n\t" \ 472 "movq %0, %%mm5 \n\t" \ 473 :: "m" (round_tab[1])); \ 475 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ 477 return sum_ ## suf(); \ 480 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 481 uint8_t *blk1, ptrdiff_t stride, int h) \ 483 av_assert2(h == 8); \ 485 "pxor %%mm7, %%mm7 \n\t" \ 486 "pxor %%mm6, %%mm6 \n\t" \ 489 sad8_4_ ## suf(blk1, blk2, stride, 8); \ 491 return sum_ ## suf(); \ 494 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 495 uint8_t *blk1, ptrdiff_t stride, int h) \ 498 "pxor %%mm7, %%mm7 \n\t" \ 499 "pxor %%mm6, %%mm6 \n\t" \ 502 sad8_1_ ## suf(blk1, blk2, stride, h); \ 503 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 505 return sum_ ## suf(); \ 508 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 509 uint8_t *blk1, ptrdiff_t stride, int h) \ 512 "pxor %%mm7, %%mm7 \n\t" \ 513 "pxor %%mm6, %%mm6 \n\t" \ 514 "movq %0, %%mm5 \n\t" \ 515 :: "m" (round_tab[1])); \ 517 sad8_x2a_ ## suf(blk1, blk2, stride, h); \ 518 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 520 return sum_ ## suf(); \ 523 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 524 uint8_t *blk1, 
ptrdiff_t stride, int h) \ 527 "pxor %%mm7, %%mm7 \n\t" \ 528 "pxor %%mm6, %%mm6 \n\t" \ 529 "movq %0, %%mm5 \n\t" \ 530 :: "m" (round_tab[1])); \ 532 sad8_y2a_ ## suf(blk1, blk2, stride, h); \ 533 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 535 return sum_ ## suf(); \ 538 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 539 uint8_t *blk1, ptrdiff_t stride, int h) \ 542 "pxor %%mm7, %%mm7 \n\t" \ 543 "pxor %%mm6, %%mm6 \n\t" \ 546 sad8_4_ ## suf(blk1, blk2, stride, h); \ 547 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 549 return sum_ ## suf(); \ 563 c->
pix_abs[0][1] = sad16_x2_mmx;
564 c->
pix_abs[0][2] = sad16_y2_mmx;
565 c->
pix_abs[0][3] = sad16_xy2_mmx;
567 c->
pix_abs[1][1] = sad8_x2_mmx;
568 c->
pix_abs[1][2] = sad8_y2_mmx;
569 c->
pix_abs[1][3] = sad8_xy2_mmx;
571 c->
sad[0] = sad16_mmx;
572 c->
sad[1] = sad8_mmx;
574 c->
vsad[4] = vsad_intra16_mmx;
577 c->
vsad[0] = vsad16_mmx;
590 c->
nsse[0] = nsse16_mmx;
591 c->
nsse[1] = nsse8_mmx;
626 #if HAVE_ALIGNED_STACK 646 #if HAVE_ALIGNED_STACK #define EXTERNAL_MMX(flags)
int ff_sum_abs_dctelem_mmx(int16_t *block)
int(* sum_abs_dctelem)(int16_t *block)
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
static atomic_int cpu_flags
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Macro definitions for various function/variable attributes.
me_cmp_func hadamard8_diff[6]
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
#define hadamard_func(cpu)
int ff_sum_abs_dctelem_sse2(int16_t *block)
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
#define EXTERNAL_SSE2(flags)
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sum_abs_dctelem_ssse3(int16_t *block)
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define INLINE_MMX(flags)
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int flags
AV_CODEC_FLAG_*.
me_cmp_func pix_abs[2][4]
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define DECLARE_ASM_CONST(n, t, v)
Declare a static constant aligned variable appropriate for use in inline assembly code...
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
int ff_sum_abs_dctelem_mmxext(int16_t *block)
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
main external API structure.
#define EXTERNAL_SSSE3(flags)
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
#define EXTERNAL_MMXEXT(flags)
struct AVCodecContext * avctx
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)