/*
 * MOVNTQ / MOVNTQ2 select the 64-bit store mnemonic spliced into the asm
 * strings of this file: movntq in the definitions that follow
 * if COMPILE_TEMPLATE_MMXEXT, movq in the second pair of definitions.
 * MOVNTQ(a,b) forwards to REAL_MOVNTQ so that its arguments are
 * macro-expanded before the # operator stringifies them.
 *
 * The asm text that follows:
 *  - loads 8 bytes from (%0) into mm3/mm4 and widens them to words against a
 *    zeroed mm0 (punpcklbw / punpckhbw); the first variant first rotates the
 *    quadword by 24 bits (psrlq $24, psllq $40, por);
 *  - replicates the low word of mm1 (punpcklwd, punpckldq), adds it to
 *    mm3/mm4 and keeps copies of the result in mm6/mm7;
 *  - per step, loads a coefficient from 8(REG_d) and eight source words from
 *    (REG_S, REG_c, 2), multiplies with pmulhw, accumulates into mm3/mm4,
 *    advances REG_d by 16 and tests the next pointer for zero;
 *  - shifts the sums right by 3, packs them to unsigned bytes, stores 8 bytes
 *    at (%1, REG_c) with MOVNTQ2, adds 8 to REG_c and compares it with %2;
 *  - restores mm3/mm4 from mm6/mm7 and reloads the pointer from %0.
 * REG_d, REG_S and REG_c are listed as clobbers.
 *
 * NOTE(review): the bare numbers embedded in this text (32, 33, 34, 36, ...)
 * look like line numbers of the original file, and the gaps suggest lines
 * that are not present here -- presumably the else/endif directives, the
 * enclosing function signature, the operand lists and the loop labels and
 * branches.  Confirm against the real source before relying on this.
 */
32 #if COMPILE_TEMPLATE_MMXEXT 33 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 34 #define MOVNTQ2 "movntq " 36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" 37 #define MOVNTQ2 "movq " 39 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 41 #if !COMPILE_TEMPLATE_MMXEXT 46 __asm__
volatile(
"pxor %%mm0, %%mm0\n\t" 47 "movq (%0), %%mm3\n\t" 48 "movq %%mm3, %%mm4\n\t" 49 "psrlq $24, %%mm3\n\t" 50 "psllq $40, %%mm4\n\t" 51 "por %%mm4, %%mm3\n\t" 52 "movq %%mm3, %%mm4\n\t" 53 "punpcklbw %%mm0, %%mm3\n\t" 54 "punpckhbw %%mm0, %%mm4\n\t" 58 __asm__
volatile(
"pxor %%mm0, %%mm0\n\t" 59 "movq (%0), %%mm3\n\t" 60 "movq %%mm3, %%mm4\n\t" 61 "punpcklbw %%mm0, %%mm3\n\t" 62 "punpckhbw %%mm0, %%mm4\n\t" 77 "punpcklwd %%mm1, %%mm1\n\t" 78 "punpckldq %%mm1, %%mm1\n\t" 80 "paddw %%mm1, %%mm3\n\t" 81 "paddw %%mm1, %%mm4\n\t" 88 "movq %%mm3, %%mm6\n\t" 89 "movq %%mm4, %%mm7\n\t" 91 "mov %0, %%"FF_REG_d
" \n\t"\
92 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
95 "movq 8(%%"FF_REG_d
"), %%mm0 \n\t" \
96 "movq (%%"FF_REG_S
", %%"FF_REG_c
", 2), %%mm2 \n\t" \
97 "movq 8(%%"FF_REG_S
", %%"FF_REG_c
", 2), %%mm5 \n\t" \
98 "add $16, %%"FF_REG_d
" \n\t"\
99 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
100 "test %%"FF_REG_S
", %%"FF_REG_S
" \n\t"\
101 "pmulhw %%mm0, %%mm2 \n\t"\
102 "pmulhw %%mm0, %%mm5 \n\t"\
103 "paddw %%mm2, %%mm3 \n\t"\
104 "paddw %%mm5, %%mm4 \n\t"\
106 "psraw $3, %%mm3 \n\t"\
107 "psraw $3, %%mm4 \n\t"\
108 "packuswb %%mm4, %%mm3 \n\t" 109 MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c
")\n\t" 110 "add $8, %%"FF_REG_c
" \n\t"\
111 "cmp %2, %%"FF_REG_c
" \n\t"\
112 "movq %%mm6, %%mm3\n\t" 113 "movq %%mm7, %%mm4\n\t" 114 "mov %0, %%"FF_REG_d
" \n\t"\
115 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
119 :
"%"FF_REG_d,
"%"FF_REG_S,
"%"FF_REG_c
/*
 * Asm string fragments for the multi-tap vertical filter with packed output.
 *
 * YSCALEYUV2PACKEDX_UV
 *   Zeroes REG_a, points REG_d at CHR_MMX_FILTER_OFFSET(%0), seeds mm3/mm4
 *   from VROUNDER_OFFSET(%0) and accumulates pmulhw products of the words at
 *   (REG_S, REG_a) and at the same address after adding %6 to REG_S, using
 *   the coefficient at 8(REG_d).  REG_d advances by 16 per tap and the next
 *   pointer is tested for zero.
 * YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2)
 *   Same scheme for the words at (REG_S, REG_a, 2) and 8(REG_S, REG_a, 2),
 *   starting from offset(%0), with the registers supplied by the caller.
 * YSCALEYUV2PACKEDX
 *   The UV part followed by the YA part at LUM_MMX_FILTER_OFFSET, with the
 *   sums landing in mm1 and mm7.
 * YSCALEYUV2PACKEDX_END
 *   Operand list: %0 = &c->redDither, %1-%3 = dummy, %4 = dest,
 *   %5 = dstW_reg, %6 = uv_off, plus NAMED_CONSTRAINTS_ADD(bF8,bFC);
 *   clobbers REG_a, REG_d and REG_S.
 * YSCALEYUV2PACKEDX_ACCURATE_UV / YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *   Variants that interleave words from two source pointers (punpcklwd /
 *   punpckhwd), multiply-accumulate with pmaddwd against the value at
 *   APCK_COEF(REG_d), step REG_d by APCK_SIZE, then shift the dword sums
 *   right by 16, pack them to words and add the VROUNDER_OFFSET(%0) value.
 *   The UV results are written to U_TEMP(%0) / V_TEMP(%0) and reloaded into
 *   mm3 / mm4 at the end of the YA part.
 * YSCALEYUV2RGBX
 *   Subtracts U_OFFSET / V_OFFSET / Y_OFFSET, applies UG_COEFF, VG_COEFF,
 *   UB_COEFF, VR_COEFF and Y_COEFF with pmulhw, adds the luma terms and packs
 *   to unsigned bytes: mm2 carries the UB term, mm5 the VR term and mm4 the
 *   UG + VG term.
 * REAL_WRITEBGR32 / WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 *   Interleaves the b, g, r and a byte registers into four quadwords, stores
 *   them with MOVNTQ at (dst, index, 4) plus 0, 8, 16 and 24, adds 8 to index
 *   and compares it with dstw.  WRITEBGR32 only forwards to REAL_WRITEBGR32.
 *
 * NOTE(review): loop labels and branch instructions are not visible in this
 * text, and the embedded numbers look like original line numbers with gaps;
 * confirm the loop structure against the real source.
 */
123 #define YSCALEYUV2PACKEDX_UV \ 125 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 129 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 130 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 131 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 132 "movq %%mm3, %%mm4 \n\t"\ 135 "movq 8(%%"FF_REG_d"), %%mm0 \n\t" \ 136 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \ 137 "add %6, %%"FF_REG_S" \n\t" \ 138 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" \ 139 "add $16, %%"FF_REG_d" \n\t"\ 140 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 141 "pmulhw %%mm0, %%mm2 \n\t"\ 142 "pmulhw %%mm0, %%mm5 \n\t"\ 143 "paddw %%mm2, %%mm3 \n\t"\ 144 "paddw %%mm5, %%mm4 \n\t"\ 145 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 148 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 149 "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 150 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 151 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 152 "movq "#dst1", "#dst2" \n\t"\ 155 "movq 8(%%"FF_REG_d"), "#coeff" \n\t" \ 156 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" \ 157 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" \ 158 "add $16, %%"FF_REG_d" \n\t"\ 159 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 160 "pmulhw "#coeff", "#src1" \n\t"\ 161 "pmulhw "#coeff", "#src2" \n\t"\ 162 "paddw "#src1", "#dst1" \n\t"\ 163 "paddw "#src2", "#dst2" \n\t"\ 164 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 167 #define YSCALEYUV2PACKEDX \ 168 YSCALEYUV2PACKEDX_UV \ 169 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 171 #define YSCALEYUV2PACKEDX_END \ 172 :: "r" (&c->redDither), \ 173 "m" (dummy), "m" (dummy), "m" (dummy),\ 174 "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 175 NAMED_CONSTRAINTS_ADD(bF8,bFC) \ 176 : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \ 179 #define YSCALEYUV2PACKEDX_ACCURATE_UV \ 181 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ 185 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ 186 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 187 "pxor %%mm4, %%mm4 \n\t"\ 188 "pxor %%mm5, %%mm5 \n\t"\ 189 "pxor 
%%mm6, %%mm6 \n\t"\ 190 "pxor %%mm7, %%mm7 \n\t"\ 193 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" \ 194 "add %6, %%"FF_REG_S" \n\t" \ 195 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \ 196 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 197 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" \ 198 "movq %%mm0, %%mm3 \n\t"\ 199 "punpcklwd %%mm1, %%mm0 \n\t"\ 200 "punpckhwd %%mm1, %%mm3 \n\t"\ 201 "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" \ 202 "pmaddwd %%mm1, %%mm0 \n\t"\ 203 "pmaddwd %%mm1, %%mm3 \n\t"\ 204 "paddd %%mm0, %%mm4 \n\t"\ 205 "paddd %%mm3, %%mm5 \n\t"\ 206 "add %6, %%"FF_REG_S" \n\t" \ 207 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" \ 208 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 209 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 210 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 211 "movq %%mm2, %%mm0 \n\t"\ 212 "punpcklwd %%mm3, %%mm2 \n\t"\ 213 "punpckhwd %%mm3, %%mm0 \n\t"\ 214 "pmaddwd %%mm1, %%mm2 \n\t"\ 215 "pmaddwd %%mm1, %%mm0 \n\t"\ 216 "paddd %%mm2, %%mm6 \n\t"\ 217 "paddd %%mm0, %%mm7 \n\t"\ 219 "psrad $16, %%mm4 \n\t"\ 220 "psrad $16, %%mm5 \n\t"\ 221 "psrad $16, %%mm6 \n\t"\ 222 "psrad $16, %%mm7 \n\t"\ 223 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 224 "packssdw %%mm5, %%mm4 \n\t"\ 225 "packssdw %%mm7, %%mm6 \n\t"\ 226 "paddw %%mm0, %%mm4 \n\t"\ 227 "paddw %%mm0, %%mm6 \n\t"\ 228 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 229 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 231 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 232 "lea "offset"(%0), %%"FF_REG_d" \n\t"\ 233 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 234 "pxor %%mm1, %%mm1 \n\t"\ 235 "pxor %%mm5, %%mm5 \n\t"\ 236 "pxor %%mm7, %%mm7 \n\t"\ 237 "pxor %%mm6, %%mm6 \n\t"\ 240 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" \ 241 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" \ 242 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 243 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" \ 244 "movq %%mm0, %%mm3 \n\t"\ 245 "punpcklwd %%mm4, %%mm0 \n\t"\ 246 "punpckhwd 
%%mm4, %%mm3 \n\t"\ 247 "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" \ 248 "pmaddwd %%mm4, %%mm0 \n\t"\ 249 "pmaddwd %%mm4, %%mm3 \n\t"\ 250 "paddd %%mm0, %%mm1 \n\t"\ 251 "paddd %%mm3, %%mm5 \n\t"\ 252 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" \ 253 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 254 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ 255 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ 256 "movq %%mm2, %%mm0 \n\t"\ 257 "punpcklwd %%mm3, %%mm2 \n\t"\ 258 "punpckhwd %%mm3, %%mm0 \n\t"\ 259 "pmaddwd %%mm4, %%mm2 \n\t"\ 260 "pmaddwd %%mm4, %%mm0 \n\t"\ 261 "paddd %%mm2, %%mm7 \n\t"\ 262 "paddd %%mm0, %%mm6 \n\t"\ 264 "psrad $16, %%mm1 \n\t"\ 265 "psrad $16, %%mm5 \n\t"\ 266 "psrad $16, %%mm7 \n\t"\ 267 "psrad $16, %%mm6 \n\t"\ 268 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 269 "packssdw %%mm5, %%mm1 \n\t"\ 270 "packssdw %%mm6, %%mm7 \n\t"\ 271 "paddw %%mm0, %%mm1 \n\t"\ 272 "paddw %%mm0, %%mm7 \n\t"\ 273 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 274 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 276 #define YSCALEYUV2PACKEDX_ACCURATE \ 277 YSCALEYUV2PACKEDX_ACCURATE_UV \ 278 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 280 #define YSCALEYUV2RGBX \ 281 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \ 282 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \ 283 "movq %%mm3, %%mm2 \n\t" \ 284 "movq %%mm4, %%mm5 \n\t" \ 285 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 286 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 288 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 289 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 290 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \ 291 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \ 292 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 293 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 295 "paddw %%mm3, %%mm4 \n\t"\ 296 "movq %%mm2, %%mm0 \n\t"\ 297 "movq %%mm5, %%mm6 \n\t"\ 298 "movq %%mm4, %%mm3 \n\t"\ 299 "punpcklwd %%mm2, %%mm2 \n\t"\ 300 "punpcklwd %%mm5, %%mm5 \n\t"\ 301 "punpcklwd %%mm4, %%mm4 \n\t"\ 302 "paddw %%mm1, %%mm2 \n\t"\ 303 "paddw %%mm1, %%mm5 \n\t"\ 304 "paddw %%mm1, %%mm4 \n\t"\ 305 "punpckhwd %%mm0, %%mm0 \n\t"\ 306 
"punpckhwd %%mm6, %%mm6 \n\t"\ 307 "punpckhwd %%mm3, %%mm3 \n\t"\ 308 "paddw %%mm7, %%mm0 \n\t"\ 309 "paddw %%mm7, %%mm6 \n\t"\ 310 "paddw %%mm7, %%mm3 \n\t"\ 312 "packuswb %%mm0, %%mm2 \n\t"\ 313 "packuswb %%mm6, %%mm5 \n\t"\ 314 "packuswb %%mm3, %%mm4 \n\t"\ 316 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 317 "movq "#b", "#q2" \n\t" \ 318 "movq "#r", "#t" \n\t" \ 319 "punpcklbw "#g", "#b" \n\t" \ 320 "punpcklbw "#a", "#r" \n\t" \ 321 "punpckhbw "#g", "#q2" \n\t" \ 322 "punpckhbw "#a", "#t" \n\t" \ 323 "movq "#b", "#q0" \n\t" \ 324 "movq "#q2", "#q3" \n\t" \ 325 "punpcklwd "#r", "#q0" \n\t" \ 326 "punpckhwd "#r", "#b" \n\t" \ 327 "punpcklwd "#t", "#q2" \n\t" \ 328 "punpckhwd "#t", "#q3" \n\t" \ 330 MOVNTQ( q0, (dst, index, 4))\ 331 MOVNTQ( b, 8(dst, index, 4))\ 332 MOVNTQ( q2, 16(dst, index, 4))\ 333 MOVNTQ( q3, 24(dst, index, 4))\ 335 "add $8, "#index" \n\t"\ 336 "cmp "dstw", "#index" \n\t"\ 338 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 341 const int16_t **lumSrc,
int lumFilterSize,
342 const int16_t *chrFilter,
const int16_t **chrUSrc,
343 const int16_t **chrVSrc,
344 int chrFilterSize,
const int16_t **alpSrc,
345 uint8_t *dest,
int dstW,
int dstY)
354 "movq %%mm2, "U_TEMP"(%0) \n\t" 355 "movq %%mm4, "V_TEMP"(%0) \n\t" 356 "movq %%mm5, "Y_TEMP"(%0) \n\t" 358 "movq "Y_TEMP"(%0), %%mm5 \n\t" 359 "psraw $3, %%mm1 \n\t" 360 "psraw $3, %%mm7 \n\t" 361 "packuswb %%mm7, %%mm1 \n\t" 362 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
367 "pcmpeqd %%mm7, %%mm7 \n\t" 368 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
374 const int16_t **lumSrc,
int lumFilterSize,
375 const int16_t *chrFilter,
const int16_t **chrUSrc,
376 const int16_t **chrVSrc,
377 int chrFilterSize,
const int16_t **alpSrc,
378 uint8_t *dest,
int dstW,
int dstY)
388 "psraw $3, %%mm1 \n\t" 389 "psraw $3, %%mm7 \n\t" 390 "packuswb %%mm7, %%mm1 \n\t" 391 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
396 "pcmpeqd %%mm7, %%mm7 \n\t" 397 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
403 const int16_t **lumSrc,
int lumFilterSize,
404 const int16_t *chrFilter,
const int16_t **chrUSrc,
405 const int16_t **chrVSrc,
406 int chrFilterSize,
const int16_t **alpSrc,
407 uint8_t *dest,
int dstW,
int dstY)
417 "psraw $3, %%mm1 \n\t" 418 "psraw $3, %%mm7 \n\t" 419 "packuswb %%mm7, %%mm1 \n\t" 420 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
425 "pcmpeqd %%mm7, %%mm7 \n\t" 426 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
/*
 * REAL_WRITERGB16 / WRITERGB16(dst, dstw, index)
 *   Masks mm2 and mm5 with bF8 and mm4 with bFC, shifts mm2 right by 3,
 *   interleaves the bytes of mm2/mm5 and of mm4/mm7 (punpcklbw / punpckhbw),
 *   shifts the mm4-derived quadwords left by 3 and ORs them into the
 *   mm2-derived ones, then stores two quadwords with MOVNTQ at
 *   (dst, index, 2) and 8(dst, index, 2), adds 8 to index and compares it
 *   with dstw.  WRITERGB16 only forwards to REAL_WRITERGB16 so that its
 *   arguments are expanded before stringification.
 *
 * NOTE(review): presumably this packs eight pixels into 5-6-5 bit fields and
 * expects mm7 to be zero on entry -- confirm against the callers.  The branch
 * that closes the loop is not visible in this text.
 */
431 #define REAL_WRITERGB16(dst, dstw, index) \ 432 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 433 "pand "MANGLE(bFC)", %%mm4 \n\t" \ 434 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 435 "psrlq $3, %%mm2 \n\t"\ 437 "movq %%mm2, %%mm1 \n\t"\ 438 "movq %%mm4, %%mm3 \n\t"\ 440 "punpcklbw %%mm7, %%mm3 \n\t"\ 441 "punpcklbw %%mm5, %%mm2 \n\t"\ 442 "punpckhbw %%mm7, %%mm4 \n\t"\ 443 "punpckhbw %%mm5, %%mm1 \n\t"\ 445 "psllq $3, %%mm3 \n\t"\ 446 "psllq $3, %%mm4 \n\t"\ 448 "por %%mm3, %%mm2 \n\t"\ 449 "por %%mm4, %%mm1 \n\t"\ 451 MOVNTQ(%%mm2, (dst, index, 2))\ 452 MOVNTQ(%%mm1, 8(dst, index, 2))\ 454 "add $8, "#index" \n\t"\ 455 "cmp "dstw", "#index" \n\t"\ 457 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 460 const int16_t **lumSrc,
int lumFilterSize,
461 const int16_t *chrFilter,
const int16_t **chrUSrc,
462 const int16_t **chrVSrc,
463 int chrFilterSize,
const int16_t **alpSrc,
464 uint8_t *dest,
int dstW,
int dstY)
472 "pxor %%mm7, %%mm7 \n\t" 484 const int16_t **lumSrc,
int lumFilterSize,
485 const int16_t *chrFilter,
const int16_t **chrUSrc,
486 const int16_t **chrVSrc,
487 int chrFilterSize,
const int16_t **alpSrc,
488 uint8_t *dest,
int dstW,
int dstY)
/*
 * The leading pxor clears mm7; it is the tail of a preceding asm fragment
 * whose remaining lines are not visible here.
 *
 * REAL_WRITERGB15 / WRITERGB15(dst, dstw, index)
 *   Masks mm2, mm4 and mm5 with bF8, shifts mm2 right by 3 and mm5 right by
 *   1, interleaves the bytes of mm2/mm5 and of mm4/mm7 (punpcklbw /
 *   punpckhbw), shifts the mm4-derived quadwords left by 2 and ORs them into
 *   the mm2-derived ones, then stores two quadwords with MOVNTQ at
 *   (dst, index, 2) and 8(dst, index, 2), adds 8 to index and compares it
 *   with dstw.  WRITERGB15 only forwards to REAL_WRITERGB15.
 *
 * NOTE(review): presumably this packs eight pixels into 5-5-5 bit fields and
 * expects mm7 to be zero on entry -- confirm against the callers.  The branch
 * that closes the loop is not visible in this text.
 */
496 "pxor %%mm7, %%mm7 \n\t" 507 #define REAL_WRITERGB15(dst, dstw, index) \ 508 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 509 "pand "MANGLE(bF8)", %%mm4 \n\t" \ 510 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 511 "psrlq $3, %%mm2 \n\t"\ 512 "psrlq $1, %%mm5 \n\t"\ 514 "movq %%mm2, %%mm1 \n\t"\ 515 "movq %%mm4, %%mm3 \n\t"\ 517 "punpcklbw %%mm7, %%mm3 \n\t"\ 518 "punpcklbw %%mm5, %%mm2 \n\t"\ 519 "punpckhbw %%mm7, %%mm4 \n\t"\ 520 "punpckhbw %%mm5, %%mm1 \n\t"\ 522 "psllq $2, %%mm3 \n\t"\ 523 "psllq $2, %%mm4 \n\t"\ 525 "por %%mm3, %%mm2 \n\t"\ 526 "por %%mm4, %%mm1 \n\t"\ 528 MOVNTQ(%%mm2, (dst, index, 2))\ 529 MOVNTQ(%%mm1, 8(dst, index, 2))\ 531 "add $8, "#index" \n\t"\ 532 "cmp "dstw", "#index" \n\t"\ 534 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 537 const int16_t **lumSrc,
int lumFilterSize,
538 const int16_t *chrFilter,
const int16_t **chrUSrc,
539 const int16_t **chrVSrc,
540 int chrFilterSize,
const int16_t **alpSrc,
541 uint8_t *dest,
int dstW,
int dstY)
549 "pxor %%mm7, %%mm7 \n\t" 561 const int16_t **lumSrc,
int lumFilterSize,
562 const int16_t *chrFilter,
const int16_t **chrUSrc,
563 const int16_t **chrVSrc,
564 int chrFilterSize,
const int16_t **alpSrc,
565 uint8_t *dest,
int dstW,
int dstY)
/*
 * The leading pxor clears mm7; it is the tail of a preceding asm fragment
 * whose remaining lines are not visible here.
 *
 * WRITEBGR24MMX(dst, dstw, index)
 *   Interleaves the bytes of mm2/mm4 and mm5/mm7 and then the resulting
 *   words into four quadwords (mm0, mm2, mm1, mm3), squeezes them together
 *   with psllq / punpckhdq / psrlq / por and stores three quadwords with
 *   MOVNTQ at (dst), 8(dst) and 16(dst).  Afterwards adds 24 to dst and 8 to
 *   index and compares index with dstw.
 * WRITEBGR24MMXEXT(dst, dstw, index)
 *   Produces the same three stores at (dst), 8(dst) and 16(dst) using pshufw
 *   on mm2, mm4 and mm5 and the masks ff_M24A, ff_M24B and ff_M24C; mm4 is
 *   shifted right by 8 after the first store.  Ends with the same dst/index
 *   update and compare.
 * WRITEBGR24(dst, dstw, index)
 *   Maps to WRITEBGR24MMXEXT in the definition that follows
 *   if COMPILE_TEMPLATE_MMXEXT and to WRITEBGR24MMX in the other one.
 *
 * NOTE(review): the MMX variant presumably relies on mm7 being zero on entry
 * and both variants presumably emit eight 3-byte pixels per pass -- confirm.
 * The else/endif directives and the loop branch are not visible in this text.
 */
573 "pxor %%mm7, %%mm7 \n\t" 584 #define WRITEBGR24MMX(dst, dstw, index) \ 586 "movq %%mm2, %%mm1 \n\t" \ 587 "movq %%mm5, %%mm6 \n\t" \ 588 "punpcklbw %%mm4, %%mm2 \n\t" \ 589 "punpcklbw %%mm7, %%mm5 \n\t" \ 590 "punpckhbw %%mm4, %%mm1 \n\t" \ 591 "punpckhbw %%mm7, %%mm6 \n\t" \ 592 "movq %%mm2, %%mm0 \n\t" \ 593 "movq %%mm1, %%mm3 \n\t" \ 594 "punpcklwd %%mm5, %%mm0 \n\t" \ 595 "punpckhwd %%mm5, %%mm2 \n\t" \ 596 "punpcklwd %%mm6, %%mm1 \n\t" \ 597 "punpckhwd %%mm6, %%mm3 \n\t" \ 599 "movq %%mm0, %%mm4 \n\t" \ 600 "movq %%mm2, %%mm6 \n\t" \ 601 "movq %%mm1, %%mm5 \n\t" \ 602 "movq %%mm3, %%mm7 \n\t" \ 604 "psllq $40, %%mm0 \n\t" \ 605 "psllq $40, %%mm2 \n\t" \ 606 "psllq $40, %%mm1 \n\t" \ 607 "psllq $40, %%mm3 \n\t" \ 609 "punpckhdq %%mm4, %%mm0 \n\t" \ 610 "punpckhdq %%mm6, %%mm2 \n\t" \ 611 "punpckhdq %%mm5, %%mm1 \n\t" \ 612 "punpckhdq %%mm7, %%mm3 \n\t" \ 614 "psrlq $8, %%mm0 \n\t" \ 615 "movq %%mm2, %%mm6 \n\t" \ 616 "psllq $40, %%mm2 \n\t" \ 617 "por %%mm2, %%mm0 \n\t" \ 618 MOVNTQ(%%mm0, (dst))\ 620 "psrlq $24, %%mm6 \n\t" \ 621 "movq %%mm1, %%mm5 \n\t" \ 622 "psllq $24, %%mm1 \n\t" \ 623 "por %%mm1, %%mm6 \n\t" \ 624 MOVNTQ(%%mm6, 8(dst))\ 626 "psrlq $40, %%mm5 \n\t" \ 627 "psllq $8, %%mm3 \n\t" \ 628 "por %%mm3, %%mm5 \n\t" \ 629 MOVNTQ(%%mm5, 16(dst))\ 631 "add $24, "#dst" \n\t"\ 633 "add $8, "#index" \n\t"\ 634 "cmp "dstw", "#index" \n\t"\ 637 #define WRITEBGR24MMXEXT(dst, dstw, index) \ 639 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 640 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 641 "pshufw $0x50, %%mm2, %%mm1 \n\t" \ 642 "pshufw $0x50, %%mm4, %%mm3 \n\t" \ 643 "pshufw $0x00, %%mm5, %%mm6 \n\t" \ 645 "pand %%mm0, %%mm1 \n\t" \ 646 "pand %%mm0, %%mm3 \n\t" \ 647 "pand %%mm7, %%mm6 \n\t" \ 649 "psllq $8, %%mm3 \n\t" \ 650 "por %%mm1, %%mm6 \n\t"\ 651 "por %%mm3, %%mm6 \n\t"\ 652 MOVNTQ(%%mm6, (dst))\ 654 "psrlq $8, %%mm4 \n\t" \ 655 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \ 656 "pshufw $0x55, %%mm4, %%mm3 \n\t" \ 657 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \ 659 "pand 
"MANGLE(ff_M24B)", %%mm1 \n\t" \ 660 "pand %%mm7, %%mm3 \n\t" \ 661 "pand %%mm0, %%mm6 \n\t" \ 663 "por %%mm1, %%mm3 \n\t" \ 664 "por %%mm3, %%mm6 \n\t"\ 665 MOVNTQ(%%mm6, 8(dst))\ 667 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \ 668 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \ 669 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \ 671 "pand %%mm7, %%mm1 \n\t" \ 672 "pand %%mm0, %%mm3 \n\t" \ 673 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \ 675 "por %%mm1, %%mm3 \n\t"\ 676 "por %%mm3, %%mm6 \n\t"\ 677 MOVNTQ(%%mm6, 16(dst))\ 679 "add $24, "#dst" \n\t"\ 681 "add $8, "#index" \n\t"\ 682 "cmp "dstw", "#index" \n\t"\ 685 #if COMPILE_TEMPLATE_MMXEXT 687 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 690 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 695 const int16_t **lumSrc,
int lumFilterSize,
696 const int16_t *chrFilter,
const int16_t **chrUSrc,
697 const int16_t **chrVSrc,
698 int chrFilterSize,
const int16_t **alpSrc,
699 uint8_t *dest,
int dstW,
int dstY)
707 "pxor %%mm7, %%mm7 \n\t" 708 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
"\n\t" 709 "add %4, %%"FF_REG_c
" \n\t" 711 ::
"r" (&
c->redDither),
712 "m" (dummy),
"m" (
dummy),
"m" (dummy),
713 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
715 :
"%"FF_REG_a,
"%"FF_REG_c,
"%"FF_REG_d,
"%"FF_REG_S
720 const int16_t **lumSrc,
int lumFilterSize,
721 const int16_t *chrFilter,
const int16_t **chrUSrc,
722 const int16_t **chrVSrc,
723 int chrFilterSize,
const int16_t **alpSrc,
724 uint8_t *dest,
int dstW,
int dstY)
732 "pxor %%mm7, %%mm7 \n\t" 733 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
" \n\t" 734 "add %4, %%"FF_REG_c
" \n\t" 736 ::
"r" (&
c->redDither),
737 "m" (dummy),
"m" (
dummy),
"m" (dummy),
738 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
740 :
"%"FF_REG_a,
"%"FF_REG_c,
"%"FF_REG_d,
"%"FF_REG_S
/*
 * REAL_WRITEYUY2 / WRITEYUY2(dst, dstw, index)
 *   Packs the words in mm3, mm4 and in the mm1/mm7 pair to unsigned bytes,
 *   interleaves the mm3 and mm4 bytes (punpcklbw), then interleaves that
 *   result with the packed mm1 bytes into mm1 (low half) and mm7 (high
 *   half).  Stores the two quadwords with MOVNTQ at (dst, index, 2) and
 *   8(dst, index, 2), adds 8 to index and compares it with dstw.
 *   WRITEYUY2 only forwards to REAL_WRITEYUY2.
 *
 * NOTE(review): presumably mm1/mm7 hold luma and mm3/mm4 hold the two chroma
 * planes, giving a Y-U-Y-V byte order -- confirm against the callers.  The
 * branch that closes the loop is not visible in this text.
 */
745 #define REAL_WRITEYUY2(dst, dstw, index) \ 746 "packuswb %%mm3, %%mm3 \n\t"\ 747 "packuswb %%mm4, %%mm4 \n\t"\ 748 "packuswb %%mm7, %%mm1 \n\t"\ 749 "punpcklbw %%mm4, %%mm3 \n\t"\ 750 "movq %%mm1, %%mm7 \n\t"\ 751 "punpcklbw %%mm3, %%mm1 \n\t"\ 752 "punpckhbw %%mm3, %%mm7 \n\t"\ 754 MOVNTQ(%%mm1, (dst, index, 2))\ 755 MOVNTQ(%%mm7, 8(dst, index, 2))\ 757 "add $8, "#index" \n\t"\ 758 "cmp "dstw", "#index" \n\t"\ 760 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 763 const int16_t **lumSrc,
int lumFilterSize,
764 const int16_t *chrFilter,
const int16_t **chrUSrc,
765 const int16_t **chrVSrc,
766 int chrFilterSize,
const int16_t **alpSrc,
767 uint8_t *dest,
int dstW,
int dstY)
775 "psraw $3, %%mm3 \n\t" 776 "psraw $3, %%mm4 \n\t" 777 "psraw $3, %%mm1 \n\t" 778 "psraw $3, %%mm7 \n\t" 784 const int16_t **lumSrc,
int lumFilterSize,
785 const int16_t *chrFilter,
const int16_t **chrUSrc,
786 const int16_t **chrVSrc,
787 int chrFilterSize,
const int16_t **alpSrc,
788 uint8_t *dest,
int dstW,
int dstY)
/*
 * The four leading psraw instructions shift mm3, mm4, mm1 and mm7 right by
 * 3; they are the tail of a preceding asm fragment whose remaining lines are
 * not visible here.
 *
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *   Zeroes index, loads words from (%2, index) and (%3, index), and again
 *   after temporarily adding UV_OFF_BYTE(c) to index.  For each pair it
 *   multiplies the difference by the value at CHR_MMX_FILTER_OFFSET+8(c)
 *   with pmulhw and adds the second source shifted right by 4, leaving the
 *   results in mm3 and mm4.  It then subtracts U_OFFSET(c) / V_OFFSET(c),
 *   copies the results to mm2 / mm5 and multiplies mm3 / mm4 by UG_COEFF(c)
 *   / VG_COEFF(c).
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *   Same difference-times-coefficient blend for the eight words at
 *   (b1, index, 2) and (b2, index, 2), using LUM_MMX_FILTER_OFFSET+8(c);
 *   results end up in mm1 and mm7.
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *   Applies UB_COEFF, VR_COEFF, Y_OFFSET and Y_COEFF, adds the luma terms to
 *   the duplicated chroma terms and packs to unsigned bytes in mm2, mm5 and
 *   mm4.
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA; YSCALEYUV2RGB(index, c)
 *   chains the UV, YA (with b1 = %0 and b2 = %1) and COEFF parts.
 *
 * NOTE(review): loop labels are not visible in this text, and the embedded
 * numbers look like original line numbers with gaps -- confirm the loop
 * structure against the real source.
 */
796 "psraw $3, %%mm3 \n\t" 797 "psraw $3, %%mm4 \n\t" 798 "psraw $3, %%mm1 \n\t" 799 "psraw $3, %%mm7 \n\t" 804 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 805 "xor "#index", "#index" \n\t"\ 808 "movq (%2, "#index"), %%mm2 \n\t" \ 809 "movq (%3, "#index"), %%mm3 \n\t" \ 810 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 811 "movq (%2, "#index"), %%mm5 \n\t" \ 812 "movq (%3, "#index"), %%mm4 \n\t" \ 813 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 814 "psubw %%mm3, %%mm2 \n\t" \ 815 "psubw %%mm4, %%mm5 \n\t" \ 816 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 817 "pmulhw %%mm0, %%mm2 \n\t" \ 818 "pmulhw %%mm0, %%mm5 \n\t" \ 819 "psraw $4, %%mm3 \n\t" \ 820 "psraw $4, %%mm4 \n\t" \ 821 "paddw %%mm2, %%mm3 \n\t" \ 822 "paddw %%mm5, %%mm4 \n\t" \ 823 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 824 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 825 "movq %%mm3, %%mm2 \n\t" \ 826 "movq %%mm4, %%mm5 \n\t" \ 827 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 828 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 831 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 832 "movq ("#b1", "#index", 2), %%mm0 \n\t" \ 833 "movq ("#b2", "#index", 2), %%mm1 \n\t" \ 834 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \ 835 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \ 836 "psubw %%mm1, %%mm0 \n\t" \ 837 "psubw %%mm7, %%mm6 \n\t" \ 838 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 839 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 840 "psraw $4, %%mm1 \n\t" \ 841 "psraw $4, %%mm7 \n\t" \ 842 "paddw %%mm0, %%mm1 \n\t" \ 843 "paddw %%mm6, %%mm7 \n\t" \ 845 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 846 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 847 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 848 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 849 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 850 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 851 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 853 "paddw %%mm3, %%mm4 \n\t"\ 854 "movq %%mm2, %%mm0 \n\t"\ 855 "movq %%mm5, %%mm6 \n\t"\ 856 "movq %%mm4, %%mm3 \n\t"\ 857 "punpcklwd %%mm2, %%mm2 \n\t"\ 858 "punpcklwd 
%%mm5, %%mm5 \n\t"\ 859 "punpcklwd %%mm4, %%mm4 \n\t"\ 860 "paddw %%mm1, %%mm2 \n\t"\ 861 "paddw %%mm1, %%mm5 \n\t"\ 862 "paddw %%mm1, %%mm4 \n\t"\ 863 "punpckhwd %%mm0, %%mm0 \n\t"\ 864 "punpckhwd %%mm6, %%mm6 \n\t"\ 865 "punpckhwd %%mm3, %%mm3 \n\t"\ 866 "paddw %%mm7, %%mm0 \n\t"\ 867 "paddw %%mm7, %%mm6 \n\t"\ 868 "paddw %%mm7, %%mm3 \n\t"\ 870 "packuswb %%mm0, %%mm2 \n\t"\ 871 "packuswb %%mm6, %%mm5 \n\t"\ 872 "packuswb %%mm3, %%mm4 \n\t"\ 874 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 876 #define YSCALEYUV2RGB(index, c) \ 877 REAL_YSCALEYUV2RGB_UV(index, c) \ 878 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 879 REAL_YSCALEYUV2RGB_COEFF(c) 885 const int16_t *ubuf[2],
const int16_t *vbuf[2],
886 const int16_t *abuf[2],
uint8_t *dest,
887 int dstW,
int yalpha,
int uvalpha,
int y)
889 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
890 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
893 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
898 "psraw $3, %%mm1 \n\t" 899 "psraw $3, %%mm7 \n\t" 900 "packuswb %%mm7, %%mm1 \n\t" 901 WRITEBGR32(%4,
DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
902 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
904 "r" (abuf0),
"r" (abuf1)
908 c->u_temp=(intptr_t)abuf0;
909 c->v_temp=(intptr_t)abuf1;
912 "mov %4, %%"FF_REG_b
" \n\t" 913 "push %%"FF_REG_BP
" \n\t" 917 "mov "U_TEMP"(%5), %0 \n\t" 918 "mov "V_TEMP"(%5), %1 \n\t" 920 "psraw $3, %%mm1 \n\t" 921 "psraw $3, %%mm7 \n\t" 922 "packuswb %%mm7, %%mm1 \n\t" 925 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
926 "pop %%"FF_REG_BP
" \n\t" 928 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
935 "mov %4, %%"FF_REG_b
" \n\t" 936 "push %%"FF_REG_BP
" \n\t" 938 "pcmpeqd %%mm7, %%mm7 \n\t" 939 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
940 "pop %%"FF_REG_BP
" \n\t" 942 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
949 const int16_t *ubuf[2],
const int16_t *vbuf[2],
950 const int16_t *abuf[2],
uint8_t *dest,
951 int dstW,
int yalpha,
int uvalpha,
int y)
953 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
954 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
958 "mov %4, %%"FF_REG_b
" \n\t" 959 "push %%"FF_REG_BP
" \n\t" 961 "pxor %%mm7, %%mm7 \n\t" 963 "pop %%"FF_REG_BP
" \n\t" 965 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
972 const int16_t *ubuf[2],
const int16_t *vbuf[2],
973 const int16_t *abuf[2],
uint8_t *dest,
974 int dstW,
int yalpha,
int uvalpha,
int y)
976 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
977 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
981 "mov %4, %%"FF_REG_b
" \n\t" 982 "push %%"FF_REG_BP
" \n\t" 984 "pxor %%mm7, %%mm7 \n\t" 992 "pop %%"FF_REG_BP
" \n\t" 994 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1001 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1002 const int16_t *abuf[2],
uint8_t *dest,
1003 int dstW,
int yalpha,
int uvalpha,
int y)
1005 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
1006 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1010 "mov %4, %%"FF_REG_b
" \n\t" 1011 "push %%"FF_REG_BP
" \n\t" 1013 "pxor %%mm7, %%mm7 \n\t" 1021 "pop %%"FF_REG_BP
" \n\t" 1023 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED / YSCALEYUV2PACKED(index, c)
 *   First loads the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 *   LUM_MMX_FILTER_OFFSET+8(c), shifts them right by 3 and writes them back
 *   to the same locations, so the memory addressed through c is modified in
 *   place.  It then zeroes index and blends two sources:
 *    - for the words at (%2, index) and (%3, index), and again after
 *      temporarily adding UV_OFF_BYTE(c) to index: difference times the
 *      stored coefficient (pmulhw) plus the second source shifted right by
 *      7, giving mm3 and mm4;
 *    - the same for the eight words at (%0, index, 2) and (%1, index, 2)
 *      with the LUM coefficient, giving mm1 and mm7.
 *   YSCALEYUV2PACKED only forwards to REAL_YSCALEYUV2PACKED.
 *
 * NOTE(review): because the coefficients are rewritten in memory, callers
 * presumably have to refresh them before every use -- confirm.  Loop labels
 * are not visible in this text.
 */
1029 #define REAL_YSCALEYUV2PACKED(index, c) \ 1030 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 1031 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 1032 "psraw $3, %%mm0 \n\t"\ 1033 "psraw $3, %%mm1 \n\t"\ 1034 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1035 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1036 "xor "#index", "#index" \n\t"\ 1039 "movq (%2, "#index"), %%mm2 \n\t" \ 1040 "movq (%3, "#index"), %%mm3 \n\t" \ 1041 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1042 "movq (%2, "#index"), %%mm5 \n\t" \ 1043 "movq (%3, "#index"), %%mm4 \n\t" \ 1044 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1045 "psubw %%mm3, %%mm2 \n\t" \ 1046 "psubw %%mm4, %%mm5 \n\t" \ 1047 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 1048 "pmulhw %%mm0, %%mm2 \n\t" \ 1049 "pmulhw %%mm0, %%mm5 \n\t" \ 1050 "psraw $7, %%mm3 \n\t" \ 1051 "psraw $7, %%mm4 \n\t" \ 1052 "paddw %%mm2, %%mm3 \n\t" \ 1053 "paddw %%mm5, %%mm4 \n\t" \ 1054 "movq (%0, "#index", 2), %%mm0 \n\t" \ 1055 "movq (%1, "#index", 2), %%mm1 \n\t" \ 1056 "movq 8(%0, "#index", 2), %%mm6 \n\t" \ 1057 "movq 8(%1, "#index", 2), %%mm7 \n\t" \ 1058 "psubw %%mm1, %%mm0 \n\t" \ 1059 "psubw %%mm7, %%mm6 \n\t" \ 1060 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 1061 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 1062 "psraw $7, %%mm1 \n\t" \ 1063 "psraw $7, %%mm7 \n\t" \ 1064 "paddw %%mm0, %%mm1 \n\t" \ 1065 "paddw %%mm6, %%mm7 \n\t" \ 1067 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 1070 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1071 const int16_t *abuf[2],
uint8_t *dest,
1072 int dstW,
int yalpha,
int uvalpha,
int y)
1074 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
1075 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1079 "mov %4, %%"FF_REG_b
" \n\t" 1080 "push %%"FF_REG_BP
" \n\t" 1083 "pop %%"FF_REG_BP
" \n\t" 1085 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * Single-source-line colour conversion fragments.
 *
 * REAL_YSCALEYUV2RGB1 / YSCALEYUV2RGB1(index, c)
 *   Zeroes index, loads words from (%2, index) and, after temporarily adding
 *   UV_OFF_BYTE(c) to index, from (%2, index) again; shifts both right by 4
 *   and subtracts U_OFFSET(c) / V_OFFSET(c).  Loads eight words from
 *   (%0, index, 2), shifts them right by 4, then applies UG_COEFF, VG_COEFF,
 *   UB_COEFF, VR_COEFF, Y_OFFSET and Y_COEFF and packs the sums to unsigned
 *   bytes in mm2, mm5 and mm4.
 * REAL_YSCALEYUV2RGB1b / YSCALEYUV2RGB1b(index, c)
 *   Same conversion, but the two chroma inputs are formed by adding the
 *   words from (%2, index) and (%3, index) and shifting the sum right by 5
 *   with the logical shift psrlw.
 * REAL_YSCALEYUV2RGB1_ALPHA / YSCALEYUV2RGB1_ALPHA(index)
 *   Loads sixteen bytes of words from (%1, index, 2), shifts them right by 7
 *   and packs them to unsigned bytes in mm7.
 *
 * NOTE(review): the 1b variant presumably averages two chroma lines (sum
 * shifted by 5 instead of a single line shifted by 4) -- confirm.  Loop
 * labels are not visible in this text.
 */
1090 #define REAL_YSCALEYUV2RGB1(index, c) \ 1091 "xor "#index", "#index" \n\t"\ 1094 "movq (%2, "#index"), %%mm3 \n\t" \ 1095 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1096 "movq (%2, "#index"), %%mm4 \n\t" \ 1097 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1098 "psraw $4, %%mm3 \n\t" \ 1099 "psraw $4, %%mm4 \n\t" \ 1100 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1101 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1102 "movq %%mm3, %%mm2 \n\t" \ 1103 "movq %%mm4, %%mm5 \n\t" \ 1104 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1105 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1107 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1108 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1109 "psraw $4, %%mm1 \n\t" \ 1110 "psraw $4, %%mm7 \n\t" \ 1111 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1112 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1113 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1114 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1115 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1116 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1118 "paddw %%mm3, %%mm4 \n\t"\ 1119 "movq %%mm2, %%mm0 \n\t"\ 1120 "movq %%mm5, %%mm6 \n\t"\ 1121 "movq %%mm4, %%mm3 \n\t"\ 1122 "punpcklwd %%mm2, %%mm2 \n\t"\ 1123 "punpcklwd %%mm5, %%mm5 \n\t"\ 1124 "punpcklwd %%mm4, %%mm4 \n\t"\ 1125 "paddw %%mm1, %%mm2 \n\t"\ 1126 "paddw %%mm1, %%mm5 \n\t"\ 1127 "paddw %%mm1, %%mm4 \n\t"\ 1128 "punpckhwd %%mm0, %%mm0 \n\t"\ 1129 "punpckhwd %%mm6, %%mm6 \n\t"\ 1130 "punpckhwd %%mm3, %%mm3 \n\t"\ 1131 "paddw %%mm7, %%mm0 \n\t"\ 1132 "paddw %%mm7, %%mm6 \n\t"\ 1133 "paddw %%mm7, %%mm3 \n\t"\ 1135 "packuswb %%mm0, %%mm2 \n\t"\ 1136 "packuswb %%mm6, %%mm5 \n\t"\ 1137 "packuswb %%mm3, %%mm4 \n\t"\ 1139 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1142 #define REAL_YSCALEYUV2RGB1b(index, c) \ 1143 "xor "#index", "#index" \n\t"\ 1146 "movq (%2, "#index"), %%mm2 \n\t" \ 1147 "movq (%3, "#index"), %%mm3 \n\t" \ 1148 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1149 "movq (%2, "#index"), %%mm5 \n\t" \ 1150 "movq (%3, "#index"), %%mm4 \n\t" \ 1151 "sub 
"UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1152 "paddw %%mm2, %%mm3 \n\t" \ 1153 "paddw %%mm5, %%mm4 \n\t" \ 1154 "psrlw $5, %%mm3 \n\t" \ 1155 "psrlw $5, %%mm4 \n\t" \ 1156 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1157 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1158 "movq %%mm3, %%mm2 \n\t" \ 1159 "movq %%mm4, %%mm5 \n\t" \ 1160 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1161 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1163 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1164 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1165 "psraw $4, %%mm1 \n\t" \ 1166 "psraw $4, %%mm7 \n\t" \ 1167 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1168 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1169 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1170 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1171 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1172 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1174 "paddw %%mm3, %%mm4 \n\t"\ 1175 "movq %%mm2, %%mm0 \n\t"\ 1176 "movq %%mm5, %%mm6 \n\t"\ 1177 "movq %%mm4, %%mm3 \n\t"\ 1178 "punpcklwd %%mm2, %%mm2 \n\t"\ 1179 "punpcklwd %%mm5, %%mm5 \n\t"\ 1180 "punpcklwd %%mm4, %%mm4 \n\t"\ 1181 "paddw %%mm1, %%mm2 \n\t"\ 1182 "paddw %%mm1, %%mm5 \n\t"\ 1183 "paddw %%mm1, %%mm4 \n\t"\ 1184 "punpckhwd %%mm0, %%mm0 \n\t"\ 1185 "punpckhwd %%mm6, %%mm6 \n\t"\ 1186 "punpckhwd %%mm3, %%mm3 \n\t"\ 1187 "paddw %%mm7, %%mm0 \n\t"\ 1188 "paddw %%mm7, %%mm6 \n\t"\ 1189 "paddw %%mm7, %%mm3 \n\t"\ 1191 "packuswb %%mm0, %%mm2 \n\t"\ 1192 "packuswb %%mm6, %%mm5 \n\t"\ 1193 "packuswb %%mm3, %%mm4 \n\t"\ 1195 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1197 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 1198 "movq (%1, "#index", 2), %%mm7 \n\t" \ 1199 "movq 8(%1, "#index", 2), %%mm1 \n\t" \ 1200 "psraw $7, %%mm7 \n\t" \ 1201 "psraw $7, %%mm1 \n\t" \ 1202 "packuswb %%mm1, %%mm7 \n\t" 1203 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1209 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1210 const int16_t *abuf0,
uint8_t *dest,
1211 int dstW,
int uvalpha,
int y)
1213 const int16_t *ubuf0 = ubuf[0];
1214 const int16_t *buf1= buf0;
1216 if (uvalpha < 2048) {
1217 const int16_t *ubuf1 = ubuf[0];
1221 "mov %4, %%"FF_REG_b
" \n\t" 1222 "push %%"FF_REG_BP
" \n\t" 1225 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1226 "pop %%"FF_REG_BP
" \n\t" 1228 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1234 "mov %4, %%"FF_REG_b
" \n\t" 1235 "push %%"FF_REG_BP
" \n\t" 1237 "pcmpeqd %%mm7, %%mm7 \n\t" 1238 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1239 "pop %%"FF_REG_BP
" \n\t" 1241 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1246 const int16_t *ubuf1 = ubuf[1];
1250 "mov %4, %%"FF_REG_b
" \n\t" 1251 "push %%"FF_REG_BP
" \n\t" 1254 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1255 "pop %%"FF_REG_BP
" \n\t" 1257 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1263 "mov %4, %%"FF_REG_b
" \n\t" 1264 "push %%"FF_REG_BP
" \n\t" 1266 "pcmpeqd %%mm7, %%mm7 \n\t" 1267 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1268 "pop %%"FF_REG_BP
" \n\t" 1270 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1278 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1279 const int16_t *abuf0,
uint8_t *dest,
1280 int dstW,
int uvalpha,
int y)
1282 const int16_t *ubuf0 = ubuf[0];
1283 const int16_t *buf1= buf0;
1285 if (uvalpha < 2048) {
1286 const int16_t *ubuf1 = ubuf[0];
1289 "mov %4, %%"FF_REG_b
" \n\t" 1290 "push %%"FF_REG_BP
" \n\t" 1292 "pxor %%mm7, %%mm7 \n\t" 1294 "pop %%"FF_REG_BP
" \n\t" 1296 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1301 const int16_t *ubuf1 = ubuf[1];
1304 "mov %4, %%"FF_REG_b
" \n\t" 1305 "push %%"FF_REG_BP
" \n\t" 1307 "pxor %%mm7, %%mm7 \n\t" 1309 "pop %%"FF_REG_BP
" \n\t" 1311 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1319 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1320 const int16_t *abuf0,
uint8_t *dest,
1321 int dstW,
int uvalpha,
int y)
1323 const int16_t *ubuf0 = ubuf[0];
1324 const int16_t *buf1= buf0;
1326 if (uvalpha < 2048) {
1327 const int16_t *ubuf1 = ubuf[0];
1330 "mov %4, %%"FF_REG_b
" \n\t" 1331 "push %%"FF_REG_BP
" \n\t" 1333 "pxor %%mm7, %%mm7 \n\t" 1341 "pop %%"FF_REG_BP
" \n\t" 1343 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1348 const int16_t *ubuf1 = ubuf[1];
1351 "mov %4, %%"FF_REG_b
" \n\t" 1352 "push %%"FF_REG_BP
" \n\t" 1354 "pxor %%mm7, %%mm7 \n\t" 1362 "pop %%"FF_REG_BP
" \n\t" 1364 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1372 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1373 const int16_t *abuf0,
uint8_t *dest,
1374 int dstW,
int uvalpha,
int y)
1376 const int16_t *ubuf0 = ubuf[0];
1377 const int16_t *buf1= buf0;
1379 if (uvalpha < 2048) {
1380 const int16_t *ubuf1 = ubuf[0];
1383 "mov %4, %%"FF_REG_b
" \n\t" 1384 "push %%"FF_REG_BP
" \n\t" 1386 "pxor %%mm7, %%mm7 \n\t" 1394 "pop %%"FF_REG_BP
" \n\t" 1396 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1401 const int16_t *ubuf1 = ubuf[1];
1404 "mov %4, %%"FF_REG_b
" \n\t" 1405 "push %%"FF_REG_BP
" \n\t" 1407 "pxor %%mm7, %%mm7 \n\t" 1415 "pop %%"FF_REG_BP
" \n\t" 1417 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1424 #define REAL_YSCALEYUV2PACKED1(index, c) \ 1425 "xor "#index", "#index" \n\t"\ 1428 "movq (%2, "#index"), %%mm3 \n\t" \ 1429 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1430 "movq (%2, "#index"), %%mm4 \n\t" \ 1431 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1432 "psraw $7, %%mm3 \n\t" \ 1433 "psraw $7, %%mm4 \n\t" \ 1434 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1435 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1436 "psraw $7, %%mm1 \n\t" \ 1437 "psraw $7, %%mm7 \n\t" \ 1439 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 1441 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 1442 "xor "#index", "#index" \n\t"\ 1445 "movq (%2, "#index"), %%mm2 \n\t" \ 1446 "movq (%3, "#index"), %%mm3 \n\t" \ 1447 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1448 "movq (%2, "#index"), %%mm5 \n\t" \ 1449 "movq (%3, "#index"), %%mm4 \n\t" \ 1450 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1451 "paddw %%mm2, %%mm3 \n\t" \ 1452 "paddw %%mm5, %%mm4 \n\t" \ 1453 "psrlw $8, %%mm3 \n\t" \ 1454 "psrlw $8, %%mm4 \n\t" \ 1455 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1456 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1457 "psraw $7, %%mm1 \n\t" \ 1458 "psraw $7, %%mm7 \n\t" 1459 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 1462 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1463 const int16_t *abuf0,
uint8_t *dest,
1464 int dstW,
int uvalpha,
int y)
1466 const int16_t *ubuf0 = ubuf[0];
1467 const int16_t *buf1= buf0;
1469 if (uvalpha < 2048) {
1470 const int16_t *ubuf1 = ubuf[0];
1473 "mov %4, %%"FF_REG_b
" \n\t" 1474 "push %%"FF_REG_BP
" \n\t" 1477 "pop %%"FF_REG_BP
" \n\t" 1479 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1483 const int16_t *ubuf1 = ubuf[1];
1486 "mov %4, %%"FF_REG_b
" \n\t" 1487 "push %%"FF_REG_BP
" \n\t" 1490 "pop %%"FF_REG_BP
" \n\t" 1492 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1501 c->use_mmx_vfilter= 0;
1506 switch (
c->dstFormat) {
1518 c->use_mmx_vfilter= 1;
1521 switch (
c->dstFormat) {
1535 switch (
c->dstFormat) {
1562 if (
c->srcBpc == 8 &&
c->dstBpc <= 14) {
1564 #if COMPILE_TEMPLATE_MMXEXT 1570 c->hyscale_fast =
NULL;
1571 c->hcscale_fast =
NULL;
1572 #if COMPILE_TEMPLATE_MMXEXT
#define YSCALEYUV2RGB1_ALPHA(index)
#define ALP_MMX_FILTER_OFFSET
static void RENAME() yuv2rgb32_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKED1(index, c)
static void RENAME() yuv2yuyv422_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
#define YSCALEYUV2PACKEDX_END
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, int clip)
#define SWS_FULL_CHR_H_INT
#define SWS_FAST_BILINEAR
#define WRITERGB15(dst, dstw, index)
static void RENAME() yuv2rgb565_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb32_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb565_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKEDX
static void RENAME() yuv2yuvX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
static void RENAME() yuv2rgb32_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
YV12 to RGB without scaling or interpolating.
static const uint8_t dither[8][8]
#define WRITEBGR24(dst, dstw, index)
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V)
#define YSCALEYUV2RGB1b(index, c)
static const uint8_t offset[127][2]
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2PACKED(index, c)
static void RENAME() yuv2rgb565_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define WRITERGB16(dst, dstw, index)
static void RENAME() yuv2bgr32_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
as above, but U and V bytes are swapped
#define YSCALEYUV2PACKEDX_ACCURATE
static void RENAME() yuv2rgb555_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
static void RENAME() yuv2yuyv422_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
static av_cold void RENAME() sws_init_swscale(SwsContext *c)
packed RGB 8:8:8, 24bpp, BGRBGR...
static void RENAME() yuv2yuyv422_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static av_always_inline void dither_8to16(const uint8_t *srcDither, int rot)
#define YSCALEYUV2RGB(index, c)
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
#define YSCALEYUV2PACKED1b(index, c)
static void RENAME() yuv2rgb555_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2yuyv422_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb32_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
vertical bilinear scale YV12 to RGB
#define CONFIG_SWSCALE_ALPHA
#define AV_PIX_FMT_RGB555
static void RENAME() yuv2rgb565_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
#define AV_PIX_FMT_RGB565
static void RENAME() yuv2bgr24_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
#define WRITEYUY2(dst, dstw, index)
AVPixelFormat
Pixel format.
static void RENAME() yuv2bgr24_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
#define YSCALEYUV2RGB1(index, c)
#define NAMED_CONSTRAINTS_ADD(...)