FFmpeg  4.0
vf_spp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20 
21 
22 #include "libavutil/attributes.h"
23 #include "libavutil/cpu.h"
24 #include "libavutil/crc.h"
25 #include "libavutil/mem.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavfilter/vf_spp.h"
28 
29 #if HAVE_MMX_INLINE
/**
 * Requantize one block of 64 16-bit coefficients with a hard threshold,
 * using MMX inline assembly.
 *
 * With threshold1 = qp * 16 - 1 (bias is fixed at 0), the arithmetic in
 * REQUANT_CORE maps a coefficient x to 0 when its magnitude is below
 * qp * 16 and to (x + 4) >> 3 otherwise (for values well inside the int16
 * range). This is done branch-free with wrapping and unsigned-saturating
 * adds/subtracts against three broadcast constants held in mm4/mm5/mm6.
 *
 * Each REQUANT_CORE invocation loads four quadwords (16 coefficients) from
 * hard-coded byte offsets in src, thresholds them, shuffles the words with
 * punpck{l,h}wd and stores four quadwords at hard-coded offsets in dst.
 * Four invocations cover all 128 bytes of the block.
 *
 * @param dst         output block of 64 coefficients
 * @param src         input block of 64 coefficients
 * @param qp          quantizer from which the threshold is derived
 * @param permutation not referenced in this implementation; the output
 *                    ordering is fixed by the offsets and word shuffles
 *                    hard-coded below
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %1 and uses mm0-mm7; no emms is issued here
 * either -- presumably handled by the caller, confirm.
 */
30 static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
31  int qp, const uint8_t *permutation)
32 {
33  int bias = 0; //FIXME
34  unsigned int threshold1;
35 
36  threshold1 = qp * ((1<<4) - bias) - 1;
37 
/* Per-coefficient pipeline: x - mm4, saturating-unsigned + mm5, wrapping
 * + mm6, saturating-unsigned - mm6, arithmetic >> 3; then a word shuffle of
 * the four result registers and four quadword stores. */
38 #define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
39  "movq " #src0 ", %%mm0 \n" \
40  "movq " #src1 ", %%mm1 \n" \
41  "movq " #src2 ", %%mm2 \n" \
42  "movq " #src3 ", %%mm3 \n" \
43  "psubw %%mm4, %%mm0 \n" \
44  "psubw %%mm4, %%mm1 \n" \
45  "psubw %%mm4, %%mm2 \n" \
46  "psubw %%mm4, %%mm3 \n" \
47  "paddusw %%mm5, %%mm0 \n" \
48  "paddusw %%mm5, %%mm1 \n" \
49  "paddusw %%mm5, %%mm2 \n" \
50  "paddusw %%mm5, %%mm3 \n" \
51  "paddw %%mm6, %%mm0 \n" \
52  "paddw %%mm6, %%mm1 \n" \
53  "paddw %%mm6, %%mm2 \n" \
54  "paddw %%mm6, %%mm3 \n" \
55  "psubusw %%mm6, %%mm0 \n" \
56  "psubusw %%mm6, %%mm1 \n" \
57  "psubusw %%mm6, %%mm2 \n" \
58  "psubusw %%mm6, %%mm3 \n" \
59  "psraw $3, %%mm0 \n" \
60  "psraw $3, %%mm1 \n" \
61  "psraw $3, %%mm2 \n" \
62  "psraw $3, %%mm3 \n" \
63  \
64  "movq %%mm0, %%mm7 \n" \
65  "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
66  "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
67  "movq %%mm1, %%mm2 \n" \
68  "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
69  "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
70  "movq %%mm0, %%mm3 \n" \
71  "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
72  "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
73  "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
74  "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
75  \
76  "movq %%mm0, " #dst0 " \n" \
77  "movq %%mm7, " #dst1 " \n" \
78  "movq %%mm3, " #dst2 " \n" \
79  "movq %%mm1, " #dst3 " \n"
80 
81  __asm__ volatile(
 /* Broadcast the three constants to all four words of mm4/mm5/mm6:
  * movd loads the 32-bit value, two packssdw steps replicate its
  * (signed-saturated) low word across the register. */
82  "movd %2, %%mm4 \n"
83  "movd %3, %%mm5 \n"
84  "movd %4, %%mm6 \n"
85  "packssdw %%mm4, %%mm4 \n"
86  "packssdw %%mm5, %%mm5 \n"
87  "packssdw %%mm6, %%mm6 \n"
88  "packssdw %%mm4, %%mm4 \n"
89  "packssdw %%mm5, %%mm5 \n"
90  "packssdw %%mm6, %%mm6 \n"
 /* %0 = src, %1 = dst; offsets are in bytes. */
91  REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
92  REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
93  REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
94  REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
95  : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate than needed?
96  );
 /* The first coefficient is never thresholded: overwrite whatever the asm
  * stored in dst[0] with the plainly rounded value. */
97  dst[0] = (src[0] + 4) >> 3;
98 }
99 
/**
 * Requantize one block of 64 16-bit coefficients with a soft threshold,
 * using MMX inline assembly.
 *
 * With threshold1 = qp * 16 - 1 (bias is fixed at 0), each coefficient is
 * pulled toward zero by threshold1, clamping at zero, and the result v is
 * then rounded as (v + 4) >> 3. The sign is handled without branches: a
 * sign mask built with pcmpgtw is XORed in before the unsigned-saturating
 * subtract and XORed out again afterwards.
 *
 * Each REQUANT_CORE invocation loads four quadwords (16 coefficients) from
 * hard-coded byte offsets in src, processes them, shuffles the words with
 * punpck{l,h}wd and stores four quadwords at hard-coded offsets in dst.
 * Four invocations cover all 128 bytes of the block.
 *
 * @param dst         output block of 64 coefficients
 * @param src         input block of 64 coefficients
 * @param qp          quantizer from which the threshold is derived
 * @param permutation not referenced in this implementation; the output
 *                    ordering is fixed by the offsets and word shuffles
 *                    hard-coded below
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %1 and uses mm0-mm7; no emms is issued here
 * either -- presumably handled by the caller, confirm.
 */
100 static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
101  int qp, const uint8_t *permutation)
102 {
103  int bias = 0; //FIXME
104  unsigned int threshold1;
105 
106  threshold1 = qp*((1<<4) - bias) - 1;
107 
/* Per-coefficient pipeline: mask = (0 > x); x ^= mask; saturating-unsigned
 * x -= mm4; x ^= mask; saturating-signed x += mm5; arithmetic >> 3. mm6/mm7
 * serve as scratch sign masks, so only mm4 and mm5 hold constants here. */
108 #undef REQUANT_CORE
109 #define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
110  "movq " #src0 ", %%mm0 \n" \
111  "movq " #src1 ", %%mm1 \n" \
112  "pxor %%mm6, %%mm6 \n" \
113  "pxor %%mm7, %%mm7 \n" \
114  "pcmpgtw %%mm0, %%mm6 \n" \
115  "pcmpgtw %%mm1, %%mm7 \n" \
116  "pxor %%mm6, %%mm0 \n" \
117  "pxor %%mm7, %%mm1 \n" \
118  "psubusw %%mm4, %%mm0 \n" \
119  "psubusw %%mm4, %%mm1 \n" \
120  "pxor %%mm6, %%mm0 \n" \
121  "pxor %%mm7, %%mm1 \n" \
122  "movq " #src2 ", %%mm2 \n" \
123  "movq " #src3 ", %%mm3 \n" \
124  "pxor %%mm6, %%mm6 \n" \
125  "pxor %%mm7, %%mm7 \n" \
126  "pcmpgtw %%mm2, %%mm6 \n" \
127  "pcmpgtw %%mm3, %%mm7 \n" \
128  "pxor %%mm6, %%mm2 \n" \
129  "pxor %%mm7, %%mm3 \n" \
130  "psubusw %%mm4, %%mm2 \n" \
131  "psubusw %%mm4, %%mm3 \n" \
132  "pxor %%mm6, %%mm2 \n" \
133  "pxor %%mm7, %%mm3 \n" \
134  \
135  "paddsw %%mm5, %%mm0 \n" \
136  "paddsw %%mm5, %%mm1 \n" \
137  "paddsw %%mm5, %%mm2 \n" \
138  "paddsw %%mm5, %%mm3 \n" \
139  "psraw $3, %%mm0 \n" \
140  "psraw $3, %%mm1 \n" \
141  "psraw $3, %%mm2 \n" \
142  "psraw $3, %%mm3 \n" \
143  \
144  "movq %%mm0, %%mm7 \n" \
145  "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
146  "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
147  "movq %%mm1, %%mm2 \n" \
148  "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
149  "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
150  "movq %%mm0, %%mm3 \n" \
151  "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
152  "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
153  "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
154  "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
155  \
156  "movq %%mm0, " #dst0 " \n" \
157  "movq %%mm7, " #dst1 " \n" \
158  "movq %%mm3, " #dst2 " \n" \
159  "movq %%mm1, " #dst3 " \n"
160 
161  __asm__ volatile(
 /* Broadcast threshold1 into mm4 and the rounding constant 4 into mm5:
  * movd loads the 32-bit value, two packssdw steps replicate its
  * (signed-saturated) low word across the register. */
162  "movd %2, %%mm4 \n"
163  "movd %3, %%mm5 \n"
164  "packssdw %%mm4, %%mm4 \n"
165  "packssdw %%mm5, %%mm5 \n"
166  "packssdw %%mm4, %%mm4 \n"
167  "packssdw %%mm5, %%mm5 \n"
 /* %0 = src, %1 = dst; offsets are in bytes. */
168  REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
169  REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
170  REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
171  REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
172  : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate than needed?
173  );
174 
 /* The first coefficient is never thresholded: overwrite whatever the asm
  * stored in dst[0] with the plainly rounded value. */
175  dst[0] = (src[0] + 4) >> 3;
176 }
177 
/**
 * Convert rows of 16-bit samples to 8-bit output with dithering, using MMX
 * inline assembly.
 *
 * For every row y, the eight dither bytes dither[y] are widened to words
 * and shifted right by log2_scale. Each group of eight input samples then
 * has that dither added, is arithmetically shifted right by
 * (MAX_LEVEL - log2_scale), and is packed to bytes with unsigned saturation.
 *
 * @param dst        output rows of bytes
 * @param src        input rows of int16_t samples
 * @param dst_stride distance between output rows, added to a uint8_t
 *                   pointer (i.e. in bytes)
 * @param src_stride distance between input rows, added to an int16_t
 *                   pointer (i.e. in samples)
 * @param width      row width in pixels; the loop runs until dst1 reaches
 *                   dst + width
 * @param height     number of rows
 * @param log2_scale shift applied to the dither values; also reduces the
 *                   final sample shift
 * @param dither     8x8 dither table; row y is used for output row y
 *
 * NOTE(review): dither is indexed with y directly, so this appears to assume
 * height <= 8 -- confirm against the caller. The inner loop stores 8 bytes
 * per iteration and tests its exit condition after the store, so it runs at
 * least once per row and may write past dst + width when width is not a
 * multiple of 8; presumably the buffers are padded -- verify. The asm
 * declares no clobbers ("memory", flags, MMX registers) and no emms is
 * issued here.
 */
178 static void store_slice_mmx(uint8_t *dst, const int16_t *src,
179  int dst_stride, int src_stride,
180  int width, int height, int log2_scale,
181  const uint8_t dither[8][8])
182 {
183  int y;
184 
185  for (y = 0; y < height; y++) {
186  uint8_t *dst1 = dst;
187  const int16_t *src1 = src;
188  __asm__ volatile(
 /* Row setup: mm3 = dither[y][0..3], mm4 = dither[y][4..7], each
  * zero-extended to words and shifted right by log2_scale (%4). */
189  "movq (%3), %%mm3 \n"
190  "movq (%3), %%mm4 \n"
191  "movd %4, %%mm2 \n"
192  "pxor %%mm0, %%mm0 \n"
193  "punpcklbw %%mm0, %%mm3 \n"
194  "punpckhbw %%mm0, %%mm4 \n"
195  "psraw %%mm2, %%mm3 \n"
196  "psraw %%mm2, %%mm4 \n"
 /* mm2 now holds the per-sample shift count, MAX_LEVEL - log2_scale (%5). */
197  "movd %5, %%mm2 \n"
 /* Loop: 8 samples (16 bytes) in, 8 bytes out per iteration. */
198  "1: \n"
199  "movq (%0), %%mm0 \n"
200  "movq 8(%0), %%mm1 \n"
201  "paddw %%mm3, %%mm0 \n"
202  "paddw %%mm4, %%mm1 \n"
203  "psraw %%mm2, %%mm0 \n"
204  "psraw %%mm2, %%mm1 \n"
205  "packuswb %%mm1, %%mm0 \n"
206  "movq %%mm0, (%1) \n"
207  "add $16, %0 \n"
208  "add $8, %1 \n"
 /* Continue while dst1 (%1) is below dst + width (%2), unsigned compare. */
209  "cmp %2, %1 \n"
210  " jb 1b \n"
211  : "+r" (src1), "+r"(dst1)
212  : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
213  );
214  src += src_stride;
215  dst += dst_stride;
216  }
217 }
218 
219 #endif /* HAVE_MMX_INLINE */
220 
/*
 * Install the MMX implementations into an SPP filter context when the CPU
 * reports MMX support and inline MMX assembly is compiled in.
 *
 * NOTE(review): this listing is missing two source lines of this function:
 *  - line 221, the function header; the cross-reference index at the end of
 *    this page gives it as `av_cold void ff_spp_init_x86(SPPContext *s)`.
 *  - line 229, the first line of the idct_perm_crc initializer; presumably
 *    the opening of an av_crc(av_crc_get_table(...), ...) call, since both
 *    functions are listed in the index -- confirm against the real file.
 */
222 {
223 #if HAVE_MMX_INLINE
224  int cpu_flags = av_get_cpu_flags();
225 
226  if (cpu_flags & AV_CPU_FLAG_MMX) {
 /* The requantize routines hard-code their coefficient ordering, so they
  * are only selected when a checksum over the context's IDCT
  * permutation table equals this expected value. */
227  static const uint32_t mmx_idct_perm_crc = 0xe5e8adc4;
228  uint32_t idct_perm_crc =
230  s->dct->idct_permutation,
231  sizeof(s->dct->idct_permutation));
232  int64_t bps;
 /* store_slice is replaced unconditionally once MMX is available. */
233  s->store_slice = store_slice_mmx;
 /* NOTE(review): the return value is not checked and bps has no
  * initializer, so bps is read uninitialized if the lookup fails. */
234  av_opt_get_int(s->dct, "bits_per_sample", 0, &bps);
235  if (bps <= 8 && idct_perm_crc == mmx_idct_perm_crc) {
 /* Modes other than 0 and 1 leave s->requantize untouched. */
236  switch (s->mode) {
237  case 0: s->requantize = hardthresh_mmx; break;
238  case 1: s->requantize = softthresh_mmx; break;
239  }
240  }
241  }
242 #endif
243 }
void(* store_slice)(uint8_t *dst, const int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale, const uint8_t dither[8][8])
Definition: vf_spp.h:48
const char * s
Definition: avisynth_c.h:768
av_cold void ff_spp_init_x86(SPPContext *s)
Definition: vf_spp.c:221
Memory handling functions.
const char * g
Definition: vf_curves.c:112
static atomic_int cpu_flags
Definition: cpu.c:50
#define src
Definition: vp8dsp.c:254
Macro definitions for various function/variable attributes.
uint8_t
#define av_cold
Definition: attributes.h:82
Public header for CRC hash function implementation.
#define height
#define MAX_LEVEL
Definition: rl.h:36
const char * r
Definition: vf_curves.c:111
static const uint8_t dither[8][8]
Definition: vf_fspp.c:57
uint16_t width
Definition: gdv.c:47
uint8_t idct_permutation[64]
IDCT input permutation.
Definition: avdct.h:48
uint32_t av_crc(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
Calculate the CRC of a block.
Definition: crc.c:392
AVDCT * dct
Definition: vf_spp.h:42
int av_opt_get_int(void *obj, const char *name, int search_flags, int64_t *out_val)
Definition: opt.c:875
#define src1
Definition: h264pred.c:139
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:31
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
void(* requantize)(int16_t dst[64], const int16_t src[64], int qp, const uint8_t *permutation)
Definition: vf_spp.h:53
const AVCRC * av_crc_get_table(AVCRCId crc_id)
Get an initialized standard CRC table.
Definition: crc.c:374
int mode
Definition: vf_spp.h:36
unsigned bps
Definition: movenc.c:1456