From 6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= Date: Tue, 24 Jun 2008 09:10:46 +0000 Subject: [PATCH] gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the Original commit message from CVS: * gst/deinterlace2/tvtime/greedy.c: (deinterlace_greedy_packed422_scanline_c), (deinterlace_greedy_packed422_scanline_mmxext), (deinterlace_greedy_packed422_scanline): Fix the C implementation to produce correct results and optimize the MMXEXT implementation. Handle odd widths and don't read over array boundaries in the MMXEXT implementation. * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c), (deinterlace_line_mmx), (deinterlace_scanline_vfir): Fix a small rounding bug in the MMX implementation, the MMX implementation doesn't actually need MMXEXT instructions so don't mark it as such. Handle odd widths in both implementations. --- ChangeLog | 20 +++ gst/deinterlace2/tvtime/greedy.c | 246 +++++++++++++++---------------- gst/deinterlace2/tvtime/vfir.c | 107 +++++++------- 3 files changed, 191 insertions(+), 182 deletions(-) diff --git a/ChangeLog b/ChangeLog index ef05fa1156..a1c50b466f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +2008-06-24 Sebastian Dröge + + * gst/deinterlace2/tvtime/greedy.c: + (deinterlace_greedy_packed422_scanline_c), + (deinterlace_greedy_packed422_scanline_mmxext), + (deinterlace_greedy_packed422_scanline): + Fix the C implementation to produce correct results and optimize the + MMXEXT implementation. + + Handle odd widths and don't read over array boundaries in the MMXEXT + implementation. + + * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c), + (deinterlace_line_mmx), (deinterlace_scanline_vfir): + Fix a small rounding bug in the MMX implementation, the MMX + implementation doesn't actually need MMXEXT instructions so don't mark + it as such. + + Handle odd widths in both implementations. 
+ 2008-06-22 Stefan Kost * ext/resindvd/rsnbasesrc.c: diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c index c25af036e2..66b8799d9a 100644 --- a/gst/deinterlace2/tvtime/greedy.c +++ b/gst/deinterlace2/tvtime/greedy.c @@ -60,135 +60,14 @@ copy_scanline (GstDeinterlace2 * object, blit_packed422_scanline (output, data->m1, object->frame_width); } -static int GreedyMaxComb = 15; +static const int GreedyMaxComb = 15; -#ifdef HAVE_CPU_I386 -#include "mmx.h" -#include "sse.h" -static void -deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, - deinterlace_scanline_data_t * data, uint8_t * output) -{ - mmx_t MaxComb; - - uint8_t *m0 = data->m0; - - uint8_t *t1 = data->t1; - - uint8_t *b1 = data->b1; - - uint8_t *m2 = data->m2; - - int width = object->frame_width; - - // How badly do we let it weave? 0-255 - MaxComb.ub[0] = GreedyMaxComb; - MaxComb.ub[1] = GreedyMaxComb; - MaxComb.ub[2] = GreedyMaxComb; - MaxComb.ub[3] = GreedyMaxComb; - MaxComb.ub[4] = GreedyMaxComb; - MaxComb.ub[5] = GreedyMaxComb; - MaxComb.ub[6] = GreedyMaxComb; - MaxComb.ub[7] = GreedyMaxComb; - - // L2 == m0 - // L1 == t1 - // L3 == b1 - // LP2 == m2 - - width /= 4; - while (width--) { - movq_m2r (*t1, mm1); // L1 - movq_m2r (*m0, mm2); // L2 - movq_m2r (*b1, mm3); // L3 - movq_m2r (*m2, mm0); // LP2 - - // average L1 and L3 leave result in mm4 - movq_r2r (mm1, mm4); // L1 - pavgb_r2r (mm3, mm4); // (L1 + L3)/2 - - - // get abs value of possible L2 comb - movq_r2r (mm2, mm7); // L2 - psubusb_r2r (mm4, mm7); // L2 - avg - movq_r2r (mm4, mm5); // avg - psubusb_r2r (mm2, mm5); // avg - L2 - por_r2r (mm7, mm5); // abs(avg-L2) - movq_r2r (mm4, mm6); // copy of avg for later - - - // get abs value of possible LP2 comb - movq_r2r (mm0, mm7); // LP2 - psubusb_r2r (mm4, mm7); // LP2 - avg - psubusb_r2r (mm0, mm4); // avg - LP2 - por_r2r (mm7, mm4); // abs(avg-LP2) - - // use L2 or LP2 depending upon which makes smaller comb - psubusb_r2r (mm5, mm4); // see if it 
goes to zero - psubusb_r2r (mm5, mm5); // 0 - pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 - pcmpeqb_r2r (mm4, mm5); // opposite of mm4 - - // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 - pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 - pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 - por_r2r (mm5, mm4); // may the best win - - // Now lets clip our chosen value to be not outside of the range - // of the high/low range L1-L3 by more than abs(L1-L3) - // This allows some comb but limits the damages and also allows more - // detail than a boring oversmoothed clip. - - movq_r2r (mm1, mm2); // copy L1 - psubusb_r2r (mm3, mm2); // - L3, with saturation - paddusb_r2r (mm3, mm2); // now = Max(L1,L3) - - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm1, mm7); // - L1 - paddusb_r2r (mm7, mm3); // add, may sat at fff.. - psubusb_r2r (mm7, mm3); // now = Min(L1,L3) - - // allow the value to be above the high or below the low by amt of MaxComb - paddusb_m2r (MaxComb, mm2); // increase max by diff - psubusb_m2r (MaxComb, mm3); // lower min by diff - - psubusb_r2r (mm3, mm4); // best - Min - paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) - - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3) - paddusb_r2r (mm7, mm2); // add may sat at FFF.. - psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped - - movntq_r2m (mm2, *output); // move in our clipped best - - // Advance to the next set of pixels. 
- output += 8; - m0 += 8; - t1 += 8; - b1 += 8; - m2 += 8; - } - sfence (); - emms (); -} -#endif - -static void +static inline void deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object, - deinterlace_scanline_data_t * data, uint8_t * output) + uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output, + int width) { - uint8_t *m0 = data->m0; - - uint8_t *t1 = data->t1; - - uint8_t *b1 = data->b1; - - uint8_t *m2 = data->m2; - - int width = 2 * object->frame_width; - - uint16_t avg, l2_diff, lp2_diff, max, min, best; + int avg, l2_diff, lp2_diff, max, min, best; // L2 == m0 // L1 == t1 @@ -211,10 +90,15 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object, if (max < 256 - GreedyMaxComb) max += GreedyMaxComb; + else + max = 255; + if (min > GreedyMaxComb) min -= GreedyMaxComb; + else + min = 0; - *output = MIN (MAX (best, min), max); + *output = CLAMP (best, min, max); // Advance to the next set of pixels. output += 1; @@ -225,18 +109,118 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object, } } +#ifdef HAVE_CPU_I386 +#include "mmx.h" +#include "sse.h" + +static void +deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object, + uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output, + int width) +{ + mmx_t MaxComb; + + // How badly do we let it weave? 
0-255 + MaxComb.ub[0] = GreedyMaxComb; + MaxComb.ub[1] = GreedyMaxComb; + MaxComb.ub[2] = GreedyMaxComb; + MaxComb.ub[3] = GreedyMaxComb; + MaxComb.ub[4] = GreedyMaxComb; + MaxComb.ub[5] = GreedyMaxComb; + MaxComb.ub[6] = GreedyMaxComb; + MaxComb.ub[7] = GreedyMaxComb; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + for (; width > 7; width -= 8) { + movq_m2r (*t1, mm1); // L1 + movq_m2r (*m0, mm2); // L2 + movq_m2r (*b1, mm3); // L3 + movq_m2r (*m2, mm0); // LP2 + + // average L1 and L3 leave result in mm4 + movq_r2r (mm1, mm4); // L1 + pavgb_r2r (mm3, mm4); // (L1 + L3)/2 + + // get abs value of possible L2 comb + movq_r2r (mm2, mm7); // L2 + psubusb_r2r (mm4, mm7); // L2 - avg + movq_r2r (mm4, mm5); // avg + psubusb_r2r (mm2, mm5); // avg - L2 + por_r2r (mm7, mm5); // abs(avg-L2) + + // get abs value of possible LP2 comb + movq_r2r (mm0, mm7); // LP2 + psubusb_r2r (mm4, mm7); // LP2 - avg + psubusb_r2r (mm0, mm4); // avg - LP2 + por_r2r (mm7, mm4); // abs(avg-LP2) + + // use L2 or LP2 depending upon which makes smaller comb + psubusb_r2r (mm5, mm4); // see if it goes to zero + pxor_r2r (mm5, mm5); // 0 + pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 + pcmpeqb_r2r (mm4, mm5); // opposite of mm4 + + // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = ff + pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 + pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 + por_r2r (mm5, mm4); // may the best win + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than MaxComb + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. 
+ + movq_r2r (mm1, mm2); // copy L1 + pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) + + pminub_r2r (mm1, mm3); // now = Min(L1,L3) + + // allow the value to be above the high or below the low by amt of MaxComb + paddusb_m2r (MaxComb, mm2); // increase max by diff + psubusb_m2r (MaxComb, mm3); // lower min by diff + + + pmaxub_r2r (mm3, mm4); // now = Max(best, Min(L1,L3) - MaxComb) + pminub_r2r (mm4, mm2); // now = Min(Max(best, Min(L1,L3) - MaxComb), Max(L1,L3) + MaxComb) = best clipped + + movq_r2m (mm2, *output); // move in our clipped best + + // Advance to the next set of pixels. + output += 8; + m0 += 8; + t1 += 8; + b1 += 8; + m2 += 8; + } + sfence (); /* NOTE(review): the store above is now movq rather than movntq, so this fence looks unnecessary -- confirm before removing */ + emms (); + + if (width > 0) + deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output, + width); +} + +#endif + static void deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object, deinterlace_scanline_data_t * data, uint8_t * output) { #ifdef HAVE_CPU_I386 - if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { - deinterlace_greedy_packed422_scanline_sse (object, data, output); + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { + deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1, + data->b1, data->m2, output, 2 * object->frame_width); } else { - deinterlace_greedy_packed422_scanline_c (object, data, output); + deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, + data->b1, data->m2, output, 2 * object->frame_width); } #else - deinterlace_greedy_packed422_scanline_c (object, data, output); + deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1, + data->m2, output, 2 * object->frame_width); #endif } diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c index f32be65475..479ee44060 100644 --- a/gst/deinterlace2/tvtime/vfir.c +++ b/gst/deinterlace2/tvtime/vfir.c @@ -49,58 +49,10 
@@ * filter taps here are: [-1 4 2 4 -1]. 
*/ -#ifdef HAVE_CPU_I386 -#include "mmx.h" -static void -deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, - uint8_t * lum_m3, uint8_t * lum_m2, - uint8_t * lum_m1, uint8_t * lum, int size) -{ - mmx_t rounder; - - rounder.uw[0] = 4; - rounder.uw[1] = 4; - rounder.uw[2] = 4; - rounder.uw[3] = 4; - pxor_r2r (mm7, mm7); - movq_m2r (rounder, mm6); - - for (; size > 3; size -= 4) { - movd_m2r (lum_m4[0], mm0); - movd_m2r (lum_m3[0], mm1); - movd_m2r (lum_m2[0], mm2); - movd_m2r (lum_m1[0], mm3); - movd_m2r (lum[0], mm4); - punpcklbw_r2r (mm7, mm0); - punpcklbw_r2r (mm7, mm1); - punpcklbw_r2r (mm7, mm2); - punpcklbw_r2r (mm7, mm3); - punpcklbw_r2r (mm7, mm4); - paddw_r2r (mm3, mm1); - psllw_i2r (1, mm2); - paddw_r2r (mm4, mm0); - psllw_i2r (2, mm1); // 2 - paddw_r2r (mm6, mm2); - paddw_r2r (mm2, mm1); - psubusw_r2r (mm0, mm1); - psrlw_i2r (3, mm1); // 3 - packuswb_r2r (mm7, mm1); - movd_r2m (mm1, dst[0]); - lum_m4 += 4; - lum_m3 += 4; - lum_m2 += 4; - lum_m1 += 4; - lum += 4; - dst += 4; - } - emms (); -} -#endif - /** * C implementation. 
*/ -static void +static inline void deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3, uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size) @@ -123,6 +75,59 @@ deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, } } +#ifdef HAVE_CPU_I386 +#include "mmx.h" +static void +deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, + uint8_t * lum_m3, uint8_t * lum_m2, + uint8_t * lum_m1, uint8_t * lum, int size) +{ + mmx_t rounder; + + rounder.uw[0] = 4; + rounder.uw[1] = 4; + rounder.uw[2] = 4; + rounder.uw[3] = 4; + pxor_r2r (mm7, mm7); /* mm7 = 0, used to widen bytes to 16-bit words */ + /* Load all four 16-bit rounding words at once: movd + punpcklbw would leave 4,0,4,0 in mm6 and drop the rounding term for every odd pixel */ + movq_m2r (rounder, mm6); + + for (; size > 3; size -= 4) { + movd_m2r (*lum_m4, mm0); + movd_m2r (*lum_m3, mm1); + movd_m2r (*lum_m2, mm2); + movd_m2r (*lum_m1, mm3); + movd_m2r (*lum, mm4); + punpcklbw_r2r (mm7, mm0); + punpcklbw_r2r (mm7, mm1); + punpcklbw_r2r (mm7, mm2); + punpcklbw_r2r (mm7, mm3); + punpcklbw_r2r (mm7, mm4); + paddw_r2r (mm3, mm1); /* lum_m3 + lum_m1 */ + psllw_i2r (1, mm2); /* 2 * lum_m2 */ + paddw_r2r (mm4, mm0); /* lum_m4 + lum (the two -1 taps) */ + psllw_i2r (2, mm1); // 2 + paddw_r2r (mm6, mm2); /* add the rounding constant */ + paddw_r2r (mm2, mm1); + psubusw_r2r (mm0, mm1); /* subtract the outer taps, saturating at 0 */ + psrlw_i2r (3, mm1); // 3 + packuswb_r2r (mm7, mm1); /* back to bytes with unsigned saturation */ + movd_r2m (mm1, *dst); + lum_m4 += 4; + lum_m3 += 4; + lum_m2 += 4; + lum_m1 += 4; + lum += 4; + dst += 4; + } + emms (); + + /* Handle odd widths */ + if (size > 0) + deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size); +} +#endif + /* * The commented-out method below that uses the bottom_field member is more * like the filter as specified in the MPEG2 spec, but it doesn't seem to @@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object, deinterlace_scanline_data_t * data, uint8_t * output) { #ifdef HAVE_CPU_I386 - if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { - deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0, + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) { + deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0, data->bb1, object->frame_width * 
2); } else { deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,