From 6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= <slomo@circular-chaos.org>
Date: Tue, 24 Jun 2008 09:10:46 +0000
Subject: [PATCH] gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to
 produce correct results and optimize the MMXEXT implementation

Original commit message from CVS:
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_c),
(deinterlace_greedy_packed422_scanline_mmxext),
(deinterlace_greedy_packed422_scanline):
Fix the C implementation to produce correct results and optimize the
MMXEXT implementation.
Handle odd widths and don't read over array boundaries in the MMXEXT
implementation.
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
Fix a small rounding bug in the MMX implementation. The MMX
implementation doesn't actually need MMXEXT instructions, so don't mark
it as such.
Handle odd widths in both implementations.
---
 ChangeLog                        |  20 +++
 gst/deinterlace2/tvtime/greedy.c | 246 +++++++++++++++----------------
 gst/deinterlace2/tvtime/vfir.c   | 107 +++++++-------
 3 files changed, 191 insertions(+), 182 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ef05fa1156..a1c50b466f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,23 @@
+2008-06-24  Sebastian Dröge  <sebastian.droege@collabora.co.uk>
+
+	* gst/deinterlace2/tvtime/greedy.c:
+	(deinterlace_greedy_packed422_scanline_c),
+	(deinterlace_greedy_packed422_scanline_mmxext),
+	(deinterlace_greedy_packed422_scanline):
+	Fix the C implementation to produce correct results and optimize the
+	MMXEXT implementation.
+
+	Handle odd widths and don't read over array boundaries in the MMXEXT
+	implementation.
+
+	* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
+	(deinterlace_line_mmx), (deinterlace_scanline_vfir):
+	Fix a small rounding bug in the MMX implementation. The MMX
+	implementation doesn't actually need MMXEXT instructions, so don't mark
+	it as such.
+
+	Handle odd widths in both implementations.
+
 2008-06-22  Stefan Kost  <ensonic@users.sf.net>
 
 	* ext/resindvd/rsnbasesrc.c:
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c
index c25af036e2..66b8799d9a 100644
--- a/gst/deinterlace2/tvtime/greedy.c
+++ b/gst/deinterlace2/tvtime/greedy.c
@@ -60,135 +60,14 @@ copy_scanline (GstDeinterlace2 * object,
   blit_packed422_scanline (output, data->m1, object->frame_width);
 }
 
-static int GreedyMaxComb = 15;
+static const int GreedyMaxComb = 15;
 
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-#include "sse.h"
-static void
-deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
-{
-  mmx_t MaxComb;
-
-  uint8_t *m0 = data->m0;
-
-  uint8_t *t1 = data->t1;
-
-  uint8_t *b1 = data->b1;
-
-  uint8_t *m2 = data->m2;
-
-  int width = object->frame_width;
-
-  // How badly do we let it weave? 0-255
-  MaxComb.ub[0] = GreedyMaxComb;
-  MaxComb.ub[1] = GreedyMaxComb;
-  MaxComb.ub[2] = GreedyMaxComb;
-  MaxComb.ub[3] = GreedyMaxComb;
-  MaxComb.ub[4] = GreedyMaxComb;
-  MaxComb.ub[5] = GreedyMaxComb;
-  MaxComb.ub[6] = GreedyMaxComb;
-  MaxComb.ub[7] = GreedyMaxComb;
-
-  // L2 == m0
-  // L1 == t1
-  // L3 == b1
-  // LP2 == m2
-
-  width /= 4;
-  while (width--) {
-    movq_m2r (*t1, mm1);        // L1
-    movq_m2r (*m0, mm2);        // L2
-    movq_m2r (*b1, mm3);        // L3
-    movq_m2r (*m2, mm0);        // LP2
-
-    // average L1 and L3 leave result in mm4
-    movq_r2r (mm1, mm4);        // L1
-    pavgb_r2r (mm3, mm4);       // (L1 + L3)/2
-
-
-    // get abs value of possible L2 comb
-    movq_r2r (mm2, mm7);        // L2
-    psubusb_r2r (mm4, mm7);     // L2 - avg
-    movq_r2r (mm4, mm5);        // avg
-    psubusb_r2r (mm2, mm5);     // avg - L2
-    por_r2r (mm7, mm5);         // abs(avg-L2)
-    movq_r2r (mm4, mm6);        // copy of avg for later
-
-
-    // get abs value of possible LP2 comb
-    movq_r2r (mm0, mm7);        // LP2
-    psubusb_r2r (mm4, mm7);     // LP2 - avg
-    psubusb_r2r (mm0, mm4);     // avg - LP2
-    por_r2r (mm7, mm4);         // abs(avg-LP2)
-
-    // use L2 or LP2 depending upon which makes smaller comb
-    psubusb_r2r (mm5, mm4);     // see if it goes to zero
-    psubusb_r2r (mm5, mm5);     // 0
-    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
-    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4
-
-    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
-    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
-    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
-    por_r2r (mm5, mm4);         // may the best win
-
-    // Now lets clip our chosen value to be not outside of the range
-    // of the high/low range L1-L3 by more than abs(L1-L3)
-    // This allows some comb but limits the damages and also allows more
-    // detail than a boring oversmoothed clip.
-
-    movq_r2r (mm1, mm2);        // copy L1
-    psubusb_r2r (mm3, mm2);     // - L3, with saturation
-    paddusb_r2r (mm3, mm2);     // now = Max(L1,L3)
-
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm1, mm7);     // - L1 
-    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
-    psubusb_r2r (mm7, mm3);     // now = Min(L1,L3)
-
-    // allow the value to be above the high or below the low by amt of MaxComb
-    paddusb_m2r (MaxComb, mm2); // increase max by diff
-    psubusb_m2r (MaxComb, mm3); // lower min by diff
-
-    psubusb_r2r (mm3, mm4);     // best - Min
-    paddusb_r2r (mm3, mm4);     // now = Max(best,Min(L1,L3)
-
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm4, mm7);     // - Max(best,Min(best,L3) 
-    paddusb_r2r (mm7, mm2);     // add may sat at FFF..
-    psubusb_r2r (mm7, mm2);     // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
-
-    movntq_r2m (mm2, *output);  // move in our clipped best
-
-    // Advance to the next set of pixels.
-    output += 8;
-    m0 += 8;
-    t1 += 8;
-    b1 += 8;
-    m2 += 8;
-  }
-  sfence ();
-  emms ();
-}
-#endif
-
-static void
+static inline void
 deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
 {
-  uint8_t *m0 = data->m0;
-
-  uint8_t *t1 = data->t1;
-
-  uint8_t *b1 = data->b1;
-
-  uint8_t *m2 = data->m2;
-
-  int width = 2 * object->frame_width;
-
-  uint16_t avg, l2_diff, lp2_diff, max, min, best;
+  int avg, l2_diff, lp2_diff, max, min, best;
 
   // L2 == m0
   // L1 == t1
@@ -211,10 +90,15 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
 
     if (max < 256 - GreedyMaxComb)
       max += GreedyMaxComb;
+    else
+      max = 255;
+
     if (min > GreedyMaxComb)
       min -= GreedyMaxComb;
+    else
+      min = 0;
 
-    *output = MIN (MAX (best, min), max);
+    *output = CLAMP (best, min, max);
 
     // Advance to the next set of pixels.
     output += 1;
@@ -225,18 +109,118 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
   }
 }
 
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+#include "sse.h"
+/* MMXEXT path: 8 bytes per iteration; leftover (< 8) bytes use the C code. */
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
+{
+  mmx_t MaxComb;
+
+  // How badly do we let it weave? 0-255 (same value replicated in all 8 bytes)
+  MaxComb.ub[0] = GreedyMaxComb;
+  MaxComb.ub[1] = GreedyMaxComb;
+  MaxComb.ub[2] = GreedyMaxComb;
+  MaxComb.ub[3] = GreedyMaxComb;
+  MaxComb.ub[4] = GreedyMaxComb;
+  MaxComb.ub[5] = GreedyMaxComb;
+  MaxComb.ub[6] = GreedyMaxComb;
+  MaxComb.ub[7] = GreedyMaxComb;
+
+  // L2 == m0
+  // L1 == t1
+  // L3 == b1
+  // LP2 == m2
+
+  for (; width > 7; width -= 8) {
+    movq_m2r (*t1, mm1);        // L1
+    movq_m2r (*m0, mm2);        // L2
+    movq_m2r (*b1, mm3);        // L3
+    movq_m2r (*m2, mm0);        // LP2
+
+    // average L1 and L3, leave the result in mm4
+    movq_r2r (mm1, mm4);        // L1
+    pavgb_r2r (mm3, mm4);       // (L1 + L3 + 1)/2, pavgb rounds up
+
+    // get abs value of possible L2 comb (OR of the two saturated differences)
+    movq_r2r (mm2, mm7);        // L2
+    psubusb_r2r (mm4, mm7);     // L2 - avg, 0 if negative
+    movq_r2r (mm4, mm5);        // avg
+    psubusb_r2r (mm2, mm5);     // avg - L2, 0 if negative
+    por_r2r (mm7, mm5);         // abs(avg-L2)
+
+    // get abs value of possible LP2 comb
+    movq_r2r (mm0, mm7);        // LP2
+    psubusb_r2r (mm4, mm7);     // LP2 - avg, 0 if negative
+    psubusb_r2r (mm0, mm4);     // avg - LP2, 0 if negative
+    por_r2r (mm7, mm4);         // abs(avg-LP2)
+
+    // use L2 or LP2 depending upon which makes smaller comb
+    psubusb_r2r (mm5, mm4);     // 0 where abs(avg-LP2) <= abs(avg-L2)
+    pxor_r2r (mm5, mm5);        // 0
+    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
+    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4
+
+    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
+    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
+    pand_r2r (mm0, mm4);        // use LP2 if mm4 == ff, else 0
+    por_r2r (mm5, mm4);         // mm4 = best (the chosen candidate)
+
+    // Now clip the chosen value so that it is not outside of the
+    // range Min(L1,L3)..Max(L1,L3) by more than MaxComb.
+    // This allows some comb but limits the damages and also allows more
+    // detail than a boring oversmoothed clip.
+
+    movq_r2r (mm1, mm2);        // copy L1
+    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)
+
+    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)
+
+    // allow the value to be above the high or below the low by amt of MaxComb
+    paddusb_m2r (MaxComb, mm2); // max + MaxComb, saturates at 255
+    psubusb_m2r (MaxComb, mm3); // min - MaxComb, saturates at 0
+
+    // clamp best into [Min(L1,L3) - MaxComb, Max(L1,L3) + MaxComb]
+    pmaxub_r2r (mm3, mm4);      // now = Max(best, lowered min)
+    pminub_r2r (mm4, mm2);      // now = Min(that, raised max) = best clipped
+
+    movq_r2m (mm2, *output);    // move in our clipped best
+
+    // Advance to the next set of pixels.
+    output += 8;
+    m0 += 8;
+    t1 += 8;
+    b1 += 8;
+    m2 += 8;
+  }
+  sfence ();                    // NOTE(review): store is movq, not movntq - still needed?
+  emms ();                      // leave MMX state before running non-MMX code
+  // Fewer than 8 bytes left: finish them with the scalar implementation.
+  if (width > 0)
+    deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
+        width);
+}
+
+#endif
+
 static void
 deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
     deinterlace_scanline_data_t * data, uint8_t * output)
 {
 #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
-    deinterlace_greedy_packed422_scanline_sse (object, data, output);
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
+    deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
   } else {
-    deinterlace_greedy_packed422_scanline_c (object, data, output);
+    deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
   }
 #else
-  deinterlace_greedy_packed422_scanline_c (object, data, output);
+  deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
+      data->m2, output, 2 * object->frame_width);
 #endif
 }
 
diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c
index f32be65475..479ee44060 100644
--- a/gst/deinterlace2/tvtime/vfir.c
+++ b/gst/deinterlace2/tvtime/vfir.c
@@ -49,58 +49,10 @@
  * filter taps here are: [-1 4 2 4 -1].
  */
 
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-static void
-deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
-    uint8_t * lum_m3, uint8_t * lum_m2,
-    uint8_t * lum_m1, uint8_t * lum, int size)
-{
-  mmx_t rounder;
-
-  rounder.uw[0] = 4;
-  rounder.uw[1] = 4;
-  rounder.uw[2] = 4;
-  rounder.uw[3] = 4;
-  pxor_r2r (mm7, mm7);
-  movq_m2r (rounder, mm6);
-
-  for (; size > 3; size -= 4) {
-    movd_m2r (lum_m4[0], mm0);
-    movd_m2r (lum_m3[0], mm1);
-    movd_m2r (lum_m2[0], mm2);
-    movd_m2r (lum_m1[0], mm3);
-    movd_m2r (lum[0], mm4);
-    punpcklbw_r2r (mm7, mm0);
-    punpcklbw_r2r (mm7, mm1);
-    punpcklbw_r2r (mm7, mm2);
-    punpcklbw_r2r (mm7, mm3);
-    punpcklbw_r2r (mm7, mm4);
-    paddw_r2r (mm3, mm1);
-    psllw_i2r (1, mm2);
-    paddw_r2r (mm4, mm0);
-    psllw_i2r (2, mm1);         // 2
-    paddw_r2r (mm6, mm2);
-    paddw_r2r (mm2, mm1);
-    psubusw_r2r (mm0, mm1);
-    psrlw_i2r (3, mm1);         // 3
-    packuswb_r2r (mm7, mm1);
-    movd_r2m (mm1, dst[0]);
-    lum_m4 += 4;
-    lum_m3 += 4;
-    lum_m2 += 4;
-    lum_m1 += 4;
-    lum += 4;
-    dst += 4;
-  }
-  emms ();
-}
-#endif
-
 /**
   * C implementation.
   */
-static void
+static inline void
 deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
     uint8_t * lum_m3, uint8_t * lum_m2,
     uint8_t * lum_m1, uint8_t * lum, int size)
@@ -123,6 +75,59 @@ deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
   }
 }
 
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+static void
+deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
+    uint8_t * lum_m3, uint8_t * lum_m2,
+    uint8_t * lum_m1, uint8_t * lum, int size)
+{
+  mmx_t rounder;
+
+  /* Rounding term: +4 in every 16-bit lane, added before the final >> 3. */
+  rounder.uw[0] = 4;
+  rounder.uw[1] = 4;
+  rounder.uw[2] = 4;
+  rounder.uw[3] = 4;
+  pxor_r2r (mm7, mm7);          // mm7 = 0, used to widen bytes to words
+  movq_m2r (rounder, mm6);      // all 4 lanes; movd + punpcklbw left {4,0,4,0}
+
+  for (; size > 3; size -= 4) { // 4 output bytes per iteration
+    movd_m2r (*lum_m4, mm0);    // load 4 bytes from each of the 5 input lines
+    movd_m2r (*lum_m3, mm1);
+    movd_m2r (*lum_m2, mm2);
+    movd_m2r (*lum_m1, mm3);
+    movd_m2r (*lum, mm4);
+    punpcklbw_r2r (mm7, mm0);   // zero-extend each byte to a 16-bit word
+    punpcklbw_r2r (mm7, mm1);
+    punpcklbw_r2r (mm7, mm2);
+    punpcklbw_r2r (mm7, mm3);
+    punpcklbw_r2r (mm7, mm4);
+    paddw_r2r (mm3, mm1);       // lum_m3 + lum_m1
+    psllw_i2r (1, mm2);         // 2 * lum_m2
+    paddw_r2r (mm4, mm0);       // lum_m4 + lum (the two negative taps)
+    psllw_i2r (2, mm1);         // 4 * (lum_m3 + lum_m1)
+    paddw_r2r (mm6, mm2);       // 2 * lum_m2 + rounder
+    paddw_r2r (mm2, mm1);       // sum of the positive taps + rounder
+    psubusw_r2r (mm0, mm1);     // minus the negative taps, saturating at 0
+    psrlw_i2r (3, mm1);         // divide by 8
+    packuswb_r2r (mm7, mm1);    // back to bytes, saturating at 255
+    movd_r2m (mm1, *dst);
+    lum_m4 += 4;
+    lum_m3 += 4;
+    lum_m2 += 4;
+    lum_m1 += 4;
+    lum += 4;
+    dst += 4;
+  }
+  emms ();                      // leave MMX state before running non-MMX code
+
+  /* Handle odd widths: the 1-3 leftover bytes go through the C version */
+  if (size > 0)
+    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
+}
+#endif
+
 /*
  * The commented-out method below that uses the bottom_field member is more
  * like the filter as specified in the MPEG2 spec, but it doesn't seem to
@@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
     deinterlace_scanline_data_t * data, uint8_t * output)
 {
 #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
-    deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+    deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
         data->bb1, object->frame_width * 2);
   } else {
     deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,