diff --git a/gst/videomixer/Makefile.am b/gst/videomixer/Makefile.am index eeaa184b43..fee1c06a56 100644 --- a/gst/videomixer/Makefile.am +++ b/gst/videomixer/Makefile.am @@ -1,6 +1,10 @@ plugin_LTLIBRARIES = libgstvideomixer.la +ORC_SOURCE=blendorc +include $(top_srcdir)/common/orc.mak + libgstvideomixer_la_SOURCES = videomixer.c blend.c +nodist_libgstvideomixer_la_SOURCES = $(ORC_NODIST_SOURCES) libgstvideomixer_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) \ $(GST_BASE_CFLAGS) $(GST_CONTROLLER_CFLAGS) $(GST_CFLAGS) $(ORC_CFLAGS) libgstvideomixer_la_LIBADD = $(GST_PLUGINS_BASE_LIBS) -lgstvideo-@GST_MAJORMINOR@ \ diff --git a/gst/videomixer/blend.c b/gst/videomixer/blend.c index ebc3816d35..787889d101 100644 --- a/gst/videomixer/blend.c +++ b/gst/videomixer/blend.c @@ -27,6 +27,7 @@ #endif #include "blend.h" +#include "blendorc.h" #include @@ -40,35 +41,11 @@ #ifdef HAVE_GCC_ASM #if defined(HAVE_ORC) && (defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)) #define BUILD_X86_ASM - -#define GENERIC -#include "blend_mmx.h" -#undef GENERIC #endif #endif /* Below are the implementations of everything */ -inline static void -_blend_u8_c (guint8 * dest, const guint8 * src, - gint src_stride, gint dest_stride, gint src_width, gint src_height, - gint dest_width, gint b_alpha) -{ - gint i, j; - gint src_add = src_stride - src_width; - gint dest_add = dest_stride - dest_width; - - for (i = 0; i < src_height; i++) { - for (j = 0; j < src_width; j++) { - *dest = BLEND (*dest, *src, b_alpha); - dest++; - src++; - } - src += src_add; - dest += dest_add; - } -} - /* A32 is for AYUV, ARGB and BGRA */ #define BLEND_A32(name, LOOP) \ static void \ @@ -181,11 +158,12 @@ A32_CHECKER_C (ayuv, FALSE, 0, 1, 2, 3); #define YUV_TO_G(Y,U,V) (CLAMP (1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128), 0, 255)) #define YUV_TO_B(Y,U,V) (CLAMP (1.164 * (Y - 16) + 2.018 * (U - 128), 0, 255)) -#define A32_COLOR(name, RGB, LOOP) \ +#define A32_COLOR(name, RGB, A, C1, C2, C3) \ static void \ 
fill_color_##name (guint8 * dest, gint width, gint height, gint Y, gint U, gint V) \ { \ gint c1, c2, c3; \ + guint32 val; \ \ if (RGB) { \ c1 = YUV_TO_R (Y, U, V); \ @@ -196,41 +174,23 @@ fill_color_##name (guint8 * dest, gint width, gint height, gint Y, gint U, gint c2 = U; \ c3 = V; \ } \ - LOOP (dest, height, width, c1, c2, c3); \ -} - -#define A32_COLOR_LOOP_C(name, A, C1, C2, C3) \ -static inline void \ -_fill_color_loop_##name##_c (guint8 *dest, gint height, gint width, gint c1, gint c2, gint c3) { \ - gint i, j; \ + val = GUINT32_FROM_BE ((0xff << A) | (c1 << C1) | (c2 << C2) | (c3 << C3)); \ \ - for (i = 0; i < height; i++) { \ - for (j = 0; j < width; j++) { \ - dest[A] = 0xff; \ - dest[C1] = c1; \ - dest[C2] = c2; \ - dest[C3] = c3; \ - dest += 4; \ - } \ - } \ + orc_splat_u32 ((guint32 *) dest, val, height * width); \ } -A32_COLOR_LOOP_C (ac1c2c3, 0, 1, 2, 3); -A32_COLOR_LOOP_C (c3c2c1a, 3, 2, 1, 0); -A32_COLOR_LOOP_C (ac3c2c1, 0, 3, 2, 1); -A32_COLOR_LOOP_C (c1c2c3a, 1, 2, 3, 0); -A32_COLOR (argb_c, TRUE, _fill_color_loop_ac1c2c3_c); -A32_COLOR (bgra_c, TRUE, _fill_color_loop_c3c2c1a_c); -A32_COLOR (abgr_c, TRUE, _fill_color_loop_ac3c2c1_c); -A32_COLOR (rgba_c, TRUE, _fill_color_loop_c1c2c3a_c); -A32_COLOR (ayuv_c, FALSE, _fill_color_loop_ac1c2c3_c); +A32_COLOR (argb, TRUE, 24, 16, 8, 0); +A32_COLOR (bgra, TRUE, 0, 8, 16, 24); +A32_COLOR (abgr, TRUE, 24, 0, 8, 16); +A32_COLOR (rgba, TRUE, 0, 24, 16, 8); +A32_COLOR (ayuv, FALSE, 24, 16, 8, 0); /* Y444, Y42B, I420, YV12, Y41B */ -#define PLANAR_YUV_BLEND(name,format_name,format_enum,x_round,y_round,MEMCPY,BLENDLOOP) \ +#define PLANAR_YUV_BLEND(format_name,format_enum,x_round,y_round,MEMCPY,BLENDLOOP) \ inline static void \ -_blend_##format_name##_##name (const guint8 * src, guint8 * dest, \ +_blend_##format_name (const guint8 * src, guint8 * dest, \ gint src_stride, gint dest_stride, gint src_width, gint src_height, \ - gint dest_width, gdouble src_alpha) \ + gdouble src_alpha) \ { \ gint i; \ gint 
b_alpha; \ @@ -254,11 +214,11 @@ _blend_##format_name##_##name (const guint8 * src, guint8 * dest, \ \ b_alpha = CLAMP ((gint) (src_alpha * 256), 0, 256); \ \ - BLENDLOOP(dest, src, src_stride, dest_stride, src_width, src_height, dest_width, b_alpha); \ + BLENDLOOP(dest, dest_stride, src, src_stride, b_alpha, src_width, src_height); \ } \ \ static void \ -blend_##format_name##_##name (const guint8 * src, gint xpos, gint ypos, \ +blend_##format_name (const guint8 * src, gint xpos, gint ypos, \ gint src_width, gint src_height, gdouble src_alpha, \ guint8 * dest, gint dest_width, gint dest_height) \ { \ @@ -317,11 +277,11 @@ blend_##format_name##_##name (const guint8 * src, gint xpos, gint ypos, \ comp_ypos = (ypos == 0) ? 0 : gst_video_format_get_component_height (format_enum, 0, ypos); \ comp_xoffset = (xoffset == 0) ? 0 : gst_video_format_get_component_width (format_enum, 0, xoffset); \ comp_yoffset = (yoffset == 0) ? 0 : gst_video_format_get_component_height (format_enum, 0, yoffset); \ - _blend_##format_name##_##name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ + _blend_##format_name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ b_dest + comp_xpos + comp_ypos * dest_comp_rowstride, \ src_comp_rowstride, \ dest_comp_rowstride, src_comp_width, src_comp_height, \ - dest_comp_width, src_alpha); \ + src_alpha); \ \ b_src = src + gst_video_format_get_component_offset (format_enum, 1, src_width, src_height); \ b_dest = dest + gst_video_format_get_component_offset (format_enum, 1, dest_width, dest_height); \ @@ -335,11 +295,11 @@ blend_##format_name##_##name (const guint8 * src, gint xpos, gint ypos, \ comp_ypos = (ypos == 0) ? 0 : gst_video_format_get_component_height (format_enum, 1, ypos); \ comp_xoffset = (xoffset == 0) ? 0 : gst_video_format_get_component_width (format_enum, 1, xoffset); \ comp_yoffset = (yoffset == 0) ? 
0 : gst_video_format_get_component_height (format_enum, 1, yoffset); \ - _blend_##format_name##_##name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ + _blend_##format_name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ b_dest + comp_xpos + comp_ypos * dest_comp_rowstride, \ src_comp_rowstride, \ dest_comp_rowstride, src_comp_width, src_comp_height, \ - dest_comp_width, src_alpha); \ + src_alpha); \ \ b_src = src + gst_video_format_get_component_offset (format_enum, 2, src_width, src_height); \ b_dest = dest + gst_video_format_get_component_offset (format_enum, 2, dest_width, dest_height); \ @@ -353,16 +313,16 @@ blend_##format_name##_##name (const guint8 * src, gint xpos, gint ypos, \ comp_ypos = (ypos == 0) ? 0 : gst_video_format_get_component_height (format_enum, 2, ypos); \ comp_xoffset = (xoffset == 0) ? 0 : gst_video_format_get_component_width (format_enum, 2, xoffset); \ comp_yoffset = (yoffset == 0) ? 0 : gst_video_format_get_component_height (format_enum, 2, yoffset); \ - _blend_##format_name##_##name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ + _blend_##format_name (b_src + comp_xoffset + comp_yoffset * src_comp_rowstride, \ b_dest + comp_xpos + comp_ypos * dest_comp_rowstride, \ src_comp_rowstride, \ dest_comp_rowstride, src_comp_width, src_comp_height, \ - dest_comp_width, src_alpha); \ + src_alpha); \ } -#define PLANAR_YUV_FILL_CHECKER(name, format_name, format_enum, MEMSET) \ +#define PLANAR_YUV_FILL_CHECKER(format_name, format_enum, MEMSET) \ static void \ -fill_checker_##format_name##_##name (guint8 * dest, gint width, gint height) \ +fill_checker_##format_name (guint8 * dest, gint width, gint height) \ { \ gint i, j; \ static const int tab[] = { 80, 160, 80, 160 }; \ @@ -403,9 +363,9 @@ fill_checker_##format_name##_##name (guint8 * dest, gint width, gint height) \ } \ } -#define PLANAR_YUV_FILL_COLOR(name,format_name,format_enum,MEMSET) \ +#define PLANAR_YUV_FILL_COLOR(format_name,format_enum,MEMSET) \ 
static void \ -fill_color_##format_name##_##name (guint8 * dest, gint width, gint height, \ +fill_color_##format_name (guint8 * dest, gint width, gint height, \ gint colY, gint colU, gint colV) \ { \ guint8 *p; \ @@ -446,23 +406,23 @@ fill_color_##format_name##_##name (guint8 * dest, gint width, gint height, \ #define GST_ROUND_UP_1(x) (x) -PLANAR_YUV_BLEND (c, i420, GST_VIDEO_FORMAT_I420, GST_ROUND_UP_2, - GST_ROUND_UP_2, memcpy, _blend_u8_c); -PLANAR_YUV_FILL_CHECKER (c, i420, GST_VIDEO_FORMAT_I420, memset); -PLANAR_YUV_FILL_COLOR (c, i420, GST_VIDEO_FORMAT_I420, memset); -PLANAR_YUV_FILL_COLOR (c, yv12, GST_VIDEO_FORMAT_YV12, memset); -PLANAR_YUV_BLEND (c, y444, GST_VIDEO_FORMAT_Y444, GST_ROUND_UP_1, - GST_ROUND_UP_1, memcpy, _blend_u8_c); -PLANAR_YUV_FILL_CHECKER (c, y444, GST_VIDEO_FORMAT_Y444, memset); -PLANAR_YUV_FILL_COLOR (c, y444, GST_VIDEO_FORMAT_Y444, memset); -PLANAR_YUV_BLEND (c, y42b, GST_VIDEO_FORMAT_Y42B, GST_ROUND_UP_2, - GST_ROUND_UP_1, memcpy, _blend_u8_c); -PLANAR_YUV_FILL_CHECKER (c, y42b, GST_VIDEO_FORMAT_Y42B, memset); -PLANAR_YUV_FILL_COLOR (c, y42b, GST_VIDEO_FORMAT_Y42B, memset); -PLANAR_YUV_BLEND (c, y41b, GST_VIDEO_FORMAT_Y41B, GST_ROUND_UP_4, - GST_ROUND_UP_1, memcpy, _blend_u8_c); -PLANAR_YUV_FILL_CHECKER (c, y41b, GST_VIDEO_FORMAT_Y41B, memset); -PLANAR_YUV_FILL_COLOR (c, y41b, GST_VIDEO_FORMAT_Y41B, memset); +PLANAR_YUV_BLEND (i420, GST_VIDEO_FORMAT_I420, GST_ROUND_UP_2, + GST_ROUND_UP_2, memcpy, orc_blend_u8); +PLANAR_YUV_FILL_CHECKER (i420, GST_VIDEO_FORMAT_I420, memset); +PLANAR_YUV_FILL_COLOR (i420, GST_VIDEO_FORMAT_I420, memset); +PLANAR_YUV_FILL_COLOR (yv12, GST_VIDEO_FORMAT_YV12, memset); +PLANAR_YUV_BLEND (y444, GST_VIDEO_FORMAT_Y444, GST_ROUND_UP_1, + GST_ROUND_UP_1, memcpy, orc_blend_u8); +PLANAR_YUV_FILL_CHECKER (y444, GST_VIDEO_FORMAT_Y444, memset); +PLANAR_YUV_FILL_COLOR (y444, GST_VIDEO_FORMAT_Y444, memset); +PLANAR_YUV_BLEND (y42b, GST_VIDEO_FORMAT_Y42B, GST_ROUND_UP_2, + GST_ROUND_UP_1, memcpy, orc_blend_u8); 
+PLANAR_YUV_FILL_CHECKER (y42b, GST_VIDEO_FORMAT_Y42B, memset); +PLANAR_YUV_FILL_COLOR (y42b, GST_VIDEO_FORMAT_Y42B, memset); +PLANAR_YUV_BLEND (y41b, GST_VIDEO_FORMAT_Y41B, GST_ROUND_UP_4, + GST_ROUND_UP_1, memcpy, orc_blend_u8); +PLANAR_YUV_FILL_CHECKER (y41b, GST_VIDEO_FORMAT_Y41B, memset); +PLANAR_YUV_FILL_COLOR (y41b, GST_VIDEO_FORMAT_Y41B, memset); /* RGB, BGR, xRGB, xBGR, RGBx, BGRx */ @@ -518,7 +478,7 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \ return; \ } \ \ - BLENDLOOP(dest, src, src_stride, dest_stride, bpp * src_width, src_height, bpp * dest_width, b_alpha); \ + BLENDLOOP(dest, dest_stride, src, src_stride, b_alpha, src_width * bpp, src_height); \ } #define RGB_FILL_CHECKER_C(name, bpp, r, g, b) \ @@ -559,7 +519,7 @@ fill_color_##name (guint8 * dest, gint width, gint height, \ } \ } -#define MEMSET_RGB_C(name, bpp, r, g, b) \ +#define MEMSET_RGB_C(name, r, g, b) \ static inline void \ _memset_##name##_c (guint8* dest, gint red, gint green, gint blue, gint width) { \ gint j; \ @@ -568,31 +528,42 @@ _memset_##name##_c (guint8* dest, gint red, gint green, gint blue, gint width) { dest[r] = red; \ dest[g] = green; \ dest[b] = blue; \ - dest += bpp; \ + dest += 3; \ } \ } -RGB_BLEND (rgb_c, 3, memcpy, _blend_u8_c); +#define MEMSET_XRGB(name, r, g, b) \ +static inline void \ +_memset_##name (guint8* dest, gint red, gint green, gint blue, gint width) { \ + guint32 val; \ + \ + val = GUINT32_FROM_BE ((red << r) | (green << g) | (blue << b)); \ + orc_splat_u32 ((guint32 *) dest, val, width); \ +} + +#define _orc_memcpy_u32(dest,src,len) orc_memcpy_u32((guint32 *) dest, (const guint32 *) src, len/4) + +RGB_BLEND (rgb, 3, memcpy, orc_blend_u8); RGB_FILL_CHECKER_C (rgb, 3, 0, 1, 2); -MEMSET_RGB_C (rgb, 3, 0, 1, 2); +MEMSET_RGB_C (rgb, 0, 1, 2); RGB_FILL_COLOR (rgb_c, 3, _memset_rgb_c); -MEMSET_RGB_C (bgr, 3, 2, 1, 0); +MEMSET_RGB_C (bgr, 2, 1, 0); RGB_FILL_COLOR (bgr_c, 3, _memset_bgr_c); -RGB_BLEND (xrgb_c, 4, memcpy, _blend_u8_c); +RGB_BLEND 
(xrgb, 4, _orc_memcpy_u32, orc_blend_u8); RGB_FILL_CHECKER_C (xrgb, 4, 1, 2, 3); -MEMSET_RGB_C (xrgb, 4, 1, 2, 3); -RGB_FILL_COLOR (xrgb_c, 4, _memset_xrgb_c); +MEMSET_XRGB (xrgb, 16, 8, 0); /* bit shifts inside the big-endian word: bytes x,R,G,B put R at 16, G at 8, B at 0 */ +RGB_FILL_COLOR (xrgb, 4, _memset_xrgb); -MEMSET_RGB_C (xbgr, 4, 3, 2, 1); -RGB_FILL_COLOR (xbgr_c, 4, _memset_xbgr_c); +MEMSET_XRGB (xbgr, 0, 8, 16); /* bytes x,B,G,R put R at 0, G at 8, B at 16 */ +RGB_FILL_COLOR (xbgr, 4, _memset_xbgr); -MEMSET_RGB_C (rgbx, 4, 0, 1, 2); -RGB_FILL_COLOR (rgbx_c, 4, _memset_rgbx_c); +MEMSET_XRGB (rgbx, 24, 16, 8); +RGB_FILL_COLOR (rgbx, 4, _memset_rgbx); -MEMSET_RGB_C (bgrx, 4, 2, 1, 0); -RGB_FILL_COLOR (bgrx_c, 4, _memset_bgrx_c); +MEMSET_XRGB (bgrx, 8, 16, 24); +RGB_FILL_COLOR (bgrx, 4, _memset_bgrx); /* YUY2, YVYU, UYVY */ @@ -651,7 +622,7 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \ return; \ } \ \ - BLENDLOOP(dest, src, src_stride, dest_stride, 2 * src_width, src_height, 2 * dest_width, b_alpha); \ + BLENDLOOP(dest, dest_stride, src, src_stride, b_alpha, 2 * src_width, src_height); \ } #define PACKED_422_FILL_CHECKER_C(name, Y1, U, Y2, V) \ @@ -683,43 +654,32 @@ static void \ fill_color_##name (guint8 * dest, gint width, gint height, \ gint colY, gint colU, gint colV) \ { \ - gint i, j; \ - gint dest_add; \ + gint i; \ + gint dest_stride; \ + guint32 val; \ \ width = GST_ROUND_UP_2 (width); \ - dest_add = GST_ROUND_UP_4 (width * 2) - width * 2; \ + dest_stride = GST_ROUND_UP_4 (width * 2); \ width /= 2; \ \ + val = GUINT32_FROM_BE ((colY << Y1) | (colY << Y2) | (colU << U) | (colV << V)); \ + \ for (i = 0; i < height; i++) { \ - for (j = 0; j < width; j++) { \ - dest[Y1] = colY; \ - dest[Y2] = colY; \ - dest[U] = colU; \ - dest[V] = colV; \ - dest += 4; \ - } \ - dest += dest_add; \ + orc_splat_u32 ((guint32 *) dest, val, width); \ + dest += dest_stride; \ } \ } -PACKED_422_BLEND (yuy2_c, memcpy, _blend_u8_c); +PACKED_422_BLEND (yuy2, memcpy, orc_blend_u8); PACKED_422_FILL_CHECKER_C (yuy2, 0, 1, 2, 3); PACKED_422_FILL_CHECKER_C (uyvy, 1, 0, 3, 2); 
-PACKED_422_FILL_COLOR (yuy2_c, 0, 1, 2, 3); -PACKED_422_FILL_COLOR (yvyu_c, 0, 3, 2, 1); -PACKED_422_FILL_COLOR (uyvy_c, 1, 0, 3, 2); +PACKED_422_FILL_COLOR (yuy2, 24, 16, 8, 0); +PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16); +PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8); /* MMX Implementations */ #ifdef BUILD_X86_ASM -#define MEMSET_xRGB_MMX(name, r, g, b) \ -static inline void \ -_memset_##name##_mmx (guint8* dest, gint red, gint green, gint blue, gint width) { \ - guint32 val = (red << r) | (green << g) | (blue << b); \ - \ - _memset_u32_mmx ((guint32 *) dest, val, width); \ -} - #define A32 #define NAME_BLEND _blend_loop_argb_mmx #define A_OFF 0 @@ -736,60 +696,6 @@ _memset_##name##_mmx (guint8* dest, gint red, gint green, gint blue, gint width) BLEND_A32 (argb_mmx, _blend_loop_argb_mmx); BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx); - -#define A32_COLOR_LOOP_MMX(name, A, C1, C2, C3) \ -static inline void \ -_fill_color_loop_##name##_mmx (guint8 *dest, gint height, gint width, gint c1, gint c2, gint c3) { \ - guint32 val = (0xff << A) | (c1 << C1) | (c2 << C2) | (c3 << C3); \ - \ - _memset_u32_mmx ((guint32 *) dest, val, height*width); \ -} - -A32_COLOR_LOOP_MMX (argb, 0, 8, 16, 24); -A32_COLOR_LOOP_MMX (abgr, 0, 24, 16, 8); -A32_COLOR_LOOP_MMX (rgba, 24, 0, 8, 16); -A32_COLOR_LOOP_MMX (bgra, 24, 16, 8, 0); - -A32_COLOR (argb_mmx, TRUE, _fill_color_loop_argb_mmx); -A32_COLOR (bgra_mmx, TRUE, _fill_color_loop_bgra_mmx); -A32_COLOR (abgr_mmx, TRUE, _fill_color_loop_abgr_mmx); -A32_COLOR (rgba_mmx, TRUE, _fill_color_loop_rgba_mmx); -A32_COLOR (ayuv_mmx, FALSE, _fill_color_loop_argb_mmx); - -PLANAR_YUV_BLEND (mmx, i420, GST_VIDEO_FORMAT_I420, GST_ROUND_UP_2, - GST_ROUND_UP_2, _memcpy_u8_mmx, _blend_u8_mmx); -PLANAR_YUV_FILL_CHECKER (mmx, i420, GST_VIDEO_FORMAT_I420, _memset_u8_mmx); -PLANAR_YUV_FILL_COLOR (mmx, i420, GST_VIDEO_FORMAT_I420, _memset_u8_mmx); -PLANAR_YUV_FILL_COLOR (mmx, yv12, GST_VIDEO_FORMAT_YV12, _memset_u8_mmx); -PLANAR_YUV_BLEND (mmx, y444, 
GST_VIDEO_FORMAT_Y444, GST_ROUND_UP_1, - GST_ROUND_UP_1, _memcpy_u8_mmx, _blend_u8_mmx); -PLANAR_YUV_FILL_CHECKER (mmx, y444, GST_VIDEO_FORMAT_Y444, _memset_u8_mmx); -PLANAR_YUV_FILL_COLOR (mmx, y444, GST_VIDEO_FORMAT_Y444, _memset_u8_mmx); -PLANAR_YUV_BLEND (mmx, y42b, GST_VIDEO_FORMAT_Y42B, GST_ROUND_UP_2, - GST_ROUND_UP_1, _memcpy_u8_mmx, _blend_u8_mmx); -PLANAR_YUV_FILL_CHECKER (mmx, y42b, GST_VIDEO_FORMAT_Y42B, _memset_u8_mmx); -PLANAR_YUV_FILL_COLOR (mmx, y42b, GST_VIDEO_FORMAT_Y42B, _memset_u8_mmx); -PLANAR_YUV_BLEND (mmx, y41b, GST_VIDEO_FORMAT_Y41B, GST_ROUND_UP_4, - GST_ROUND_UP_1, _memcpy_u8_mmx, _blend_u8_mmx); -PLANAR_YUV_FILL_CHECKER (mmx, y41b, GST_VIDEO_FORMAT_Y41B, _memset_u8_mmx); -PLANAR_YUV_FILL_COLOR (mmx, y41b, GST_VIDEO_FORMAT_Y41B, _memset_u8_mmx); - -RGB_BLEND (rgb_mmx, 3, _memcpy_u8_mmx, _blend_u8_mmx); - -RGB_BLEND (xrgb_mmx, 4, _memcpy_u8_mmx, _blend_u8_mmx); -MEMSET_xRGB_MMX (xrgb, 16, 8, 0); -RGB_FILL_COLOR (xrgb_mmx, 4, _memset_xrgb_mmx); - -MEMSET_xRGB_MMX (xbgr, 0, 8, 16); -RGB_FILL_COLOR (xbgr_mmx, 4, _memset_xbgr_mmx); - -MEMSET_xRGB_MMX (rgbx, 24, 16, 8); -RGB_FILL_COLOR (rgbx_mmx, 4, _memset_rgbx_mmx); - -MEMSET_xRGB_MMX (bgrx, 8, 16, 24); -RGB_FILL_COLOR (bgrx_mmx, 4, _memset_bgrx_mmx); - -PACKED_422_BLEND (yuy2_mmx, _memcpy_u8_mmx, _blend_u8_mmx); #endif /* Init function */ @@ -857,77 +763,50 @@ gst_video_mixer_init_blend (void) gst_video_mixer_blend_argb = blend_argb_c; gst_video_mixer_blend_bgra = blend_bgra_c; - gst_video_mixer_blend_i420 = blend_i420_c; - gst_video_mixer_blend_y444 = blend_y444_c; - gst_video_mixer_blend_y42b = blend_y42b_c; - gst_video_mixer_blend_y41b = blend_y41b_c; - gst_video_mixer_blend_rgb = blend_rgb_c; - gst_video_mixer_blend_xrgb = blend_xrgb_c; - gst_video_mixer_blend_yuy2 = blend_yuy2_c; + gst_video_mixer_blend_i420 = blend_i420; + gst_video_mixer_blend_y444 = blend_y444; + gst_video_mixer_blend_y42b = blend_y42b; + gst_video_mixer_blend_y41b = blend_y41b; + gst_video_mixer_blend_rgb = 
blend_rgb; + gst_video_mixer_blend_xrgb = blend_xrgb; + gst_video_mixer_blend_yuy2 = blend_yuy2; gst_video_mixer_fill_checker_argb = fill_checker_argb_c; gst_video_mixer_fill_checker_bgra = fill_checker_bgra_c; gst_video_mixer_fill_checker_ayuv = fill_checker_ayuv_c; - gst_video_mixer_fill_checker_i420 = fill_checker_i420_c; - gst_video_mixer_fill_checker_y444 = fill_checker_y444_c; - gst_video_mixer_fill_checker_y42b = fill_checker_y42b_c; - gst_video_mixer_fill_checker_y41b = fill_checker_y41b_c; + gst_video_mixer_fill_checker_i420 = fill_checker_i420; + gst_video_mixer_fill_checker_y444 = fill_checker_y444; + gst_video_mixer_fill_checker_y42b = fill_checker_y42b; + gst_video_mixer_fill_checker_y41b = fill_checker_y41b; gst_video_mixer_fill_checker_rgb = fill_checker_rgb_c; gst_video_mixer_fill_checker_xrgb = fill_checker_xrgb_c; gst_video_mixer_fill_checker_yuy2 = fill_checker_yuy2_c; gst_video_mixer_fill_checker_uyvy = fill_checker_uyvy_c; - gst_video_mixer_fill_color_argb = fill_color_argb_c; - gst_video_mixer_fill_color_bgra = fill_color_bgra_c; - gst_video_mixer_fill_color_abgr = fill_color_abgr_c; - gst_video_mixer_fill_color_rgba = fill_color_rgba_c; - gst_video_mixer_fill_color_ayuv = fill_color_ayuv_c; - gst_video_mixer_fill_color_i420 = fill_color_i420_c; - gst_video_mixer_fill_color_yv12 = fill_color_yv12_c; - gst_video_mixer_fill_color_y444 = fill_color_y444_c; - gst_video_mixer_fill_color_y42b = fill_color_y42b_c; - gst_video_mixer_fill_color_y41b = fill_color_y41b_c; + gst_video_mixer_fill_color_argb = fill_color_argb; + gst_video_mixer_fill_color_bgra = fill_color_bgra; + gst_video_mixer_fill_color_abgr = fill_color_abgr; + gst_video_mixer_fill_color_rgba = fill_color_rgba; + gst_video_mixer_fill_color_ayuv = fill_color_ayuv; + gst_video_mixer_fill_color_i420 = fill_color_i420; + gst_video_mixer_fill_color_yv12 = fill_color_yv12; + gst_video_mixer_fill_color_y444 = fill_color_y444; + gst_video_mixer_fill_color_y42b = fill_color_y42b; + 
gst_video_mixer_fill_color_y41b = fill_color_y41b; gst_video_mixer_fill_color_rgb = fill_color_rgb_c; gst_video_mixer_fill_color_bgr = fill_color_bgr_c; - gst_video_mixer_fill_color_xrgb = fill_color_xrgb_c; - gst_video_mixer_fill_color_xbgr = fill_color_xbgr_c; - gst_video_mixer_fill_color_rgbx = fill_color_rgbx_c; - gst_video_mixer_fill_color_bgrx = fill_color_bgrx_c; - gst_video_mixer_fill_color_yuy2 = fill_color_yuy2_c; - gst_video_mixer_fill_color_yvyu = fill_color_yvyu_c; - gst_video_mixer_fill_color_uyvy = fill_color_uyvy_c; + gst_video_mixer_fill_color_xrgb = fill_color_xrgb; + gst_video_mixer_fill_color_xbgr = fill_color_xbgr; + gst_video_mixer_fill_color_rgbx = fill_color_rgbx; + gst_video_mixer_fill_color_bgrx = fill_color_bgrx; + gst_video_mixer_fill_color_yuy2 = fill_color_yuy2; + gst_video_mixer_fill_color_yvyu = fill_color_yvyu; + gst_video_mixer_fill_color_uyvy = fill_color_uyvy; #ifdef BUILD_X86_ASM if (cpu_flags & ORC_TARGET_MMX_MMX) { gst_video_mixer_blend_argb = blend_argb_mmx; gst_video_mixer_blend_bgra = blend_bgra_mmx; - gst_video_mixer_blend_i420 = blend_i420_mmx; - gst_video_mixer_blend_y444 = blend_y444_mmx; - gst_video_mixer_blend_y42b = blend_y42b_mmx; - gst_video_mixer_blend_y41b = blend_y41b_mmx; - gst_video_mixer_blend_rgb = blend_rgb_mmx; - gst_video_mixer_blend_xrgb = blend_xrgb_mmx; - gst_video_mixer_blend_yuy2 = blend_yuy2_mmx; - - gst_video_mixer_fill_checker_i420 = fill_checker_i420_mmx; - gst_video_mixer_fill_checker_y444 = fill_checker_y444_mmx; - gst_video_mixer_fill_checker_y42b = fill_checker_y42b_mmx; - gst_video_mixer_fill_checker_y41b = fill_checker_y41b_mmx; - - gst_video_mixer_fill_color_argb = fill_color_argb_mmx; - gst_video_mixer_fill_color_bgra = fill_color_bgra_mmx; - gst_video_mixer_fill_color_abgr = fill_color_abgr_mmx; - gst_video_mixer_fill_color_rgba = fill_color_rgba_mmx; - gst_video_mixer_fill_color_ayuv = fill_color_ayuv_mmx; - gst_video_mixer_fill_color_i420 = fill_color_i420_mmx; - 
gst_video_mixer_fill_color_yv12 = fill_color_yv12_mmx; - gst_video_mixer_fill_color_y444 = fill_color_y444_mmx; - gst_video_mixer_fill_color_y42b = fill_color_y42b_mmx; - gst_video_mixer_fill_color_y41b = fill_color_y41b_mmx; - gst_video_mixer_fill_color_xrgb = fill_color_xrgb_mmx; - gst_video_mixer_fill_color_xbgr = fill_color_xbgr_mmx; - gst_video_mixer_fill_color_rgbx = fill_color_rgbx_mmx; - gst_video_mixer_fill_color_bgrx = fill_color_bgrx_mmx; } #endif } diff --git a/gst/videomixer/blend_mmx.h b/gst/videomixer/blend_mmx.h index 192508d0cb..9c0f250aa6 100644 --- a/gst/videomixer/blend_mmx.h +++ b/gst/videomixer/blend_mmx.h @@ -122,229 +122,3 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width, } #endif -#ifdef GENERIC -static inline void -_memcpy_u8_mmx (guint8 * dest, const guint8 * src, guint count) -{ - /* *INDENT-OFF* */ - __asm__ __volatile__ ( - "1: \n\t" - "test $7, %0 \n\t" - "je 3f \n\t" - "2: \n\t" - "movb (%2), %%al \n\t" - "movb %%al, (%1) \n\t" - "inc %2 \n\t" - "inc %1 \n\t" - "dec %0 \n\t" - "test $7, %0 \n\t" - "jne 2b \n\t" - "3: \n\t" - "sar $3, %0 \n\t" - "cmp $0, %0 \n\t" - "je 5f \n\t" - "4: \n\t" - "movq (%2), %%mm0 \n\t" - "movq %%mm0, (%1) \n\t" - "add $8, %2 \n\t" - "add $8, %1 \n\t" - "dec %0 \n\t" - "jne 4b \n\t" - "5: \n\t" - "emms \n\t" - : "=r" (count), "=q" (dest), "=q" (src) - : "0" (count), "1" (dest), "2" (src) - : "memory", "al", - "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" -#ifdef __MMX__ - , "mm0" -#endif - ); - /* *INDENT-ON* */ -} - -static inline void -_memset_u8_mmx (guint8 * dest, guint val, guint count) -{ - guint8 val8 = val; - guint64 val64; - - val64 = (val << 24) | (val << 16) | (val << 8) | (val); - val64 = (val64 << 32) | val64; - - /* *INDENT-OFF* */ - __asm__ __volatile__ ( - "1: \n\t" - "test $7, %0 \n\t" - "je 3f \n\t" - "2: \n\t" - "movb %4, (%1) \n\t" - "inc %1 \n\t" - "dec %0 \n\t" - "test $7, %0 \n\t" - "jne 2b \n\t" - "3: \n\t" - "sar $3, %0 \n\t" - 
"cmp $0, %0 \n\t" - "je 5f \n\t" - "movq %5, %%mm0 \n\t" - "4: \n\t" - "movq %%mm0, (%1) \n\t" - "add $8, %1 \n\t" - "dec %0 \n\t" - "jne 4b \n\t" - "5: \n\t" - "emms \n\t" - : "=r" (count), "=q" (dest) - : "0" (count), "1" (dest), "q" (val8), "m" (val64) - : "memory", - "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" -#ifdef __MMX__ - , "mm0" -#endif - ); - /* *INDENT-ON* */ -} - -static inline void -_memset_u32_mmx (guint32 * dest, guint32 val, guint count) -{ - guint64 val64 = val; - - val64 |= (val64 << 32); - - /* *INDENT-OFF* */ - __asm__ __volatile__ ( - "1: \n\t" - "test $1, %0 \n\t" - "je 3f \n\t" - "2: \n\t" - "movl %4, (%1) \n\t" - "add $4, %1 \n\t" - "dec %0 \n\t" - "test $1, %0 \n\t" - "jne 2b \n\t" - "3: \n\t" - "sar $1, %0 \n\t" - "cmp $0, %0 \n\t" - "je 5f \n\t" - "movq %5, %%mm0 \n\t" - "4: \n\t" - "movq %%mm0, (%1) \n\t" - "add $8, %1 \n\t" - "dec %0 \n\t" - "jne 4b \n\t" - "5: \n\t" - "emms \n\t" - : "=r" (count), "=r" (dest) - : "0" (count), "1" (dest), "r" (val), "m" (val64) - : "memory", - "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" -#ifdef __MMX__ - , "mm0" -#endif - ); - /* *INDENT-ON* */ -} - -static inline void -_blend_u8_mmx (guint8 * dest, const guint8 * src, - gint src_stride, gint dest_stride, gint src_width, gint src_height, - gint dest_width, gint s_alpha) -{ - gint i; - gint src_add = src_stride - src_width; - gint dest_add = dest_stride - src_width; - - for (i = 0; i < src_height; i++) { - /* Do first 3 "odd" pixels */ - while ((src_width & 0x03)) { - *dest = BLEND (*dest, *src, s_alpha); - dest++; - src++; - src_width--; - } - - /* (P1 * (256 - A) + (P2 * A)) / 256 - * => (P1 * 256 - P1 * A + P2 * A) / 256 - * => (P1 * 256 + A * (P2 - P1) / 256 - * => P1 + (A * (P2 - P1)) / 256 - */ - /* *INDENT-OFF* */ - __asm__ __volatile__ ( - " mov %4 , %%eax \n\t" /* eax = s_alpha */ - " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */ - " punpcklwd %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 00 00 aa 00 aa, 
alpha scale factor */ - " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 aa 00 aa 00 aa 00 aa */ - - " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 00 00 00 */ - - " movl %5 , %%ecx \n\t" /* ecx = src_width */ - - "1: \n\t" - " test $7 , %%ecx \n\t" - " je 2f \n\t" - - /* do first 4 "odd" bytes */ - " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */ - " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */ - " punpcklbw %%mm7 , %%mm2 \n\t" - " punpcklbw %%mm7 , %%mm1 \n\t" - " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */ - " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */ - " psllw $8 , %%mm1 \n\t" /* scale up */ - " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */ - " psrlw $8 , %%mm2 \n\t" /* scale down */ - " packuswb %%mm2 , %%mm2 \n\t" - " movd %%mm2 , (%3) \n\t" /* dest = mm1 */ - " add $4 , %1 \n\t" - " add $4 , %0 \n\t" - - "2: \n\t" - " sar $3 , %%ecx \n\t" /* prepare for 8 bytes per loop */ - " cmp $0 , %%ecx \n\t" - " je 4f \n\t" - - "3: \n\t" - /* do even pixels */ - " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */ - " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */ - " movq %%mm2 , %%mm4 \n\t" - " movq %%mm1 , %%mm3 \n\t" - " punpcklbw %%mm7 , %%mm2 \n\t" - " punpckhbw %%mm7 , %%mm4 \n\t" - " punpcklbw %%mm7 , %%mm1 \n\t" - " punpckhbw %%mm7 , %%mm3 \n\t" - " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */ - " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */ - " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */ - " pmullw %%mm6 , %%mm4 \n\t" /* mm2 = a * mm2 */ - " psllw $8 , %%mm1 \n\t" /* scale up */ - " psllw $8 , %%mm3 \n\t" /* scale up */ - " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */ - " paddw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 + mm3 */ - " psrlw $8 , %%mm2 \n\t" /* scale down */ - " psrlw $8 , %%mm4 \n\t" /* scale down */ - " packuswb %%mm4 , %%mm2 \n\t" - " movq %%mm2 , (%3) \n\t" - " add $8 , %0 \n\t" - " add $8 , %1 \n\t" - " dec %%ecx \n\t" - " jne 3b \n\t" - - 
"4: \n\t" - :"=r" (src), "=r" (dest) - :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width) - :"%eax", "%ecx", "memory", - "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" -#ifdef __MMX__ - , "mm1", "mm2", "mm3", "mm4", "mm6", "mm7" -#endif - ); - /* *INDENT-ON* */ - src += src_add; - dest += dest_add; - } - __asm__ __volatile__ ("emms"); -} -#endif diff --git a/gst/videomixer/blendorc-dist.c b/gst/videomixer/blendorc-dist.c new file mode 100644 index 0000000000..bf7b32c454 --- /dev/null +++ b/gst/videomixer/blendorc-dist.c @@ -0,0 +1,423 @@ + +/* autogenerated from blendorc.orc */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifndef DISABLE_ORC +#include +#endif +#include + +#ifndef _ORC_INTEGER_TYPEDEFS_ +#define _ORC_INTEGER_TYPEDEFS_ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#include +typedef int8_t orc_int8; +typedef int16_t orc_int16; +typedef int32_t orc_int32; +typedef int64_t orc_int64; +typedef uint8_t orc_uint8; +typedef uint16_t orc_uint16; +typedef uint32_t orc_uint32; +typedef uint64_t orc_uint64; +#elif defined(_MSC_VER) +typedef signed __int8 orc_int8; +typedef signed __int16 orc_int16; +typedef signed __int32 orc_int32; +typedef signed __int64 orc_int64; +typedef unsigned __int8 orc_uint8; +typedef unsigned __int16 orc_uint16; +typedef unsigned __int32 orc_uint32; +typedef unsigned __int64 orc_uint64; +#else +#include +typedef signed char orc_int8; +typedef short orc_int16; +typedef int orc_int32; +typedef unsigned char orc_uint8; +typedef unsigned short orc_uint16; +typedef unsigned int orc_uint32; +#if INT_MAX == LONG_MAX +typedef long long orc_int64; +typedef unsigned long long orc_uint64; +#else +typedef long orc_int64; +typedef unsigned long orc_uint64; +#endif +#endif +typedef union +{ + orc_int32 i; + float f; +} orc_union32; +typedef union +{ + orc_int64 i; + double f; +} orc_union64; +#endif + +void orc_splat_u32 (guint32 * d1, int p1, int n); +void orc_memcpy_u32 (guint32 * d1, const 
guint32 * s1, int n); +void orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride, + int p1, int n, int m); + + +/* begin Orc C target preamble */ +#define ORC_CLAMP(x,a,b) ((x)<(a) ? (a) : ((x)>(b) ? (b) : (x))) +#define ORC_ABS(a) ((a)<0 ? -(a) : (a)) +#define ORC_MIN(a,b) ((a)<(b) ? (a) : (b)) +#define ORC_MAX(a,b) ((a)>(b) ? (a) : (b)) +#define ORC_SB_MAX 127 +#define ORC_SB_MIN (-1-ORC_SB_MAX) +#define ORC_UB_MAX 255 +#define ORC_UB_MIN 0 +#define ORC_SW_MAX 32767 +#define ORC_SW_MIN (-1-ORC_SW_MAX) +#define ORC_UW_MAX 65535 +#define ORC_UW_MIN 0 +#define ORC_SL_MAX 2147483647 +#define ORC_SL_MIN (-1-ORC_SL_MAX) +#define ORC_UL_MAX 4294967295U +#define ORC_UL_MIN 0 +#define ORC_CLAMP_SB(x) ORC_CLAMP(x,ORC_SB_MIN,ORC_SB_MAX) +#define ORC_CLAMP_UB(x) ORC_CLAMP(x,ORC_UB_MIN,ORC_UB_MAX) +#define ORC_CLAMP_SW(x) ORC_CLAMP(x,ORC_SW_MIN,ORC_SW_MAX) +#define ORC_CLAMP_UW(x) ORC_CLAMP(x,ORC_UW_MIN,ORC_UW_MAX) +#define ORC_CLAMP_SL(x) ORC_CLAMP(x,ORC_SL_MIN,ORC_SL_MAX) +#define ORC_CLAMP_UL(x) ORC_CLAMP(x,ORC_UL_MIN,ORC_UL_MAX) +#define ORC_SWAP_W(x) ((((x)&0xff)<<8) | (((x)&0xff00)>>8)) +#define ORC_SWAP_L(x) ((((x)&0xff)<<24) | (((x)&0xff00)<<8) | (((x)&0xff0000)>>8) | (((x)&0xff000000)>>24)) +#define ORC_PTR_OFFSET(ptr,offset) ((void *)(((unsigned char *)(ptr)) + (offset))) +/* end Orc C target preamble */ + + + +/* orc_splat_u32 */ +#ifdef DISABLE_ORC +void +orc_splat_u32 (guint32 * d1, int p1, int n) +{ + int i; + orc_union32 var0; + orc_union32 *ptr0; + const orc_union32 var24 = { p1 }; + + ptr0 = (orc_union32 *) d1; + + for (i = 0; i < n; i++) { + /* 0: copyl */ + var0.i = var24.i; + *ptr0 = var0; + ptr0++; + } + +} + +#else +static void +_backup_orc_splat_u32 (OrcExecutor * ex) +{ + int i; + int n = ex->n; + orc_union32 var0; + orc_union32 *ptr0; + const orc_union32 var24 = *(orc_union32 *) (ex->params + 24); + + ptr0 = (orc_union32 *) ex->arrays[0]; + + for (i = 0; i < n; i++) { + /* 0: copyl */ + var0.i = var24.i; + *ptr0 = var0; + ptr0++; 
+ } + +} + +void +orc_splat_u32 (guint32 * d1, int p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static int p_inited = 0; + static OrcProgram *p = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcCompileResult result; + + p = orc_program_new (); + orc_program_set_name (p, "orc_splat_u32"); + orc_program_set_backup_function (p, _backup_orc_splat_u32); + orc_program_add_destination (p, 4, "d1"); + orc_program_add_parameter (p, 4, "p1"); + + orc_program_append (p, "copyl", ORC_VAR_D1, ORC_VAR_P1, ORC_VAR_D1); + + result = orc_program_compile (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + ex->program = p; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->params[ORC_VAR_P1] = p1; + + func = p->code_exec; + func (ex); +} +#endif + + +/* orc_memcpy_u32 */ +#ifdef DISABLE_ORC +void +orc_memcpy_u32 (guint32 * d1, const guint32 * s1, int n) +{ + int i; + orc_union32 var0; + orc_union32 *ptr0; + orc_union32 var4; + const orc_union32 *ptr4; + + ptr0 = (orc_union32 *) d1; + ptr4 = (orc_union32 *) s1; + + for (i = 0; i < n; i++) { + var4 = *ptr4; + ptr4++; + /* 0: copyl */ + var0.i = var4.i; + *ptr0 = var0; + ptr0++; + } + +} + +#else +static void +_backup_orc_memcpy_u32 (OrcExecutor * ex) +{ + int i; + int n = ex->n; + orc_union32 var0; + orc_union32 *ptr0; + orc_union32 var4; + const orc_union32 *ptr4; + + ptr0 = (orc_union32 *) ex->arrays[0]; + ptr4 = (orc_union32 *) ex->arrays[4]; + + for (i = 0; i < n; i++) { + var4 = *ptr4; + ptr4++; + /* 0: copyl */ + var0.i = var4.i; + *ptr0 = var0; + ptr0++; + } + +} + +void +orc_memcpy_u32 (guint32 * d1, const guint32 * s1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static int p_inited = 0; + static OrcProgram *p = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcCompileResult result; + + p = orc_program_new (); + orc_program_set_name (p, "orc_memcpy_u32"); + orc_program_set_backup_function (p, 
_backup_orc_memcpy_u32); + orc_program_add_destination (p, 4, "d1"); + orc_program_add_source (p, 4, "s1"); + + orc_program_append (p, "copyl", ORC_VAR_D1, ORC_VAR_S1, ORC_VAR_D1); + + result = orc_program_compile (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + ex->program = p; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->arrays[ORC_VAR_S1] = (void *) s1; + + func = p->code_exec; + func (ex); +} +#endif + + +/* orc_blend_u8 */ +#ifdef DISABLE_ORC +void +orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride, + int p1, int n, int m) +{ + int i; + int j; + orc_int8 var0; + orc_int8 *ptr0; + orc_int8 var4; + const orc_int8 *ptr4; + const orc_int8 var16 = 8; + const orc_int16 var24 = p1; + orc_int16 var32; + orc_int16 var33; + orc_int16 var34; + orc_int16 var35; + orc_int16 var36; + orc_int16 var37; + orc_int16 var38; + + for (j = 0; j < m; j++) { + ptr0 = ORC_PTR_OFFSET (d1, d1_stride * j); + ptr4 = ORC_PTR_OFFSET (s1, s1_stride * j); + + for (i = 0; i < n; i++) { + var0 = *ptr0; + var4 = *ptr4; + ptr4++; + /* 0: convubw */ + var32 = (orc_uint8) var0; + /* 1: convubw */ + var33 = (orc_uint8) var4; + /* 2: subw */ + var34 = var33 - var32; + /* 3: mullw */ + var35 = (var34 * var24) & 0xffff; + /* 4: shlw */ + var36 = var32 << var16; + /* 5: addw */ + var37 = var36 + var35; + /* 6: shruw */ + var38 = ((orc_uint16) var37) >> var16; + /* 7: convsuswb */ + var0 = ORC_CLAMP_UB (var38); + *ptr0 = var0; + ptr0++; + } + } + +} + +#else +static void +_backup_orc_blend_u8 (OrcExecutor * ex) +{ + int i; + int j; + int n = ex->n; + int m = ex->params[ORC_VAR_A1]; + orc_int8 var0; + orc_int8 *ptr0; + orc_int8 var4; + const orc_int8 *ptr4; + const orc_int8 var16 = 8; + const orc_int16 var24 = ex->params[24]; + orc_int16 var32; + orc_int16 var33; + orc_int16 var34; + orc_int16 var35; + orc_int16 var36; + orc_int16 var37; + orc_int16 var38; + + for (j = 0; j < m; j++) { + ptr0 = ORC_PTR_OFFSET (ex->arrays[0], ex->params[0] * j); + ptr4 = 
ORC_PTR_OFFSET (ex->arrays[4], ex->params[4] * j); + + for (i = 0; i < n; i++) { + var0 = *ptr0; + var4 = *ptr4; + ptr4++; + /* 0: convubw */ + var32 = (orc_uint8) var0; + /* 1: convubw */ + var33 = (orc_uint8) var4; + /* 2: subw */ + var34 = var33 - var32; + /* 3: mullw */ + var35 = (var34 * var24) & 0xffff; + /* 4: shlw */ + var36 = var32 << var16; + /* 5: addw */ + var37 = var36 + var35; + /* 6: shruw */ + var38 = ((orc_uint16) var37) >> var16; + /* 7: convsuswb */ + var0 = ORC_CLAMP_UB (var38); + *ptr0 = var0; + ptr0++; + } + } + +} + +void +orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride, + int p1, int n, int m) +{ + OrcExecutor _ex, *ex = &_ex; + static int p_inited = 0; + static OrcProgram *p = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcCompileResult result; + + p = orc_program_new (); + orc_program_set_2d (p); + orc_program_set_name (p, "orc_blend_u8"); + orc_program_set_backup_function (p, _backup_orc_blend_u8); + orc_program_add_destination (p, 1, "d1"); + orc_program_add_source (p, 1, "s1"); + orc_program_add_constant (p, 1, 8, "c1"); + orc_program_add_parameter (p, 2, "p1"); + orc_program_add_temporary (p, 2, "t1"); + orc_program_add_temporary (p, 2, "t2"); + + orc_program_append (p, "convubw", ORC_VAR_T1, ORC_VAR_D1, ORC_VAR_D1); + orc_program_append (p, "convubw", ORC_VAR_T2, ORC_VAR_S1, ORC_VAR_D1); + orc_program_append (p, "subw", ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_T1); + orc_program_append (p, "mullw", ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_P1); + orc_program_append (p, "shlw", ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_C1); + orc_program_append (p, "addw", ORC_VAR_T2, ORC_VAR_T1, ORC_VAR_T2); + orc_program_append (p, "shruw", ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_C1); + orc_program_append (p, "convsuswb", ORC_VAR_D1, ORC_VAR_T2, ORC_VAR_D1); + + result = orc_program_compile (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + ex->program = p; + + ex->n = n; + ORC_EXECUTOR_M 
(ex) = m; + ex->arrays[ORC_VAR_D1] = d1; + ex->params[ORC_VAR_D1] = d1_stride; + ex->arrays[ORC_VAR_S1] = (void *) s1; + ex->params[ORC_VAR_S1] = s1_stride; + ex->params[ORC_VAR_P1] = p1; + + func = p->code_exec; + func (ex); +} +#endif diff --git a/gst/videomixer/blendorc-dist.h b/gst/videomixer/blendorc-dist.h new file mode 100644 index 0000000000..4ddb04c7de --- /dev/null +++ b/gst/videomixer/blendorc-dist.h @@ -0,0 +1,63 @@ + +/* autogenerated from blendorc.orc */ + +#ifndef _BLENDORC_H_ +#define _BLENDORC_H_ + +#include <glib.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ORC_INTEGER_TYPEDEFS_ +#define _ORC_INTEGER_TYPEDEFS_ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#include <stdint.h> +typedef int8_t orc_int8; +typedef int16_t orc_int16; +typedef int32_t orc_int32; +typedef int64_t orc_int64; +typedef uint8_t orc_uint8; +typedef uint16_t orc_uint16; +typedef uint32_t orc_uint32; +typedef uint64_t orc_uint64; +#elif defined(_MSC_VER) +typedef signed __int8 orc_int8; +typedef signed __int16 orc_int16; +typedef signed __int32 orc_int32; +typedef signed __int64 orc_int64; +typedef unsigned __int8 orc_uint8; +typedef unsigned __int16 orc_uint16; +typedef unsigned __int32 orc_uint32; +typedef unsigned __int64 orc_uint64; +#else +#include <limits.h> +typedef signed char orc_int8; +typedef short orc_int16; +typedef int orc_int32; +typedef unsigned char orc_uint8; +typedef unsigned short orc_uint16; +typedef unsigned int orc_uint32; +#if INT_MAX == LONG_MAX +typedef long long orc_int64; +typedef unsigned long long orc_uint64; +#else +typedef long orc_int64; +typedef unsigned long orc_uint64; +#endif +#endif +typedef union { orc_int32 i; float f; } orc_union32; +typedef union { orc_int64 i; double f; } orc_union64; +#endif + +void orc_splat_u32 (guint32 * d1, int p1, int n); +void orc_memcpy_u32 (guint32 * d1, const guint32 * s1, int n); +void orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride, int p1, int n, int m); + +#ifdef __cplusplus +}
+#endif + +#endif + diff --git a/gst/videomixer/blendorc.orc b/gst/videomixer/blendorc.orc new file mode 100644 index 0000000000..55e20bba1d --- /dev/null +++ b/gst/videomixer/blendorc.orc @@ -0,0 +1,30 @@ +.function orc_splat_u32 +.dest 4 d1 guint32 +.param 4 p1 guint32 + +copyl d1, p1 + +.function orc_memcpy_u32 +.dest 4 d1 guint32 +.source 4 s1 guint32 + +copyl d1, s1 + +.function orc_blend_u8 +.flags 2d +.dest 1 d1 guint8 +.source 1 s1 guint8 +.param 2 p1 +.temp 2 t1 +.temp 2 t2 +.const 1 c1 8 + +convubw t1, d1 +convubw t2, s1 +subw t2, t2, t1 +mullw t2, t2, p1 +shlw t1, t1, c1 +addw t2, t1, t2 +shruw t2, t2, c1 +convsuswb d1, t2 +