GST-Tensordecoder-ov_ep/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter.cu

/* GStreamer
 * Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef __NVCC__
struct ColorMatrix
{
  float CoeffX[3];
  float CoeffY[3];
  float CoeffZ[3];
  float Offset[3];
  float Min[3];
  float Max[3];
};

struct ConstBuffer
{
  ColorMatrix matrix;
  int width;
  int height;
  int left;
  int top;
  int right;
  int bottom;
  int view_width;
  int view_height;
  float border_x;
  float border_y;
  float border_z;
  float border_w;
  int fill_border;
  int video_direction;
  float alpha;
  int do_blend;
  int do_convert;
};

__device__ inline float
dot (const float coeff[3], float3 val)
{
  return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;
}

__device__ inline float
clamp (float val, float min_val, float max_val)
{
  return max (min_val, min (val, max_val));
}

__device__ inline float3
clamp3 (float3 val, const float min_val[3], const float max_val[3])
{
  return make_float3 (clamp (val.x, min_val[0], max_val[0]),
      clamp (val.y, min_val[1], max_val[2]),
      clamp (val.z, min_val[1], max_val[2]));
}

__device__ inline unsigned char
scale_to_2bits (float val)
{
  return (unsigned short) __float2int_rz (val * 3.0);
}

__device__ inline unsigned char
scale_to_uchar (float val)
{
  return (unsigned char) __float2int_rz (val * 255.0);
}

__device__ inline unsigned short
scale_to_ushort (float val)
{
  return (unsigned short) __float2int_rz (val * 65535.0);
}

__device__ inline unsigned short
scale_to_10bits (float val)
{
  return (unsigned short) __float2int_rz (val * 1023.0);
}

__device__ inline unsigned short
scale_to_12bits (float val)
{
  return (unsigned short) __float2int_rz (val * 4095.0);
}

__device__ inline unsigned char
blend_uchar (unsigned char dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  float src_val = src * src_alpha;
  float dst_val = __int2float_rz (dst) / 255.0 * (1.0 - src_alpha);
  return scale_to_uchar(clamp(src_val + dst_val, 0, 1.0));
}

__device__ inline unsigned short
blend_ushort (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  float src_val = src * src_alpha;
  float dst_val = __int2float_rz (dst) / 65535.0 * (1.0 - src_alpha);
  return scale_to_ushort(clamp(src_val + dst_val, 0, 1.0));
}

__device__ inline unsigned short
blend_10bits (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  float src_val = src * src_alpha;
  float dst_val = __int2float_rz (dst) / 1023.0 * (1.0 - src_alpha);
  return scale_to_10bits(clamp(src_val + dst_val, 0, 1.0));
}

__device__ inline unsigned short
blend_12bits (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  float src_val = src * src_alpha;
  float dst_val = __int2float_rz (dst) / 4095.0 * (1.0 - src_alpha);
  return scale_to_12bits(clamp(src_val + dst_val, 0, 1.0));
}

struct IConverter
{
  __device__ virtual float3
  Execute (float3 sample, const ColorMatrix * matrix) = 0;
};

struct ConvertSimple : public IConverter
{
  __device__ float3
  Execute (float3 sample, const ColorMatrix * matrix)
  {
    float3 out;
    out.x = dot (matrix->CoeffX, sample);
    out.y = dot (matrix->CoeffY, sample);
    out.z = dot (matrix->CoeffZ, sample);
    out.x += matrix->Offset[0];
    out.y += matrix->Offset[1];
    out.z += matrix->Offset[2];
    return clamp3 (out, matrix->Min, matrix->Max);
  }
};

struct ISampler
{
  __device__ virtual float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y) = 0;
};

struct SampleI420 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float u = tex2D<float>(tex1, x, y);
    float v = tex2D<float>(tex2, x, y);
    return make_float4 (luma, u, v, 1);
  }
};

struct SampleYV12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float u = tex2D<float>(tex2, x, y);
    float v = tex2D<float>(tex1, x, y);
    return make_float4 (luma, u, v, 1);
  }
};

struct SampleI420_10 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float u = tex2D<float>(tex1, x, y);
    float v = tex2D<float>(tex2, x, y);
    /* (1 << 6) to scale [0, 1.0) range */
    return make_float4 (luma * 64, u * 64, v * 64, 1);
  }
};

struct SampleI420_12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float u = tex2D<float>(tex1, x, y);
    float v = tex2D<float>(tex2, x, y);
    /* (1 << 4) to scale [0, 1.0) range */
    return make_float4 (luma * 16, u * 16, v * 16, 1);
  }
};

struct SampleNV12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float2 uv = tex2D<float2>(tex1, x, y);
    return make_float4 (luma, uv.x, uv.y, 1);
  }
};

struct SampleNV21 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float luma = tex2D<float>(tex0, x, y);
    float2 vu = tex2D<float2>(tex1, x, y);
    return make_float4 (luma, vu.y, vu.x, 1);
  }
};

struct SampleRGBA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    return tex2D<float4>(tex0, x, y);
  }
};

struct SampleBGRA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 bgra = tex2D<float4>(tex0, x, y);
    return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);
  }
};

struct SampleRGBx : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 rgbx = tex2D<float4>(tex0, x, y);
    rgbx.w = 1;
    return rgbx;
  }
};

struct SampleBGRx : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 bgrx = tex2D<float4>(tex0, x, y);
    return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);
  }
};

struct SampleARGB : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
   float4 argb = tex2D<float4>(tex0, x, y);
   return make_float4 (argb.y, argb.z, argb.w, argb.x);
  }
};

struct SampleABGR : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
   float4 abgr = tex2D<float4>(tex0, x, y);
   return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);
  }
};

struct SampleRGBP : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float r = tex2D<float>(tex0, x, y);
    float g = tex2D<float>(tex1, x, y);
    float b = tex2D<float>(tex2, x, y);
    return make_float4 (r, g, b, 1);
  }
};

struct SampleBGRP : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float b = tex2D<float>(tex0, x, y);
    float g = tex2D<float>(tex1, x, y);
    float r = tex2D<float>(tex2, x, y);
    return make_float4 (r, g, b, 1);
  }
};

struct SampleGBR : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float g = tex2D<float>(tex0, x, y);
    float b = tex2D<float>(tex1, x, y);
    float r = tex2D<float>(tex2, x, y);
    return make_float4 (r, g, b, 1);
  }
};

struct SampleGBR_10 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float g = tex2D<float>(tex0, x, y);
    float b = tex2D<float>(tex1, x, y);
    float r = tex2D<float>(tex2, x, y);
    /* (1 << 6) to scale [0, 1.0) range */
    return make_float4 (r * 64, g * 64, b * 64, 1);
  }
};

struct SampleGBR_12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float g = tex2D<float>(tex0, x, y);
    float b = tex2D<float>(tex1, x, y);
    float r = tex2D<float>(tex2, x, y);
    /* (1 << 4) to scale [0, 1.0) range */
    return make_float4 (r * 16, g * 16, b * 16, 1);
  }
};

struct SampleGBRA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float g = tex2D<float>(tex0, x, y);
    float b = tex2D<float>(tex1, x, y);
    float r = tex2D<float>(tex2, x, y);
    float a = tex2D<float>(tex3, x, y);
    return make_float4 (r, g, b, a);
  }
};

struct SampleVUYA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 vuya = tex2D<float4>(tex0, x, y);
    return make_float4 (vuya.z, vuya.y, vuya.x, vuya.w);
  }
};

struct IOutput
{
  __device__ virtual void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1) = 0;

  __device__ virtual void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
        unsigned char * dst3, float4 sample, int x, int y, int stride0,
        int stride1) = 0;
};

struct OutputI420 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x / 2 + (y / 2) * stride1;
      dst1[pos] = scale_to_uchar (sample.y);
      dst2[pos] = scale_to_uchar (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x / 2 + (y / 2) * stride1;
      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
      dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
    }
  }
};

struct OutputYV12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x / 2 + (y / 2) * stride1;
      dst1[pos] = scale_to_uchar (sample.z);
      dst2[pos] = scale_to_uchar (sample.y);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x / 2 + (y / 2) * stride1;
      dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
      dst2[pos] = blend_uchar (dst2[pos], sample.y, sample.w);
    }
  }
};

struct OutputNV12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x + (y / 2) * stride1;
      dst1[pos] = scale_to_uchar (sample.y);
      dst1[pos + 1] = scale_to_uchar (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x + (y / 2) * stride1;
      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
      dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.z, sample.w);
    }
  }
};

struct OutputNV21 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x + (y / 2) * stride1;
      dst1[pos] = scale_to_uchar (sample.z);
      dst1[pos + 1] = scale_to_uchar (sample.y);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x + (y / 2) * stride1;
      dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
      dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.y, sample.w);
    }
  }
};

struct OutputP010 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x * 2 + (y / 2) * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);
      *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_ushort (*target, sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x * 2 + (y / 2) * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_ushort (*target, sample.y, sample.w);
      target = (unsigned short *) &dst1[pos + 2];
      *target = blend_ushort (*target, sample.z, sample.w);
    }
  }
};

struct OutputI420_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x + (y / 2) * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x * 2 + (y / 2) * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_10bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_10bits (*target, sample.z, sample.w);
    }
  }
};

struct OutputI420_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      unsigned int pos = x + (y / 2) * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      pos = x * 2 + (y / 2) * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_12bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_12bits (*target, sample.z, sample.w);
    }
  }
};

struct OutputY444 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = scale_to_uchar (sample.x);
    dst1[pos] = scale_to_uchar (sample.y);
    dst2[pos] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
    dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
  }
};

struct OutputY444_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.x);
    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_10bits (*target, sample.y, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_10bits (*target, sample.z, sample.w);
  }
};

struct OutputY444_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.x);
    *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
    *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_12bits (*target, sample.y, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_12bits (*target, sample.z, sample.w);
  }
};

struct OutputY444_16 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);
    *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);
    *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_ushort (*target, sample.x, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_ushort (*target, sample.y, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_ushort (*target, sample.z, sample.w);
  }
};

struct OutputRGBA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.x);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.z);
    dst0[pos + 3] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
  }
};

struct OutputRGBx : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.x);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.z);
    dst0[pos + 3] = 255;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
    dst0[pos + 3] = 255;
  }
};

struct OutputBGRA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.z);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.x);
    dst0[pos + 3] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
  }
};

struct OutputBGRx : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.z);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.x);
    dst0[pos + 3] = 255;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
    dst0[pos + 3] = 255;
  }
};

struct OutputARGB : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.w);
    dst0[pos + 1] = scale_to_uchar (sample.x);
    dst0[pos + 2] = scale_to_uchar (sample.y);
    dst0[pos + 3] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.x, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);
    dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.z, sample.w);
  }
};

struct OutputABGR : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.w);
    dst0[pos + 1] = scale_to_uchar (sample.z);
    dst0[pos + 2] = scale_to_uchar (sample.y);
    dst0[pos + 3] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.z, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);
    dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.x, sample.w);
  }
};

struct OutputRGB : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 3 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.x);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 3 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
  }
};

struct OutputBGR : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 3 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.z);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 3 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
  }
};

__device__ inline ushort3
unpack_rgb10a2 (unsigned int val)
{
  unsigned short r, g, b;
  r = (val & 0x3ff);
  r = (r << 6) | (r >> 4);
  g = ((val >> 10) & 0x3ff);
  g = (g << 6) | (g >> 4);
  b = ((val >> 20) & 0x3ff);
  b = (b << 6) | (b >> 4);
  return make_ushort3 (r, g, b);
}

struct OutputRGB10A2 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;
    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
    ushort3 val = unpack_rgb10a2 (*target);
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w));
    packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;
    packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w)) << 20;
    *target = packed_rgb;
  }
};

__device__ inline ushort3
unpack_bgr10a2 (unsigned int val)
{
  unsigned short r, g, b;
  b = (val & 0x3ff);
  b = (b << 6) | (b >> 4);
  g = ((val >> 10) & 0x3ff);
  g = (g << 6) | (g >> 4);
  r = ((val >> 20) & 0x3ff);
  r = (r << 6) | (r >> 4);
  return make_ushort3 (r, g, b);
}

struct OutputBGR10A2 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));
    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
    ushort3 val = unpack_bgr10a2 (*target);
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w)) << 20;
    packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;
    packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w));
    *target = packed_rgb;
  }
};

struct OutputY42B : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);
    if (x % 2 == 0) {
      unsigned int pos = x / 2 + y * stride1;
      dst1[pos] = scale_to_uchar (sample.y);
      dst2[pos] = scale_to_uchar (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    if (x % 2 == 0) {
      pos = x / 2 + y * stride1;
      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
      dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
    }
  }
};

struct OutputI422_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
    if (x % 2 == 0) {
      unsigned int pos = x + y * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
    if (x % 2 == 0) {
      pos = x / 2 + y * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_10bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_10bits (*target, sample.z, sample.w);
    }
  }
};

struct OutputI422_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
    if (x % 2 == 0) {
      unsigned int pos = x + y * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
    if (x % 2 == 0) {
      pos = x / 2 + y * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_12bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_12bits (*target, sample.z, sample.w);
    }
  }
};

struct OutputRGBP : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = scale_to_uchar (sample.x);
    dst1[pos] = scale_to_uchar (sample.y);
    dst2[pos] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
    dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
  }
};

struct OutputBGRP : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = scale_to_uchar (sample.z);
    dst1[pos] = scale_to_uchar (sample.y);
    dst2[pos] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
  }
};

struct OutputGBR : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = scale_to_uchar (sample.y);
    dst1[pos] = scale_to_uchar (sample.z);
    dst2[pos] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);
    dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
  }
};

struct OutputGBR_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.y);
    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.z);
    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.y, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_10bits (*target, sample.z, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
  }
};

struct OutputGBR_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.y);
    *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.z);
    *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.y, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_12bits (*target, sample.z, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
  }
};

struct OutputGBR_16 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.y);
    *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.z);
    *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_ushort (*target, sample.y, sample.w);
    target = (unsigned short *) &dst1[pos];
    *target = blend_ushort (*target, sample.z, sample.w);
    target = (unsigned short *) &dst2[pos];
    *target = blend_ushort (*target, sample.x, sample.w);
  }
};

struct OutputGBRA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = scale_to_uchar (sample.y);
    dst1[pos] = scale_to_uchar (sample.z);
    dst2[pos] = scale_to_uchar (sample.x);
    dst3[pos] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);
    dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
    dst3[pos] = blend_uchar (dst3[pos], 1.0, sample.w);
  }
};

struct OutputVUYA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = scale_to_uchar (sample.z);
    dst0[pos + 1] = scale_to_uchar (sample.y);
    dst0[pos + 2] = scale_to_uchar (sample.x);
    dst0[pos + 3] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    int pos = x * 4 + y * stride0;
    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
  }
};

__device__ inline float2
rotate_identity (float x, float y)
{
  return make_float2(x, y);
}

__device__ inline float2
rotate_90r (float x, float y)
{
  return make_float2(y, 1.0 - x);
}

__device__ inline float2
rotate_180 (float x, float y)
{
  return make_float2(1.0 - x, 1.0 - y);
}

__device__ inline float2
rotate_90l (float x, float y)
{
  return make_float2(1.0 - y, x);
}

__device__ inline float2
rotate_horiz (float x, float y)
{
  return make_float2(1.0 - x, y);
}

__device__ inline float2
rotate_vert (float x, float y)
{
  return make_float2(x, 1.0 - y);
}

__device__ inline float2
rotate_ul_lr (float x, float y)
{
  return make_float2(y, x);
}

__device__ inline float2
rotate_ur_ll (float x, float y)
{
  return make_float2(1.0 - y, 1.0 - x);
}
__device__ inline float2
do_rotate (float x, float y, int direction)
{
  switch (direction) {
    case 1:
      return rotate_90r (x, y);
    case 2:
      return rotate_180 (x, y);
    case 3:
      return rotate_90l (x, y);
    case 4:
      return rotate_horiz (x, y);
    case 5:
      return rotate_vert (x, y);
    case 6:
      return rotate_ul_lr (x, y);
    case 7:
      return rotate_ur_ll (x, y);
    default:
      return rotate_identity (x, y);
  }
}

extern "C" {
__global__ void
GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
    cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,
    unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,
    int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)
{
  ConvertSimple g_converter;
  SAMPLER g_sampler;
  OUTPUT g_output;
  int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;
  int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;
  float4 sample;
  if (x_pos >= const_buf.width || y_pos >= const_buf.height ||
      const_buf.view_width <= 0 || const_buf.view_height <= 0)
    return;
  if (x_pos < const_buf.left || x_pos >= const_buf.right ||
      y_pos < const_buf.top || y_pos >= const_buf.bottom) {
    if (!const_buf.fill_border)
      return;
    sample = make_float4 (const_buf.border_x, const_buf.border_y,
       const_buf.border_z, const_buf.border_w);
  } else {
    float x = (__int2float_rz (x_pos - const_buf.left) + 0.5) / const_buf.view_width;
    if (x < 0.0 || x > 1.0)
      return;
    float y = (__int2float_rz (y_pos - const_buf.top) + 0.5) / const_buf.view_height;
    if (y < 0.0 || y > 1.0)
      return;
    float2 rotated = do_rotate (x, y, const_buf.video_direction);
    float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);
    float3 rgb = make_float3 (s.x, s.y, s.z);
    float3 yuv;
    if (const_buf.do_convert)
      yuv = g_converter.Execute (rgb, &const_buf.matrix);
    else
      yuv = rgb;
    sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);
  }
  sample.w = sample.w * const_buf.alpha;
  if (!const_buf.do_blend) {
    g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
  } else {
    g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
  }
}
}
#else
static const char GstCudaConverterMain_str[] =
"struct ColorMatrix\n"
"{\n"
"  float CoeffX[3];\n"
"  float CoeffY[3];\n"
"  float CoeffZ[3];\n"
"  float Offset[3];\n"
"  float Min[3];\n"
"  float Max[3];\n"
"};\n"
"\n"
"struct ConstBuffer\n"
"{\n"
"  ColorMatrix matrix;\n"
"  int width;\n"
"  int height;\n"
"  int left;\n"
"  int top;\n"
"  int right;\n"
"  int bottom;\n"
"  int view_width;\n"
"  int view_height;\n"
"  float border_x;\n"
"  float border_y;\n"
"  float border_z;\n"
"  float border_w;\n"
"  int fill_border;\n"
"  int video_direction;\n"
"  float alpha;\n"
"  int do_blend;\n"
"  int do_convert;\n"
"};\n"
"\n"
"__device__ inline float\n"
"dot (const float coeff[3], float3 val)\n"
"{\n"
"  return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;\n"
"}\n"
"\n"
"__device__ inline float\n"
"clamp (float val, float min_val, float max_val)\n"
"{\n"
"  return max (min_val, min (val, max_val));\n"
"}\n"
"\n"
"__device__ inline float3\n"
"clamp3 (float3 val, const float min_val[3], const float max_val[3])\n"
"{\n"
"  return make_float3 (clamp (val.x, min_val[0], max_val[0]),\n"
"      clamp (val.y, min_val[1], max_val[2]),\n"
"      clamp (val.z, min_val[1], max_val[2]));\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"scale_to_2bits (float val)\n"
"{\n"
"  return (unsigned short) __float2int_rz (val * 3.0);\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"scale_to_uchar (float val)\n"
"{\n"
"  return (unsigned char) __float2int_rz (val * 255.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_ushort (float val)\n"
"{\n"
"  return (unsigned short) __float2int_rz (val * 65535.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_10bits (float val)\n"
"{\n"
"  return (unsigned short) __float2int_rz (val * 1023.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_12bits (float val)\n"
"{\n"
"  return (unsigned short) __float2int_rz (val * 4095.0);\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"blend_uchar (unsigned char dst, float src, float src_alpha)\n"
"{\n"
"  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
"  float src_val = src * src_alpha;\n"
"  float dst_val = __int2float_rz (dst) / 255.0 * (1.0 - src_alpha);\n"
"  return scale_to_uchar(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_ushort (unsigned short dst, float src, float src_alpha)\n"
"{\n"
"  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
"  float src_val = src * src_alpha;\n"
"  float dst_val = __int2float_rz (dst) / 65535.0 * (1.0 - src_alpha);\n"
"  return scale_to_ushort(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_10bits (unsigned short dst, float src, float src_alpha)\n"
"{\n"
"  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
"  float src_val = src * src_alpha;\n"
"  float dst_val = __int2float_rz (dst) / 1023.0 * (1.0 - src_alpha);\n"
"  return scale_to_10bits(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_12bits (unsigned short dst, float src, float src_alpha)\n"
"{\n"
"  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
"  float src_val = src * src_alpha;\n"
"  float dst_val = __int2float_rz (dst) / 4095.0 * (1.0 - src_alpha);\n"
"  return scale_to_12bits(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"struct IConverter\n"
"{\n"
"  __device__ virtual float3\n"
"  Execute (float3 sample, const ColorMatrix * matrix) = 0;\n"
"};\n"
"\n"
"struct ConvertSimple : public IConverter\n"
"{\n"
"  __device__ float3\n"
"  Execute (float3 sample, const ColorMatrix * matrix)\n"
"  {\n"
"    float3 out;\n"
"    out.x = dot (matrix->CoeffX, sample);\n"
"    out.y = dot (matrix->CoeffY, sample);\n"
"    out.z = dot (matrix->CoeffZ, sample);\n"
"    out.x += matrix->Offset[0];\n"
"    out.y += matrix->Offset[1];\n"
"    out.z += matrix->Offset[2];\n"
"    return clamp3 (out, matrix->Min, matrix->Max);\n"
"  }\n"
"};\n"
"\n"
"struct ISampler\n"
"{\n"
"  __device__ virtual float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y) = 0;\n"
"};\n"
"\n"
"struct SampleI420 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float u = tex2D<float>(tex1, x, y);\n"
"    float v = tex2D<float>(tex2, x, y);\n"
"    return make_float4 (luma, u, v, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleYV12 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float u = tex2D<float>(tex2, x, y);\n"
"    float v = tex2D<float>(tex1, x, y);\n"
"    return make_float4 (luma, u, v, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleI420_10 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float u = tex2D<float>(tex1, x, y);\n"
"    float v = tex2D<float>(tex2, x, y);\n"
"    /* (1 << 6) to scale [0, 1.0) range */\n"
"    return make_float4 (luma * 64, u * 64, v * 64, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleI420_12 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float u = tex2D<float>(tex1, x, y);\n"
"    float v = tex2D<float>(tex2, x, y);\n"
"    /* (1 << 4) to scale [0, 1.0) range */\n"
"    return make_float4 (luma * 16, u * 16, v * 16, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleNV12 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float2 uv = tex2D<float2>(tex1, x, y);\n"
"    return make_float4 (luma, uv.x, uv.y, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleNV21 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float luma = tex2D<float>(tex0, x, y);\n"
"    float2 vu = tex2D<float2>(tex1, x, y);\n"
"    return make_float4 (luma, vu.y, vu.x, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleRGBA : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    return tex2D<float4>(tex0, x, y);\n"
"  }\n"
"};\n"
"\n"
"struct SampleBGRA : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float4 bgra = tex2D<float4>(tex0, x, y);\n"
"    return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);\n"
"  }\n"
"};\n"
"\n"
"struct SampleRGBx : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float4 rgbx = tex2D<float4>(tex0, x, y);\n"
"    rgbx.w = 1;\n"
"    return rgbx;\n"
"  }\n"
"};\n"
"\n"
"struct SampleBGRx : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float4 bgrx = tex2D<float4>(tex0, x, y);\n"
"    return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleARGB : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"   float4 argb = tex2D<float4>(tex0, x, y);\n"
"   return make_float4 (argb.y, argb.z, argb.w, argb.x);\n"
"  }\n"
"};\n"
"\n"
"struct SampleABGR : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"   float4 abgr = tex2D<float4>(tex0, x, y);\n"
"   return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);\n"
"  }\n"
"};\n"
"\n"
"struct SampleRGBP : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float r = tex2D<float>(tex0, x, y);\n"
"    float g = tex2D<float>(tex1, x, y);\n"
"    float b = tex2D<float>(tex2, x, y);\n"
"    return make_float4 (r, g, b, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleBGRP : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float b = tex2D<float>(tex0, x, y);\n"
"    float g = tex2D<float>(tex1, x, y);\n"
"    float r = tex2D<float>(tex2, x, y);\n"
"    return make_float4 (r, g, b, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleGBR : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float g = tex2D<float>(tex0, x, y);\n"
"    float b = tex2D<float>(tex1, x, y);\n"
"    float r = tex2D<float>(tex2, x, y);\n"
"    return make_float4 (r, g, b, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleGBR_10 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float g = tex2D<float>(tex0, x, y);\n"
"    float b = tex2D<float>(tex1, x, y);\n"
"    float r = tex2D<float>(tex2, x, y);\n"
"    /* (1 << 6) to scale [0, 1.0) range */\n"
"    return make_float4 (r * 64, g * 64, b * 64, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleGBR_12 : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float g = tex2D<float>(tex0, x, y);\n"
"    float b = tex2D<float>(tex1, x, y);\n"
"    float r = tex2D<float>(tex2, x, y);\n"
"    /* (1 << 4) to scale [0, 1.0) range */\n"
"    return make_float4 (r * 16, g * 16, b * 16, 1);\n"
"  }\n"
"};\n"
"\n"
"struct SampleGBRA : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float g = tex2D<float>(tex0, x, y);\n"
"    float b = tex2D<float>(tex1, x, y);\n"
"    float r = tex2D<float>(tex2, x, y);\n"
"    float a = tex2D<float>(tex3, x, y);\n"
"    return make_float4 (r, g, b, a);\n"
"  }\n"
"};\n"
"\n"
"struct SampleVUYA : public ISampler\n"
"{\n"
"  __device__ float4\n"
"  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
"  {\n"
"    float4 vuya = tex2D<float4>(tex0, x, y);\n"
"    return make_float4 (vuya.z, vuya.y, vuya.x, vuya.w);\n"
"  }\n"
"};\n"
"\n"
"struct IOutput\n"
"{\n"
"  __device__ virtual void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1) = 0;\n"
"\n"
"  __device__ virtual void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"        unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"        int stride1) = 0;\n"
"};\n"
"\n"
"struct OutputI420 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x / 2 + (y / 2) * stride1;\n"
"      dst1[pos] = scale_to_uchar (sample.y);\n"
"      dst2[pos] = scale_to_uchar (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x / 2 + (y / 2) * stride1;\n"
"      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"      dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputYV12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x / 2 + (y / 2) * stride1;\n"
"      dst1[pos] = scale_to_uchar (sample.z);\n"
"      dst2[pos] = scale_to_uchar (sample.y);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x / 2 + (y / 2) * stride1;\n"
"      dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
"      dst2[pos] = blend_uchar (dst2[pos], sample.y, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputNV12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x + (y / 2) * stride1;\n"
"      dst1[pos] = scale_to_uchar (sample.y);\n"
"      dst1[pos + 1] = scale_to_uchar (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x + (y / 2) * stride1;\n"
"      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"      dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputNV21 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x + (y / 2) * stride1;\n"
"      dst1[pos] = scale_to_uchar (sample.z);\n"
"      dst1[pos + 1] = scale_to_uchar (sample.y);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x + (y / 2) * stride1;\n"
"      dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
"      dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.y, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputP010 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x * 2 + (y / 2) * stride1;\n"
"      *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
"      *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_ushort (*target, sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x * 2 + (y / 2) * stride1;\n"
"      target = (unsigned short *) &dst1[pos];\n"
"      *target = blend_ushort (*target, sample.y, sample.w);\n"
"      target = (unsigned short *) &dst1[pos + 2];\n"
"      *target = blend_ushort (*target, sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputI420_10 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x + (y / 2) * stride1;\n"
"      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
"      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_10bits (*target, sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x * 2 + (y / 2) * stride1;\n"
"      target = (unsigned short *) &dst1[pos];\n"
"      *target = blend_10bits (*target, sample.y, sample.w);\n"
"      target = (unsigned short *) &dst2[pos];\n"
"      *target = blend_10bits (*target, sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputI420_12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      unsigned int pos = x + (y / 2) * stride1;\n"
"      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
"      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_12bits (*target, sample.x, sample.w);\n"
"    if (x % 2 == 0 && y % 2 == 0) {\n"
"      pos = x * 2 + (y / 2) * stride1;\n"
"      target = (unsigned short *) &dst1[pos];\n"
"      *target = blend_12bits (*target, sample.y, sample.w);\n"
"      target = (unsigned short *) &dst2[pos];\n"
"      *target = blend_12bits (*target, sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputY444 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.x);\n"
"    dst1[pos] = scale_to_uchar (sample.y);\n"
"    dst2[pos] = scale_to_uchar (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"    dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputY444_10 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.x);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_10bits (*target, sample.x, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_10bits (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_10bits (*target, sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputY444_12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.x);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_12bits (*target, sample.x, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_12bits (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_12bits (*target, sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputY444_16 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_ushort (*target, sample.x, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_ushort (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_ushort (*target, sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputRGBA : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 3] = scale_to_uchar (sample.w);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
"    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputRGBx : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 3] = 255;\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
"    dst0[pos + 3] = 255;\n"
"  }\n"
"};\n"
"\n"
"struct OutputBGRA : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 3] = scale_to_uchar (sample.w);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
"    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputBGRx : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 3] = 255;\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
"    dst0[pos + 3] = 255;\n"
"  }\n"
"};\n"
"\n"
"struct OutputARGB : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.w);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 3] = scale_to_uchar (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.x, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
"    dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputABGR : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.w);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 3] = scale_to_uchar (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.z, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
"    dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputRGB : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 3 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 3 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputBGR : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 3 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 3 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"__device__ inline ushort3\n"
"unpack_rgb10a2 (unsigned int val)\n"
"{\n"
"  unsigned short r, g, b;\n"
"  r = (val & 0x3ff);\n"
"  r = (r << 6) | (r >> 4);\n"
"  g = ((val >> 10) & 0x3ff);\n"
"  g = (g << 6) | (g >> 4);\n"
"  b = ((val >> 20) & 0x3ff);\n"
"  b = (b << 6) | (b >> 4);\n"
"  return make_ushort3 (r, g, b);\n"
"}\n"
"\n"
"struct OutputRGB10A2 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
"    unsigned int packed_rgb = alpha << 30;\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;\n"
"    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
"    ushort3 val = unpack_rgb10a2 (*target);\n"
"    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
"    unsigned int packed_rgb = alpha << 30;\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w));\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w)) << 20;\n"
"    *target = packed_rgb;\n"
"  }\n"
"};\n"
"\n"
"__device__ inline ushort3\n"
"unpack_bgr10a2 (unsigned int val)\n"
"{\n"
"  unsigned short r, g, b;\n"
"  b = (val & 0x3ff);\n"
"  b = (b << 6) | (b >> 4);\n"
"  g = ((val >> 10) & 0x3ff);\n"
"  g = (g << 6) | (g >> 4);\n"
"  r = ((val >> 20) & 0x3ff);\n"
"  r = (r << 6) | (r >> 4);\n"
"  return make_ushort3 (r, g, b);\n"
"}\n"
"\n"
"struct OutputBGR10A2 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n"
"    unsigned int packed_rgb = alpha << 30;\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
"    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));\n"
"    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
"    ushort3 val = unpack_bgr10a2 (*target);\n"
"    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
"    unsigned int packed_rgb = alpha << 30;\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w)) << 20;\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
"    packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w));\n"
"    *target = packed_rgb;\n"
"  }\n"
"};\n"
"\n"
"struct OutputY42B : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
"    if (x % 2 == 0) {\n"
"      unsigned int pos = x / 2 + y * stride1;\n"
"      dst1[pos] = scale_to_uchar (sample.y);\n"
"      dst2[pos] = scale_to_uchar (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    if (x % 2 == 0) {\n"
"      pos = x / 2 + y * stride1;\n"
"      dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"      dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputI422_10 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
"    if (x % 2 == 0) {\n"
"      unsigned int pos = x + y * stride1;\n"
"      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
"      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_10bits (*target, sample.x, sample.w);\n"
"    if (x % 2 == 0) {\n"
"      pos = x / 2 + y * stride1;\n"
"      target = (unsigned short *) &dst1[pos];\n"
"      *target = blend_10bits (*target, sample.y, sample.w);\n"
"      target = (unsigned short *) &dst2[pos];\n"
"      *target = blend_10bits (*target, sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputI422_12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
"    if (x % 2 == 0) {\n"
"      unsigned int pos = x + y * stride1;\n"
"      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
"      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
"    }\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    unsigned int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_12bits (*target, sample.x, sample.w);\n"
"    if (x % 2 == 0) {\n"
"      pos = x / 2 + y * stride1;\n"
"      target = (unsigned short *) &dst1[pos];\n"
"      *target = blend_12bits (*target, sample.y, sample.w);\n"
"      target = (unsigned short *) &dst2[pos];\n"
"      *target = blend_12bits (*target, sample.z, sample.w);\n"
"    }\n"
"  }\n"
"};\n"
"\n"
"struct OutputRGBP : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.x);\n"
"    dst1[pos] = scale_to_uchar (sample.y);\n"
"    dst2[pos] = scale_to_uchar (sample.z);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
"    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"    dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputBGRP : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.z);\n"
"    dst1[pos] = scale_to_uchar (sample.y);\n"
"    dst2[pos] = scale_to_uchar (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
"    dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
"    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputGBR : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.y);\n"
"    dst1[pos] = scale_to_uchar (sample.z);\n"
"    dst2[pos] = scale_to_uchar (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
"    dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
"    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputGBR_10 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.y);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.z);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_10bits (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_10bits (*target, sample.z, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_10bits (*target, sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputGBR_12 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.y);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.z);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_12bits (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_12bits (*target, sample.z, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_12bits (*target, sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputGBR_16 : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.y);\n"
"    *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.z);\n"
"    *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.x);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 2 + y * stride0;\n"
"    unsigned short * target = (unsigned short *) &dst0[pos];\n"
"    *target = blend_ushort (*target, sample.y, sample.w);\n"
"    target = (unsigned short *) &dst1[pos];\n"
"    *target = blend_ushort (*target, sample.z, sample.w);\n"
"    target = (unsigned short *) &dst2[pos];\n"
"    *target = blend_ushort (*target, sample.x, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputGBRA : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.y);\n"
"    dst1[pos] = scale_to_uchar (sample.z);\n"
"    dst2[pos] = scale_to_uchar (sample.x);\n"
"    dst3[pos] = scale_to_uchar (sample.w);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
"    dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
"    dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
"    dst3[pos] = blend_uchar (dst3[pos], 1.0, sample.w);\n"
"  }\n"
"};\n"
"\n"
"struct OutputVUYA : public IOutput\n"
"{\n"
"  __device__ void\n"
"  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = scale_to_uchar (sample.z);\n"
"    dst0[pos + 1] = scale_to_uchar (sample.y);\n"
"    dst0[pos + 2] = scale_to_uchar (sample.x);\n"
"    dst0[pos + 3] = scale_to_uchar (sample.w);\n"
"  }\n"
"\n"
"  __device__ void\n"
"  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
"      unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
"      int stride1)\n"
"  {\n"
"    int pos = x * 4 + y * stride0;\n"
"    dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
"    dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
"    dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
"    dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
"  }\n"
"};\n"
"\n"
"__device__ inline float2\n"
"rotate_identity (float x, float y)\n"
"{\n"
"  return make_float2(x, y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_90r (float x, float y)\n"
"{\n"
"  return make_float2(y, 1.0 - x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_180 (float x, float y)\n"
"{\n"
"  return make_float2(1.0 - x, 1.0 - y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_90l (float x, float y)\n"
"{\n"
"  return make_float2(1.0 - y, x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_horiz (float x, float y)\n"
"{\n"
"  return make_float2(1.0 - x, y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_vert (float x, float y)\n"
"{\n"
"  return make_float2(x, 1.0 - y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_ul_lr (float x, float y)\n"
"{\n"
"  return make_float2(y, x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_ur_ll (float x, float y)\n"
"{\n"
"  return make_float2(1.0 - y, 1.0 - x);\n"
"}\n"
"__device__ inline float2\n"
"do_rotate (float x, float y, int direction)\n"
"{\n"
"  switch (direction) {\n"
"    case 1:\n"
"      return rotate_90r (x, y);\n"
"    case 2:\n"
"      return rotate_180 (x, y);\n"
"    case 3:\n"
"      return rotate_90l (x, y);\n"
"    case 4:\n"
"      return rotate_horiz (x, y);\n"
"    case 5:\n"
"      return rotate_vert (x, y);\n"
"    case 6:\n"
"      return rotate_ul_lr (x, y);\n"
"    case 7:\n"
"      return rotate_ur_ll (x, y);\n"
"    default:\n"
"      return rotate_identity (x, y);\n"
"  }\n"
"}\n"
"\n"
"extern \"C\" {\n"
"__global__ void\n"
"GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,\n"
"    unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,\n"
"    int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)\n"
"{\n"
"  ConvertSimple g_converter;\n"
"  SAMPLER g_sampler;\n"
"  OUTPUT g_output;\n"
"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;\n"
"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;\n"
"  float4 sample;\n"
"  if (x_pos >= const_buf.width || y_pos >= const_buf.height ||\n"
"      const_buf.view_width <= 0 || const_buf.view_height <= 0)\n"
"    return;\n"
"  if (x_pos < const_buf.left || x_pos >= const_buf.right ||\n"
"      y_pos < const_buf.top || y_pos >= const_buf.bottom) {\n"
"    if (!const_buf.fill_border)\n"
"      return;\n"
"    sample = make_float4 (const_buf.border_x, const_buf.border_y,\n"
"       const_buf.border_z, const_buf.border_w);\n"
"  } else {\n"
"    float x = (__int2float_rz (x_pos - const_buf.left) + 0.5) / const_buf.view_width;\n"
"    if (x < 0.0 || x > 1.0)\n"
"      return;\n"
"    float y = (__int2float_rz (y_pos - const_buf.top) + 0.5) / const_buf.view_height;\n"
"    if (y < 0.0 || y > 1.0)\n"
"      return;\n"
"    float2 rotated = do_rotate (x, y, const_buf.video_direction);\n"
"    float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);\n"
"    float3 rgb = make_float3 (s.x, s.y, s.z);\n"
"    float3 yuv;\n"
"    if (const_buf.do_convert)\n"
"      yuv = g_converter.Execute (rgb, &const_buf.matrix);\n"
"    else\n"
"      yuv = rgb;\n"
"    sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);\n"
"  }\n"
"  sample.w = sample.w * const_buf.alpha;\n"
"  if (!const_buf.do_blend) {\n"
"    g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
"  } else {\n"
"    g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
"  }\n"
"}\n"
"}\n"
"\n";
#endif