nvcodec: Add support for CUDA kernel precompile

Enable build time CUDA kernel compile if nvcc is detected. Precompile is disabled by default and controlled by "nvcodec-cuda-precompile" build option. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
2025-02-20 15:26:37 +09:00 · 2025-02-20 15:26:37 +09:00 · 9a8f3a65a3
commit 9a8f3a65a3
parent 8165735902
6 changed files with 3287 additions and 1 deletions
--- a/subprojects/gst-plugins-bad/meson_options.txt
+++ b/subprojects/gst-plugins-bad/meson_options.txt
@ -246,6 +246,10 @@ option('sctp-internal-usrsctp', type: 'feature', value : 'enabled',
 option('mfx_api', type : 'combo', choices : ['MSDK', 'oneVPL', 'auto'], value : 'auto',
       description : 'Select MFX API to build against')

+# nvcodec plugin options
+option('nvcodec-cuda-precompile', type : 'feature', value : 'disabled', description : 'Enable CUDA kernel precompile')
+option('nvcodec-nvcc-arch', type : 'string', value : 'compute_52', description : 'GPU architectur for nvcc -arch option')
+
 # nvCOMP plugin options
 option('nvcomp-sdk-path', type: 'string', value : '',
       description : 'nvCOMP SDK root directory')
--- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/collect_ptx_headers.py
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/collect_ptx_headers.py
@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# GStreamer
+# Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+# Boston, MA 02110-1301, USA.
+
+import sys
+import os
+import argparse
+
+start_header = """/*
+ * This file is autogenerated by collect_ptx_headers.py
+ */
+#pragma once
+
+"""
+
+start_map = """
+#define MAKE_BYTECODE(name) { G_STRINGIFY (name), g_##name }
+static std::unordered_map<std::string, const char *>
+"""
+
+end_map = """};
+#undef MAKE_BYTECODE
+"""
+
+def convert_ptx_to_header(ptx_file, header_file):
+    with open(ptx_file, 'r', encoding='utf8') as ptx:
+        ptx_content = ptx.read()
+
+    with open(header_file, 'w', newline='\n', encoding='utf8') as header:
+        header.write('#pragma once\n')
+        header.write('// This file is autogenerated by collect_ptx_headers.py\n')
+        header.write(f'static const char* g_{os.path.splitext(os.path.basename(ptx_file))[0]} = R"(\n')
+        header.write(ptx_content)
+        header.write(')";\n\n')
+
+
+def main(args):
+    parser = argparse.ArgumentParser(description='Read CUDA PTX from directory and make single header')
+    parser.add_argument("--input", help="the precompiled CUDA PTX directory")
+    parser.add_argument("--output", help="output header file location")
+    parser.add_argument("--prefix", help="CUDA PTX header filename prefix")
+    parser.add_argument("--name", help="Hash map variable name")
+
+    args = parser.parse_args(args)
+
+    ptx_files = [os.path.join(args.input, file) for file in os.listdir(args.input) if file.startswith(args.prefix) and file.endswith(".ptx") ]
+
+    with open(args.output, 'w', newline='\n', encoding='utf8') as f:
+        f.write(start_header)
+        for ptx_file in ptx_files:
+            header_file = os.path.splitext(ptx_file)[0] + '.h'
+            convert_ptx_to_header(ptx_file, header_file)
+            f.write("#include \"")
+            f.write(os.path.basename(header_file))
+            f.write("\"\n")
+        f.write(start_map)
+        f.write(args.name)
+        f.write(" = {\n")
+        for ptx_file in ptx_files:
+            f.write("  MAKE_BYTECODE ({}),\n".format(os.path.splitext(os.path.basename(ptx_file))[0]))
+        f.write(end_map)
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
--- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter-unpack.cu
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter-unpack.cu
@ -0,0 +1,168 @@
+/* GStreamer
+ * Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef __NVCC__
+extern "C" {
+__global__ void
+GstCudaConverterUnpack_RGB_RGBx
+(unsigned char *src, unsigned char *dst, int width, int height,
+    int src_stride, int dst_stride)
+{
+  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
+  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
+  if (x_pos < width && y_pos < height) {
+    int dst_pos = x_pos * 4 + y_pos * dst_stride;
+    int src_pos = x_pos * 3 + y_pos * src_stride;
+    dst[dst_pos] = src[src_pos];
+    dst[dst_pos + 1] = src[src_pos + 1];
+    dst[dst_pos + 2] = src[src_pos + 2];
+    dst[dst_pos + 3] = 0xff;
+  }
+}
+
+__global__ void
+GstCudaConverterUnpack_RGB10A2_ARGB64
+(unsigned char *src, unsigned char *dst, int width, int height,
+    int src_stride, int dst_stride)
+{
+  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
+  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
+  if (x_pos < width && y_pos < height) {
+    unsigned short a, r, g, b;
+    unsigned int val;
+    int dst_pos = x_pos * 8 + y_pos * dst_stride;
+    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];
+    a = (val >> 30) & 0x03;
+    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);
+    r = (val & 0x3ff);
+    r = (r << 6) | (r >> 4);
+    g = ((val >> 10) & 0x3ff);
+    g = (g << 6) | (g >> 4);
+    b = ((val >> 20) & 0x3ff);
+    b = (b << 6) | (b >> 4);
+    *(unsigned short *) &dst[dst_pos] = a;
+    *(unsigned short *) &dst[dst_pos + 2] = r;
+    *(unsigned short *) &dst[dst_pos + 4] = g;
+    *(unsigned short *) &dst[dst_pos + 6] = b;
+  }
+}
+
+__global__ void
+GstCudaConverterUnpack_BGR10A2_ARGB64
+(unsigned char *src, unsigned char *dst, int width, int height,
+    int src_stride, int dst_stride)
+{
+  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
+  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
+  if (x_pos < width && y_pos < height) {
+    unsigned short a, r, g, b;
+    unsigned int val;
+    int dst_pos = x_pos * 8 + y_pos * dst_stride;
+    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];
+    a = (val >> 30) & 0x03;
+    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);
+    b = (val & 0x3ff);
+    b = (b << 6) | (b >> 4);
+    g = ((val >> 10) & 0x3ff);
+    g = (g << 6) | (g >> 4);
+    r = ((val >> 20) & 0x3ff);
+    r = (r << 6) | (r >> 4);
+    *(unsigned short *) &dst[dst_pos] = a;
+    *(unsigned short *) &dst[dst_pos + 2] = r;
+    *(unsigned short *) &dst[dst_pos + 4] = g;
+    *(unsigned short *) &dst[dst_pos + 6] = b;
+  }
+}
+}
+#else
+static const char GstCudaConverterUnpack_str[] =
+"extern \"C\" {\n"
+"__global__ void\n"
+"GstCudaConverterUnpack_RGB_RGBx\n"
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    int dst_pos = x_pos * 4 + y_pos * dst_stride;\n"
+"    int src_pos = x_pos * 3 + y_pos * src_stride;\n"
+"    dst[dst_pos] = src[src_pos];\n"
+"    dst[dst_pos + 1] = src[src_pos + 1];\n"
+"    dst[dst_pos + 2] = src[src_pos + 2];\n"
+"    dst[dst_pos + 3] = 0xff;\n"
+"  }\n"
+"}\n"
+"\n"
+"__global__ void\n"
+"GstCudaConverterUnpack_RGB10A2_ARGB64\n"
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    unsigned short a, r, g, b;\n"
+"    unsigned int val;\n"
+"    int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
+"    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
+"    a = (val >> 30) & 0x03;\n"
+"    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
+"    r = (val & 0x3ff);\n"
+"    r = (r << 6) | (r >> 4);\n"
+"    g = ((val >> 10) & 0x3ff);\n"
+"    g = (g << 6) | (g >> 4);\n"
+"    b = ((val >> 20) & 0x3ff);\n"
+"    b = (b << 6) | (b >> 4);\n"
+"    *(unsigned short *) &dst[dst_pos] = a;\n"
+"    *(unsigned short *) &dst[dst_pos + 2] = r;\n"
+"    *(unsigned short *) &dst[dst_pos + 4] = g;\n"
+"    *(unsigned short *) &dst[dst_pos + 6] = b;\n"
+"  }\n"
+"}\n"
+"\n"
+"__global__ void\n"
+"GstCudaConverterUnpack_BGR10A2_ARGB64\n"
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    unsigned short a, r, g, b;\n"
+"    unsigned int val;\n"
+"    int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
+"    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
+"    a = (val >> 30) & 0x03;\n"
+"    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
+"    b = (val & 0x3ff);\n"
+"    b = (b << 6) | (b >> 4);\n"
+"    g = ((val >> 10) & 0x3ff);\n"
+"    g = (g << 6) | (g >> 4);\n"
+"    r = ((val >> 20) & 0x3ff);\n"
+"    r = (r << 6) | (r >> 4);\n"
+"    *(unsigned short *) &dst[dst_pos] = a;\n"
+"    *(unsigned short *) &dst[dst_pos + 2] = r;\n"
+"    *(unsigned short *) &dst[dst_pos + 4] = g;\n"
+"    *(unsigned short *) &dst[dst_pos + 6] = b;\n"
+"  }\n"
+"}\n"
+"}\n"
+"\n";
+#endif
--- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter.cu
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter.cu
--- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build
@ -0,0 +1,102 @@
+conv_source = files('gstcudaconverter.cu')
+conv_comm_source = files('gstcudaconverter-unpack.cu')
+
+conv_input_formats = [
+  'I420',
+  'YV12',
+  'I420_10',
+  'I420_12',
+  'NV12',
+  'NV21',
+  'VUYA',
+  'RGBA',
+  'BGRA',
+  'RGBx',
+  'BGRx',
+  'ARGB',
+  'ABGR',
+  'RGBP',
+  'BGRP',
+  'GBR',
+  'GBR_10',
+  'GBR_12',
+  'GBRA',
+]
+
+conv_output_formats = [
+  'I420',
+  'YV12',
+  'NV12',
+  'NV21',
+  'P010',
+  'I420_10',
+  'I420_12',
+  'Y444',
+  'Y444_10',
+  'Y444_12',
+  'Y444_16',
+  'Y42B',
+  'I422_10',
+  'I422_12',
+  'VUYA',
+  'RGBA',
+  'RGBx',
+  'BGRA',
+  'BGRx',
+  'ARGB',
+  'ABGR',
+  'RGB',
+  'BGR',
+  'RGB10A2',
+  'BGR10A2',
+  'RGBP',
+  'GBR',
+  'GBR_10',
+  'GBR_12',
+  'GBR_16',
+  'GBRA',
+]
+
+header_collector = find_program('collect_ptx_headers.py')
+
+conv_precompiled = []
+opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
+arch_opt = get_option('nvcodec-nvcc-arch')
+if arch_opt != ''
+  opt_common += ['-arch=' + arch_opt]
+endif
+
+foreach input_format : conv_input_formats
+  foreach output_format : conv_output_formats
+    ptx_name = 'GstCudaConverterMain_@0@_@1@.ptx'.format(input_format, output_format)
+    opts = opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
+            '-DOUTPUT=Output@0@'.format(output_format), '@INPUT@']
+    compiled_kernel = custom_target(ptx_name,
+        input : conv_source,
+        output : ptx_name,
+        command : [nvcc] + opts)
+    conv_precompiled += [compiled_kernel]
+  endforeach
+endforeach
+
+ptx_name = 'GstCudaConverterUnpack.ptx'
+compiled_kernel = custom_target(ptx_name,
+    input : conv_comm_source,
+    output : ptx_name,
+    command : [nvcc] + opt_common + ['@INPUT@'])
+conv_precompiled += [compiled_kernel]
+
+conv_ptx_collection = custom_target('converter_ptx',
+    input : conv_precompiled,
+    output : 'converter_ptx.h',
+    command : [header_collector,
+        '--input', meson.current_build_dir(),
+        '--prefix', 'GstCudaConverter',
+        '--name', 'g_precompiled_ptx_table',
+        '--output', '@OUTPUT@'
+    ])
+
+nvcodec_kernel_precompiled += [
+  conv_precompiled,
+  conv_ptx_collection,
+]
--- a/subprojects/gst-plugins-bad/sys/nvcodec/meson.build
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/meson.build
@ -75,6 +75,15 @@ plugin_incdirs = [configinc, cuda_stubinc]
 extra_args = ['-DGST_USE_UNSTABLE_API']
 extra_deps = []

+nvcodec_kernel_precompiled = []
+nvcodec_precompile_opt = get_option('nvcodec-cuda-precompile')
+if not nvcodec_precompile_opt.disabled() and not meson.is_cross_build()
+  nvcc = find_program ('nvcc', required : nvcodec_precompile_opt)
+  if nvcc.found()
+    subdir('kernel')
+  endif
+endif
+
 if gstgl_dep.found()
  extra_args += ['-DHAVE_CUDA_GST_GL']
 endif
@ -121,7 +130,7 @@ if cc.get_id() != 'msvc'
 endif

 gstnvcodec = library('gstnvcodec',
-  nvcodec_sources,
+  nvcodec_sources + nvcodec_kernel_precompiled,
  c_args : gst_plugins_bad_args + extra_args,
  cpp_args : gst_plugins_bad_args + extra_args,
  include_directories : plugin_incdirs,