nvcodec: Add support for CUDA kernel precompile

Enable build time CUDA kernel compile if nvcc is detected.
Precompile is disabled by default and controlled by
"nvcodec-cuda-precompile" build option.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
This commit is contained in:
Seungha Yang 2025-02-20 15:26:37 +09:00 committed by GStreamer Marge Bot
parent 8165735902
commit 9a8f3a65a3
6 changed files with 3287 additions and 1 deletions

View File

@ -246,6 +246,10 @@ option('sctp-internal-usrsctp', type: 'feature', value : 'enabled',
option('mfx_api', type : 'combo', choices : ['MSDK', 'oneVPL', 'auto'], value : 'auto',
description : 'Select MFX API to build against')
# nvcodec plugin options
option('nvcodec-cuda-precompile', type : 'feature', value : 'disabled', description : 'Enable CUDA kernel precompile')
option('nvcodec-nvcc-arch', type : 'string', value : 'compute_52', description : 'GPU architectur for nvcc -arch option')
# nvCOMP plugin options
option('nvcomp-sdk-path', type: 'string', value : '',
description : 'nvCOMP SDK root directory')

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python3
# GStreamer
# Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301, USA.
import sys
import os
import argparse
start_header = """/*
* This file is autogenerated by collect_ptx_headers.py
*/
#pragma once
"""
start_map = """
#define MAKE_BYTECODE(name) { G_STRINGIFY (name), g_##name }
static std::unordered_map<std::string, const char *>
"""
end_map = """};
#undef MAKE_BYTECODE
"""
def convert_ptx_to_header(ptx_file, header_file):
with open(ptx_file, 'r', encoding='utf8') as ptx:
ptx_content = ptx.read()
with open(header_file, 'w', newline='\n', encoding='utf8') as header:
header.write('#pragma once\n')
header.write('// This file is autogenerated by collect_ptx_headers.py\n')
header.write(f'static const char* g_{os.path.splitext(os.path.basename(ptx_file))[0]} = R"(\n')
header.write(ptx_content)
header.write(')";\n\n')
def main(args):
parser = argparse.ArgumentParser(description='Read CUDA PTX from directory and make single header')
parser.add_argument("--input", help="the precompiled CUDA PTX directory")
parser.add_argument("--output", help="output header file location")
parser.add_argument("--prefix", help="CUDA PTX header filename prefix")
parser.add_argument("--name", help="Hash map variable name")
args = parser.parse_args(args)
ptx_files = [os.path.join(args.input, file) for file in os.listdir(args.input) if file.startswith(args.prefix) and file.endswith(".ptx") ]
with open(args.output, 'w', newline='\n', encoding='utf8') as f:
f.write(start_header)
for ptx_file in ptx_files:
header_file = os.path.splitext(ptx_file)[0] + '.h'
convert_ptx_to_header(ptx_file, header_file)
f.write("#include \"")
f.write(os.path.basename(header_file))
f.write("\"\n")
f.write(start_map)
f.write(args.name)
f.write(" = {\n")
for ptx_file in ptx_files:
f.write(" MAKE_BYTECODE ({}),\n".format(os.path.splitext(os.path.basename(ptx_file))[0]))
f.write(end_map)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

View File

@ -0,0 +1,168 @@
/* GStreamer
* Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef __NVCC__
extern "C" {
__global__ void
GstCudaConverterUnpack_RGB_RGBx
(unsigned char *src, unsigned char *dst, int width, int height,
int src_stride, int dst_stride)
{
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
if (x_pos < width && y_pos < height) {
int dst_pos = x_pos * 4 + y_pos * dst_stride;
int src_pos = x_pos * 3 + y_pos * src_stride;
dst[dst_pos] = src[src_pos];
dst[dst_pos + 1] = src[src_pos + 1];
dst[dst_pos + 2] = src[src_pos + 2];
dst[dst_pos + 3] = 0xff;
}
}
__global__ void
GstCudaConverterUnpack_RGB10A2_ARGB64
(unsigned char *src, unsigned char *dst, int width, int height,
int src_stride, int dst_stride)
{
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
if (x_pos < width && y_pos < height) {
unsigned short a, r, g, b;
unsigned int val;
int dst_pos = x_pos * 8 + y_pos * dst_stride;
val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];
a = (val >> 30) & 0x03;
a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);
r = (val & 0x3ff);
r = (r << 6) | (r >> 4);
g = ((val >> 10) & 0x3ff);
g = (g << 6) | (g >> 4);
b = ((val >> 20) & 0x3ff);
b = (b << 6) | (b >> 4);
*(unsigned short *) &dst[dst_pos] = a;
*(unsigned short *) &dst[dst_pos + 2] = r;
*(unsigned short *) &dst[dst_pos + 4] = g;
*(unsigned short *) &dst[dst_pos + 6] = b;
}
}
__global__ void
GstCudaConverterUnpack_BGR10A2_ARGB64
(unsigned char *src, unsigned char *dst, int width, int height,
int src_stride, int dst_stride)
{
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
if (x_pos < width && y_pos < height) {
unsigned short a, r, g, b;
unsigned int val;
int dst_pos = x_pos * 8 + y_pos * dst_stride;
val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];
a = (val >> 30) & 0x03;
a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);
b = (val & 0x3ff);
b = (b << 6) | (b >> 4);
g = ((val >> 10) & 0x3ff);
g = (g << 6) | (g >> 4);
r = ((val >> 20) & 0x3ff);
r = (r << 6) | (r >> 4);
*(unsigned short *) &dst[dst_pos] = a;
*(unsigned short *) &dst[dst_pos + 2] = r;
*(unsigned short *) &dst[dst_pos + 4] = g;
*(unsigned short *) &dst[dst_pos + 6] = b;
}
}
}
#else
static const char GstCudaConverterUnpack_str[] =
"extern \"C\" {\n"
"__global__ void\n"
"GstCudaConverterUnpack_RGB_RGBx\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" int dst_pos = x_pos * 4 + y_pos * dst_stride;\n"
" int src_pos = x_pos * 3 + y_pos * src_stride;\n"
" dst[dst_pos] = src[src_pos];\n"
" dst[dst_pos + 1] = src[src_pos + 1];\n"
" dst[dst_pos + 2] = src[src_pos + 2];\n"
" dst[dst_pos + 3] = 0xff;\n"
" }\n"
"}\n"
"\n"
"__global__ void\n"
"GstCudaConverterUnpack_RGB10A2_ARGB64\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" unsigned short a, r, g, b;\n"
" unsigned int val;\n"
" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
" a = (val >> 30) & 0x03;\n"
" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
" r = (val & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" b = ((val >> 20) & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" *(unsigned short *) &dst[dst_pos] = a;\n"
" *(unsigned short *) &dst[dst_pos + 2] = r;\n"
" *(unsigned short *) &dst[dst_pos + 4] = g;\n"
" *(unsigned short *) &dst[dst_pos + 6] = b;\n"
" }\n"
"}\n"
"\n"
"__global__ void\n"
"GstCudaConverterUnpack_BGR10A2_ARGB64\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" unsigned short a, r, g, b;\n"
" unsigned int val;\n"
" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
" a = (val >> 30) & 0x03;\n"
" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
" b = (val & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" r = ((val >> 20) & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" *(unsigned short *) &dst[dst_pos] = a;\n"
" *(unsigned short *) &dst[dst_pos + 2] = r;\n"
" *(unsigned short *) &dst[dst_pos + 4] = g;\n"
" *(unsigned short *) &dst[dst_pos + 6] = b;\n"
" }\n"
"}\n"
"}\n"
"\n";
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,102 @@
conv_source = files('gstcudaconverter.cu')
conv_comm_source = files('gstcudaconverter-unpack.cu')
conv_input_formats = [
'I420',
'YV12',
'I420_10',
'I420_12',
'NV12',
'NV21',
'VUYA',
'RGBA',
'BGRA',
'RGBx',
'BGRx',
'ARGB',
'ABGR',
'RGBP',
'BGRP',
'GBR',
'GBR_10',
'GBR_12',
'GBRA',
]
conv_output_formats = [
'I420',
'YV12',
'NV12',
'NV21',
'P010',
'I420_10',
'I420_12',
'Y444',
'Y444_10',
'Y444_12',
'Y444_16',
'Y42B',
'I422_10',
'I422_12',
'VUYA',
'RGBA',
'RGBx',
'BGRA',
'BGRx',
'ARGB',
'ABGR',
'RGB',
'BGR',
'RGB10A2',
'BGR10A2',
'RGBP',
'GBR',
'GBR_10',
'GBR_12',
'GBR_16',
'GBRA',
]
header_collector = find_program('collect_ptx_headers.py')
conv_precompiled = []
opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
arch_opt = get_option('nvcodec-nvcc-arch')
if arch_opt != ''
opt_common += ['-arch=' + arch_opt]
endif
foreach input_format : conv_input_formats
foreach output_format : conv_output_formats
ptx_name = 'GstCudaConverterMain_@0@_@1@.ptx'.format(input_format, output_format)
opts = opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
'-DOUTPUT=Output@0@'.format(output_format), '@INPUT@']
compiled_kernel = custom_target(ptx_name,
input : conv_source,
output : ptx_name,
command : [nvcc] + opts)
conv_precompiled += [compiled_kernel]
endforeach
endforeach
ptx_name = 'GstCudaConverterUnpack.ptx'
compiled_kernel = custom_target(ptx_name,
input : conv_comm_source,
output : ptx_name,
command : [nvcc] + opt_common + ['@INPUT@'])
conv_precompiled += [compiled_kernel]
conv_ptx_collection = custom_target('converter_ptx',
input : conv_precompiled,
output : 'converter_ptx.h',
command : [header_collector,
'--input', meson.current_build_dir(),
'--prefix', 'GstCudaConverter',
'--name', 'g_precompiled_ptx_table',
'--output', '@OUTPUT@'
])
nvcodec_kernel_precompiled += [
conv_precompiled,
conv_ptx_collection,
]

View File

@ -75,6 +75,15 @@ plugin_incdirs = [configinc, cuda_stubinc]
extra_args = ['-DGST_USE_UNSTABLE_API']
extra_deps = []
nvcodec_kernel_precompiled = []
nvcodec_precompile_opt = get_option('nvcodec-cuda-precompile')
if not nvcodec_precompile_opt.disabled() and not meson.is_cross_build()
nvcc = find_program ('nvcc', required : nvcodec_precompile_opt)
if nvcc.found()
subdir('kernel')
endif
endif
if gstgl_dep.found()
extra_args += ['-DHAVE_CUDA_GST_GL']
endif
@ -121,7 +130,7 @@ if cc.get_id() != 'msvc'
endif
gstnvcodec = library('gstnvcodec',
nvcodec_sources,
nvcodec_sources + nvcodec_kernel_precompiled,
c_args : gst_plugins_bad_args + extra_args,
cpp_args : gst_plugins_bad_args + extra_args,
include_directories : plugin_incdirs,