nvjpegenc: Add support for kernel precompile

Port to CUDA precompile/cache

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
This commit is contained in:
Seungha Yang 2025-02-21 19:18:01 +09:00 committed by GStreamer Marge Bot
parent 697cfe38ef
commit ae8eef82a6
3 changed files with 182 additions and 44 deletions

View File

@ -27,6 +27,8 @@
#include <gmodule.h>
#include <string>
#include <mutex>
#include <unordered_map>
#include "kernel/gstnvjpegenc.cu"
/**
* SECTION:element-nvjpegenc
@ -43,6 +45,18 @@
*
*/
/* *INDENT-OFF* */
#ifdef NVCODEC_CUDA_PRECOMPILED
#include "kernel/jpegenc_ptx.h"
#else
static std::unordered_map<std::string, const char *> g_precompiled_ptx_table;
#endif
static std::unordered_map<std::string, const char *> g_cubin_table;
static std::unordered_map<std::string, const char *> g_ptx_table;
static std::mutex g_kernel_table_lock;
/* *INDENT-ON* */
GST_DEBUG_CATEGORY_STATIC (gst_nv_jpeg_enc_debug);
#define GST_CAT_DEFAULT gst_nv_jpeg_enc_debug
@ -391,38 +405,6 @@ gst_nv_jpeg_enc_set_context (GstElement * element, GstContext * context)
GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
}
#define KERNEL_MAIN_FUNC "gst_nv_jpec_enc_kernel"
/* *INDENT-OFF* */
const static gchar kernel_source[] =
"extern \"C\" {\n"
"__device__ inline unsigned char\n"
"scale_to_uchar (float val)\n"
"{\n"
" return (unsigned char) __float2int_rz (val * 255.0);\n"
"}\n"
"\n"
"__global__ void\n"
KERNEL_MAIN_FUNC "(cudaTextureObject_t uv_tex, unsigned char * out_u,\n"
" unsigned char * out_v, int width, int height, int stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos >= width || y_pos >= height)\n"
" return;\n"
" float x = 0;\n"
" float y = 0;\n"
" if (width > 1)\n"
" x = (float) x_pos / (width - 1);\n"
" if (height > 1)\n"
" y = (float) y_pos / (height - 1);\n"
" float2 uv = tex2D<float2> (uv_tex, x, y);\n"
" unsigned int pos = x_pos + (y_pos * stride);\n"
" out_u[pos] = scale_to_uchar (uv.x);\n"
" out_v[pos] = scale_to_uchar (uv.y);\n"
"}\n"
"}";
/* *INDENT-ON* */
static gboolean
gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
{
@ -444,28 +426,81 @@ gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
}
if (!priv->module && klass->have_nvrtc) {
auto program = gst_cuda_nvrtc_compile_cubin (kernel_source,
klass->cuda_device_id);
if (!program)
program = gst_cuda_nvrtc_compile (kernel_source);
const gchar *program = nullptr;
auto precompiled = g_precompiled_ptx_table.find ("GstJpegEnc");
CUresult ret;
if (precompiled != g_precompiled_ptx_table.end ())
program = precompiled->second;
if (!program) {
GST_ERROR_OBJECT (self, "Couldn't compile kernel source");
gst_cuda_context_pop (nullptr);
return FALSE;
if (program) {
GST_DEBUG_OBJECT (self, "Precompiled PTX available");
ret = CuModuleLoadData (&priv->module, program);
if (ret != CUDA_SUCCESS) {
GST_WARNING_OBJECT (self, "Could not load module from precompiled PTX");
priv->module = nullptr;
program = nullptr;
}
}
auto ret = CuModuleLoadData (&priv->module, program);
g_free (program);
if (!program) {
std::lock_guard < std::mutex > lk (g_kernel_table_lock);
std::string cubin_kernel_name =
"GstJpegEnc_device_" + std::to_string (klass->cuda_device_id);
if (!gst_cuda_result (ret)) {
auto cubin = g_cubin_table.find (cubin_kernel_name);
if (cubin == g_cubin_table.end ()) {
GST_DEBUG_OBJECT (self, "Building CUBIN");
program = gst_cuda_nvrtc_compile_cubin (GstNvJpegEncConvertMain_str,
klass->cuda_device_id);
if (program)
g_cubin_table[cubin_kernel_name] = program;
} else {
GST_DEBUG_OBJECT (self, "Found cached CUBIN");
program = cubin->second;
}
if (program) {
GST_DEBUG_OBJECT (self, "Loading CUBIN module");
ret = CuModuleLoadData (&priv->module, program);
if (ret != CUDA_SUCCESS) {
GST_WARNING_OBJECT (self, "Could not load module from cached CUBIN");
program = nullptr;
priv->module = nullptr;
}
}
if (!program) {
auto ptx = g_ptx_table.find ("GstJpegEnc");
if (ptx == g_ptx_table.end ()) {
GST_DEBUG_OBJECT (self, "Building PTX");
program = gst_cuda_nvrtc_compile (GstNvJpegEncConvertMain_str);
if (program)
g_ptx_table["GstJpegEnc"] = program;
} else {
GST_DEBUG_OBJECT (self, "Found cached PTX");
program = ptx->second;
}
}
if (program && !priv->module) {
GST_DEBUG_OBJECT (self, "Loading PTX module");
ret = CuModuleLoadData (&priv->module, program);
if (ret != CUDA_SUCCESS) {
GST_ERROR_OBJECT (self, "Could not load module from PTX");
program = nullptr;
priv->module = nullptr;
}
}
}
if (!priv->module) {
GST_ERROR_OBJECT (self, "Couldn't load module");
gst_cuda_context_pop (nullptr);
return FALSE;
}
ret = CuModuleGetFunction (&priv->kernel_func, priv->module,
KERNEL_MAIN_FUNC);
"GstNvJpegEncConvertMain");
if (!gst_cuda_result (ret)) {
GST_ERROR_OBJECT (self, "Couldn't get kernel function");
gst_cuda_context_pop (nullptr);
@ -1013,6 +1048,10 @@ gst_nv_jpeg_enc_register (GstPlugin * plugin, GstCudaContext * context,
g_object_get (context, "cuda-device-id", &cuda_device_id, nullptr);
std::string format_string;
#ifdef NVCODEC_CUDA_PRECOMPILED
have_nvrtc = TRUE;
#endif
if (have_nvrtc)
format_string = "NV12, I420, Y42B, Y444";
else

View File

@ -0,0 +1,78 @@
/* GStreamer
* Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef __NVCC__
__device__ inline unsigned char
scale_to_uchar (float val)
{
return (unsigned char) __float2int_rz (val * 255.0);
}
extern "C" {
__global__ void
GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
unsigned char * out_v, int width, int height, int stride)
{
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
if (x_pos >= width || y_pos >= height)
return;
float x = 0;
float y = 0;
if (width > 1)
x = (float) x_pos / (width - 1);
if (height > 1)
y = (float) y_pos / (height - 1);
float2 uv = tex2D<float2> (uv_tex, x, y);
unsigned int pos = x_pos + (y_pos * stride);
out_u[pos] = scale_to_uchar (uv.x);
out_v[pos] = scale_to_uchar (uv.y);
}
}
#else
static const gchar *GstNvJpegEncConvertMain_str = R"(
__device__ inline unsigned char
scale_to_uchar (float val)
{
return (unsigned char) __float2int_rz (val * 255.0);
}
extern "C" {
__global__ void
GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
unsigned char * out_v, int width, int height, int stride)
{
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
if (x_pos >= width || y_pos >= height)
return;
float x = 0;
float y = 0;
if (width > 1)
x = (float) x_pos / (width - 1);
if (height > 1)
y = (float) y_pos / (height - 1);
float2 uv = tex2D<float2> (uv_tex, x, y);
unsigned int pos = x_pos + (y_pos * stride);
out_u[pos] = scale_to_uchar (uv.x);
out_v[pos] = scale_to_uchar (uv.y);
}
}
)";
#endif

View File

@ -1,5 +1,6 @@
conv_source = files('gstcudaconverter.cu')
conv_comm_source = files('gstcudaconverter-unpack.cu')
jpegenc_source = files('gstnvjpegenc.cu')
conv_input_formats = [
'I420',
@ -60,6 +61,7 @@ conv_output_formats = [
header_collector = find_program('collect_ptx_headers.py')
conv_precompiled = []
jpegenc_precompiled = []
opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
arch_opt = get_option('nvcodec-nvcc-arch')
if arch_opt != ''
@ -96,7 +98,26 @@ conv_ptx_collection = custom_target('converter_ptx',
'--output', '@OUTPUT@'
])
ptx_name = 'GstJpegEnc.ptx'
compiled_kernel = custom_target(ptx_name,
input : jpegenc_source,
output : ptx_name,
command : [nvcc] + opt_common + ['@INPUT@'])
jpegenc_precompiled += [compiled_kernel]
jpegenc_ptx_collection = custom_target('jpegenc_ptx',
input : jpegenc_precompiled,
output : 'jpegenc_ptx.h',
command : [header_collector,
'--input', meson.current_build_dir(),
'--prefix', 'GstJpegEnc',
'--name', 'g_precompiled_ptx_table',
'--output', '@OUTPUT@'
])
nvcodec_kernel_precompiled += [
conv_precompiled,
conv_ptx_collection,
jpegenc_precompiled,
jpegenc_ptx_collection,
]