From ae8eef82a690336bdcef080fed49eff1db0bd704 Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Fri, 21 Feb 2025 19:18:01 +0900 Subject: [PATCH] nvjpegenc: Add support for kernel precompile Port to CUDA precompile/cache Part-of: --- .../sys/nvcodec/gstnvjpegenc.cpp | 127 ++++++++++++------ .../sys/nvcodec/kernel/gstnvjpegenc.cu | 78 +++++++++++ .../sys/nvcodec/kernel/meson.build | 21 +++ 3 files changed, 182 insertions(+), 44 deletions(-) create mode 100644 subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp index c307cc278f..616e8963c2 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp @@ -27,6 +27,8 @@ #include #include #include +#include +#include "kernel/gstnvjpegenc.cu" /** * SECTION:element-nvjpegenc @@ -43,6 +45,18 @@ * */ +/* *INDENT-OFF* */ +#ifdef NVCODEC_CUDA_PRECOMPILED +#include "kernel/jpegenc_ptx.h" +#else +static std::unordered_map g_precompiled_ptx_table; +#endif + +static std::unordered_map g_cubin_table; +static std::unordered_map g_ptx_table; +static std::mutex g_kernel_table_lock; +/* *INDENT-ON* */ + GST_DEBUG_CATEGORY_STATIC (gst_nv_jpeg_enc_debug); #define GST_CAT_DEFAULT gst_nv_jpeg_enc_debug @@ -391,38 +405,6 @@ gst_nv_jpeg_enc_set_context (GstElement * element, GstContext * context) GST_ELEMENT_CLASS (parent_class)->set_context (element, context); } -#define KERNEL_MAIN_FUNC "gst_nv_jpec_enc_kernel" -/* *INDENT-OFF* */ -const static gchar kernel_source[] = -"extern \"C\" {\n" -"__device__ inline unsigned char\n" -"scale_to_uchar (float val)\n" -"{\n" -" return (unsigned char) __float2int_rz (val * 255.0);\n" -"}\n" -"\n" -"__global__ void\n" -KERNEL_MAIN_FUNC "(cudaTextureObject_t uv_tex, unsigned char * out_u,\n" -" unsigned char * out_v, int width, int height, int stride)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos >= width || y_pos >= height)\n" -" return;\n" -" float x = 0;\n" -" float y = 0;\n" -" if (width > 1)\n" -" x = (float) x_pos / (width - 1);\n" -" if (height > 1)\n" -" y = (float) y_pos / (height - 1);\n" -" float2 uv = tex2D (uv_tex, x, y);\n" -" unsigned int pos = x_pos + (y_pos * stride);\n" -" out_u[pos] = scale_to_uchar (uv.x);\n" -" out_v[pos] = scale_to_uchar (uv.y);\n" -"}\n" -"}"; -/* *INDENT-ON* */ - static gboolean gst_nv_jpeg_enc_open (GstVideoEncoder * encoder) { @@ -444,28 +426,81 @@ gst_nv_jpeg_enc_open (GstVideoEncoder * encoder) } if (!priv->module && klass->have_nvrtc) { - auto program = gst_cuda_nvrtc_compile_cubin (kernel_source, - klass->cuda_device_id); - if (!program) - program = gst_cuda_nvrtc_compile (kernel_source); + const gchar *program = nullptr; + auto precompiled = g_precompiled_ptx_table.find ("GstJpegEnc"); + CUresult ret; + if (precompiled != g_precompiled_ptx_table.end ()) + program = precompiled->second; - if (!program) { - GST_ERROR_OBJECT (self, "Couldn't compile kernel source"); - gst_cuda_context_pop (nullptr); - return FALSE; + if (program) { + GST_DEBUG_OBJECT (self, "Precompiled PTX available"); + ret = CuModuleLoadData (&priv->module, program); + if (ret != CUDA_SUCCESS) { + GST_WARNING_OBJECT (self, "Could not load module from precompiled PTX"); + priv->module = nullptr; + program = nullptr; + } } - auto ret = CuModuleLoadData (&priv->module, program); - g_free (program); + if (!program) { + std::lock_guard < std::mutex > lk (g_kernel_table_lock); + std::string cubin_kernel_name = + "GstJpegEnc_device_" + std::to_string (klass->cuda_device_id); - if (!gst_cuda_result (ret)) { + auto cubin = g_cubin_table.find (cubin_kernel_name); + if (cubin == g_cubin_table.end ()) { + GST_DEBUG_OBJECT (self, "Building CUBIN"); + program = gst_cuda_nvrtc_compile_cubin (GstNvJpegEncConvertMain_str, + klass->cuda_device_id); + if (program) + g_cubin_table[cubin_kernel_name] = program; + } else { + GST_DEBUG_OBJECT (self, "Found cached CUBIN"); + program = cubin->second; + } + + if (program) { + GST_DEBUG_OBJECT (self, "Loading CUBIN module"); + ret = CuModuleLoadData (&priv->module, program); + if (ret != CUDA_SUCCESS) { + GST_WARNING_OBJECT (self, "Could not load module from cached CUBIN"); + program = nullptr; + priv->module = nullptr; + } + } + + if (!program) { + auto ptx = g_ptx_table.find ("GstJpegEnc"); + if (ptx == g_ptx_table.end ()) { + GST_DEBUG_OBJECT (self, "Building PTX"); + program = gst_cuda_nvrtc_compile (GstNvJpegEncConvertMain_str); + if (program) + g_ptx_table["GstJpegEnc"] = program; + } else { + GST_DEBUG_OBJECT (self, "Found cached PTX"); + program = ptx->second; + } + } + + if (program && !priv->module) { + GST_DEBUG_OBJECT (self, "Loading PTX module"); + ret = CuModuleLoadData (&priv->module, program); + if (ret != CUDA_SUCCESS) { + GST_ERROR_OBJECT (self, "Could not load module from PTX"); + program = nullptr; + priv->module = nullptr; + } + } + } + + if (!priv->module) { GST_ERROR_OBJECT (self, "Couldn't load module"); gst_cuda_context_pop (nullptr); return FALSE; } ret = CuModuleGetFunction (&priv->kernel_func, priv->module, - KERNEL_MAIN_FUNC); + "GstNvJpegEncConvertMain"); if (!gst_cuda_result (ret)) { GST_ERROR_OBJECT (self, "Couldn't get kernel function"); gst_cuda_context_pop (nullptr); @@ -1013,6 +1048,10 @@ gst_nv_jpeg_enc_register (GstPlugin * plugin, GstCudaContext * context, g_object_get (context, "cuda-device-id", &cuda_device_id, nullptr); std::string format_string; +#ifdef NVCODEC_CUDA_PRECOMPILED + have_nvrtc = TRUE; +#endif + if (have_nvrtc) format_string = "NV12, I420, Y42B, Y444"; else diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu new file mode 100644 index 0000000000..3eacb6abb8 --- /dev/null +++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu @@ -0,0 +1,78 @@ +/* GStreamer + * Copyright (C) 2025 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef __NVCC__ +__device__ inline unsigned char +scale_to_uchar (float val) +{ + return (unsigned char) __float2int_rz (val * 255.0); +} + +extern "C" { +__global__ void +GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u, + unsigned char * out_v, int width, int height, int stride) +{ + int x_pos = blockIdx.x * blockDim.x + threadIdx.x; + int y_pos = blockIdx.y * blockDim.y + threadIdx.y; + if (x_pos >= width || y_pos >= height) + return; + float x = 0; + float y = 0; + if (width > 1) + x = (float) x_pos / (width - 1); + if (height > 1) + y = (float) y_pos / (height - 1); + float2 uv = tex2D (uv_tex, x, y); + unsigned int pos = x_pos + (y_pos * stride); + out_u[pos] = scale_to_uchar (uv.x); + out_v[pos] = scale_to_uchar (uv.y); +} +} +#else +static const gchar *GstNvJpegEncConvertMain_str = R"( +__device__ inline unsigned char +scale_to_uchar (float val) +{ + return (unsigned char) __float2int_rz (val * 255.0); +} + +extern "C" { +__global__ void +GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u, + unsigned char * out_v, int width, int height, int stride) +{ + int x_pos = blockIdx.x * blockDim.x + threadIdx.x; + int y_pos = blockIdx.y * blockDim.y + threadIdx.y; + if (x_pos >= width || y_pos >= height) + return; + float x = 0; + float y = 0; + if (width > 1) + x = (float) x_pos / (width - 1); + if (height > 1) + y = (float) y_pos / (height - 1); + float2 uv = tex2D (uv_tex, x, y); + unsigned int pos = x_pos + (y_pos * stride); + out_u[pos] = scale_to_uchar (uv.x); + out_v[pos] = scale_to_uchar (uv.y); +} +} +)"; +#endif diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build index fd5500c2e2..87cda3b379 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build +++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build @@ -1,5 +1,6 @@ conv_source = files('gstcudaconverter.cu') conv_comm_source = files('gstcudaconverter-unpack.cu') +jpegenc_source = files('gstnvjpegenc.cu') conv_input_formats = [ 'I420', @@ -60,6 +61,7 @@ conv_output_formats = [ header_collector = find_program('collect_ptx_headers.py') conv_precompiled = [] +jpegenc_precompiled = [] opt_common = ['-ptx', '-w', '-o', '@OUTPUT@'] arch_opt = get_option('nvcodec-nvcc-arch') if arch_opt != '' @@ -96,7 +98,26 @@ conv_ptx_collection = custom_target('converter_ptx', '--output', '@OUTPUT@' ]) +ptx_name = 'GstJpegEnc.ptx' +compiled_kernel = custom_target(ptx_name, + input : jpegenc_source, + output : ptx_name, + command : [nvcc] + opt_common + ['@INPUT@']) +jpegenc_precompiled += [compiled_kernel] + +jpegenc_ptx_collection = custom_target('jpegenc_ptx', + input : jpegenc_precompiled, + output : 'jpegenc_ptx.h', + command : [header_collector, + '--input', meson.current_build_dir(), + '--prefix', 'GstJpegEnc', + '--name', 'g_precompiled_ptx_table', + '--output', '@OUTPUT@' + ]) + nvcodec_kernel_precompiled += [ conv_precompiled, conv_ptx_collection, + jpegenc_precompiled, + jpegenc_ptx_collection, ]