nvjpegenc: Add support for kernel precompile
Port to CUDA precompile/cache Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
This commit is contained in:
parent
697cfe38ef
commit
ae8eef82a6
@ -27,6 +27,8 @@
|
||||
#include <gmodule.h>
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include "kernel/gstnvjpegenc.cu"
|
||||
|
||||
/**
|
||||
* SECTION:element-nvjpegenc
|
||||
@ -43,6 +45,18 @@
|
||||
*
|
||||
*/
|
||||
|
||||
/* *INDENT-OFF* */
|
||||
#ifdef NVCODEC_CUDA_PRECOMPILED
|
||||
#include "kernel/jpegenc_ptx.h"
|
||||
#else
|
||||
static std::unordered_map<std::string, const char *> g_precompiled_ptx_table;
|
||||
#endif
|
||||
|
||||
static std::unordered_map<std::string, const char *> g_cubin_table;
|
||||
static std::unordered_map<std::string, const char *> g_ptx_table;
|
||||
static std::mutex g_kernel_table_lock;
|
||||
/* *INDENT-ON* */
|
||||
|
||||
GST_DEBUG_CATEGORY_STATIC (gst_nv_jpeg_enc_debug);
|
||||
#define GST_CAT_DEFAULT gst_nv_jpeg_enc_debug
|
||||
|
||||
@ -391,38 +405,6 @@ gst_nv_jpeg_enc_set_context (GstElement * element, GstContext * context)
|
||||
GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
|
||||
}
|
||||
|
||||
#define KERNEL_MAIN_FUNC "gst_nv_jpec_enc_kernel"
|
||||
/* *INDENT-OFF* */
|
||||
const static gchar kernel_source[] =
|
||||
"extern \"C\" {\n"
|
||||
"__device__ inline unsigned char\n"
|
||||
"scale_to_uchar (float val)\n"
|
||||
"{\n"
|
||||
" return (unsigned char) __float2int_rz (val * 255.0);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__global__ void\n"
|
||||
KERNEL_MAIN_FUNC "(cudaTextureObject_t uv_tex, unsigned char * out_u,\n"
|
||||
" unsigned char * out_v, int width, int height, int stride)\n"
|
||||
"{\n"
|
||||
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
|
||||
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
|
||||
" if (x_pos >= width || y_pos >= height)\n"
|
||||
" return;\n"
|
||||
" float x = 0;\n"
|
||||
" float y = 0;\n"
|
||||
" if (width > 1)\n"
|
||||
" x = (float) x_pos / (width - 1);\n"
|
||||
" if (height > 1)\n"
|
||||
" y = (float) y_pos / (height - 1);\n"
|
||||
" float2 uv = tex2D<float2> (uv_tex, x, y);\n"
|
||||
" unsigned int pos = x_pos + (y_pos * stride);\n"
|
||||
" out_u[pos] = scale_to_uchar (uv.x);\n"
|
||||
" out_v[pos] = scale_to_uchar (uv.y);\n"
|
||||
"}\n"
|
||||
"}";
|
||||
/* *INDENT-ON* */
|
||||
|
||||
static gboolean
|
||||
gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
|
||||
{
|
||||
@ -444,28 +426,81 @@ gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
|
||||
}
|
||||
|
||||
if (!priv->module && klass->have_nvrtc) {
|
||||
auto program = gst_cuda_nvrtc_compile_cubin (kernel_source,
|
||||
klass->cuda_device_id);
|
||||
if (!program)
|
||||
program = gst_cuda_nvrtc_compile (kernel_source);
|
||||
const gchar *program = nullptr;
|
||||
auto precompiled = g_precompiled_ptx_table.find ("GstJpegEnc");
|
||||
CUresult ret;
|
||||
if (precompiled != g_precompiled_ptx_table.end ())
|
||||
program = precompiled->second;
|
||||
|
||||
if (!program) {
|
||||
GST_ERROR_OBJECT (self, "Couldn't compile kernel source");
|
||||
gst_cuda_context_pop (nullptr);
|
||||
return FALSE;
|
||||
if (program) {
|
||||
GST_DEBUG_OBJECT (self, "Precompiled PTX available");
|
||||
ret = CuModuleLoadData (&priv->module, program);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
GST_WARNING_OBJECT (self, "Could not load module from precompiled PTX");
|
||||
priv->module = nullptr;
|
||||
program = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = CuModuleLoadData (&priv->module, program);
|
||||
g_free (program);
|
||||
if (!program) {
|
||||
std::lock_guard < std::mutex > lk (g_kernel_table_lock);
|
||||
std::string cubin_kernel_name =
|
||||
"GstJpegEnc_device_" + std::to_string (klass->cuda_device_id);
|
||||
|
||||
if (!gst_cuda_result (ret)) {
|
||||
auto cubin = g_cubin_table.find (cubin_kernel_name);
|
||||
if (cubin == g_cubin_table.end ()) {
|
||||
GST_DEBUG_OBJECT (self, "Building CUBIN");
|
||||
program = gst_cuda_nvrtc_compile_cubin (GstNvJpegEncConvertMain_str,
|
||||
klass->cuda_device_id);
|
||||
if (program)
|
||||
g_cubin_table[cubin_kernel_name] = program;
|
||||
} else {
|
||||
GST_DEBUG_OBJECT (self, "Found cached CUBIN");
|
||||
program = cubin->second;
|
||||
}
|
||||
|
||||
if (program) {
|
||||
GST_DEBUG_OBJECT (self, "Loading CUBIN module");
|
||||
ret = CuModuleLoadData (&priv->module, program);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
GST_WARNING_OBJECT (self, "Could not load module from cached CUBIN");
|
||||
program = nullptr;
|
||||
priv->module = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (!program) {
|
||||
auto ptx = g_ptx_table.find ("GstJpegEnc");
|
||||
if (ptx == g_ptx_table.end ()) {
|
||||
GST_DEBUG_OBJECT (self, "Building PTX");
|
||||
program = gst_cuda_nvrtc_compile (GstNvJpegEncConvertMain_str);
|
||||
if (program)
|
||||
g_ptx_table["GstJpegEnc"] = program;
|
||||
} else {
|
||||
GST_DEBUG_OBJECT (self, "Found cached PTX");
|
||||
program = ptx->second;
|
||||
}
|
||||
}
|
||||
|
||||
if (program && !priv->module) {
|
||||
GST_DEBUG_OBJECT (self, "Loading PTX module");
|
||||
ret = CuModuleLoadData (&priv->module, program);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
GST_ERROR_OBJECT (self, "Could not load module from PTX");
|
||||
program = nullptr;
|
||||
priv->module = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!priv->module) {
|
||||
GST_ERROR_OBJECT (self, "Couldn't load module");
|
||||
gst_cuda_context_pop (nullptr);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
ret = CuModuleGetFunction (&priv->kernel_func, priv->module,
|
||||
KERNEL_MAIN_FUNC);
|
||||
"GstNvJpegEncConvertMain");
|
||||
if (!gst_cuda_result (ret)) {
|
||||
GST_ERROR_OBJECT (self, "Couldn't get kernel function");
|
||||
gst_cuda_context_pop (nullptr);
|
||||
@ -1013,6 +1048,10 @@ gst_nv_jpeg_enc_register (GstPlugin * plugin, GstCudaContext * context,
|
||||
g_object_get (context, "cuda-device-id", &cuda_device_id, nullptr);
|
||||
|
||||
std::string format_string;
|
||||
#ifdef NVCODEC_CUDA_PRECOMPILED
|
||||
have_nvrtc = TRUE;
|
||||
#endif
|
||||
|
||||
if (have_nvrtc)
|
||||
format_string = "NV12, I420, Y42B, Y444";
|
||||
else
|
||||
|
@ -0,0 +1,78 @@
|
||||
/* GStreamer
|
||||
* Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Library General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Library General Public
|
||||
* License along with this library; if not, write to the
|
||||
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
||||
* Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#ifdef __NVCC__
|
||||
__device__ inline unsigned char
|
||||
scale_to_uchar (float val)
|
||||
{
|
||||
return (unsigned char) __float2int_rz (val * 255.0);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
__global__ void
|
||||
GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
|
||||
unsigned char * out_v, int width, int height, int stride)
|
||||
{
|
||||
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (x_pos >= width || y_pos >= height)
|
||||
return;
|
||||
float x = 0;
|
||||
float y = 0;
|
||||
if (width > 1)
|
||||
x = (float) x_pos / (width - 1);
|
||||
if (height > 1)
|
||||
y = (float) y_pos / (height - 1);
|
||||
float2 uv = tex2D<float2> (uv_tex, x, y);
|
||||
unsigned int pos = x_pos + (y_pos * stride);
|
||||
out_u[pos] = scale_to_uchar (uv.x);
|
||||
out_v[pos] = scale_to_uchar (uv.y);
|
||||
}
|
||||
}
|
||||
#else
|
||||
static const gchar *GstNvJpegEncConvertMain_str = R"(
|
||||
__device__ inline unsigned char
|
||||
scale_to_uchar (float val)
|
||||
{
|
||||
return (unsigned char) __float2int_rz (val * 255.0);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
__global__ void
|
||||
GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
|
||||
unsigned char * out_v, int width, int height, int stride)
|
||||
{
|
||||
int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (x_pos >= width || y_pos >= height)
|
||||
return;
|
||||
float x = 0;
|
||||
float y = 0;
|
||||
if (width > 1)
|
||||
x = (float) x_pos / (width - 1);
|
||||
if (height > 1)
|
||||
y = (float) y_pos / (height - 1);
|
||||
float2 uv = tex2D<float2> (uv_tex, x, y);
|
||||
unsigned int pos = x_pos + (y_pos * stride);
|
||||
out_u[pos] = scale_to_uchar (uv.x);
|
||||
out_v[pos] = scale_to_uchar (uv.y);
|
||||
}
|
||||
}
|
||||
)";
|
||||
#endif
|
@ -1,5 +1,6 @@
|
||||
conv_source = files('gstcudaconverter.cu')
|
||||
conv_comm_source = files('gstcudaconverter-unpack.cu')
|
||||
jpegenc_source = files('gstnvjpegenc.cu')
|
||||
|
||||
conv_input_formats = [
|
||||
'I420',
|
||||
@ -60,6 +61,7 @@ conv_output_formats = [
|
||||
header_collector = find_program('collect_ptx_headers.py')
|
||||
|
||||
conv_precompiled = []
|
||||
jpegenc_precompiled = []
|
||||
opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
|
||||
arch_opt = get_option('nvcodec-nvcc-arch')
|
||||
if arch_opt != ''
|
||||
@ -96,7 +98,26 @@ conv_ptx_collection = custom_target('converter_ptx',
|
||||
'--output', '@OUTPUT@'
|
||||
])
|
||||
|
||||
ptx_name = 'GstJpegEnc.ptx'
|
||||
compiled_kernel = custom_target(ptx_name,
|
||||
input : jpegenc_source,
|
||||
output : ptx_name,
|
||||
command : [nvcc] + opt_common + ['@INPUT@'])
|
||||
jpegenc_precompiled += [compiled_kernel]
|
||||
|
||||
jpegenc_ptx_collection = custom_target('jpegenc_ptx',
|
||||
input : jpegenc_precompiled,
|
||||
output : 'jpegenc_ptx.h',
|
||||
command : [header_collector,
|
||||
'--input', meson.current_build_dir(),
|
||||
'--prefix', 'GstJpegEnc',
|
||||
'--name', 'g_precompiled_ptx_table',
|
||||
'--output', '@OUTPUT@'
|
||||
])
|
||||
|
||||
nvcodec_kernel_precompiled += [
|
||||
conv_precompiled,
|
||||
conv_ptx_collection,
|
||||
jpegenc_precompiled,
|
||||
jpegenc_ptx_collection,
|
||||
]
|
||||
|
Loading…
x
Reference in New Issue
Block a user