From ae8eef82a690336bdcef080fed49eff1db0bd704 Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Fri, 21 Feb 2025 19:18:01 +0900
Subject: [PATCH] nvjpegenc: Add support for kernel precompile

Port to CUDA precompile/cache

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
---
 .../sys/nvcodec/gstnvjpegenc.cpp              | 127 ++++++++++++------
 .../sys/nvcodec/kernel/gstnvjpegenc.cu        |  78 +++++++++++
 .../sys/nvcodec/kernel/meson.build            |  21 +++
 3 files changed, 182 insertions(+), 44 deletions(-)
 create mode 100644 subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp
index c307cc278f..616e8963c2 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvjpegenc.cpp
@@ -27,6 +27,8 @@
 #include <gmodule.h>
 #include <string>
 #include <mutex>
+#include <unordered_map>
+#include "kernel/gstnvjpegenc.cu"
 
 /**
  * SECTION:element-nvjpegenc
@@ -43,6 +45,18 @@
  *
  */
 
+/* *INDENT-OFF* */
+#ifdef NVCODEC_CUDA_PRECOMPILED
+#include "kernel/jpegenc_ptx.h"
+#else
+static std::unordered_map<std::string, const char *> g_precompiled_ptx_table;
+#endif
+
+static std::unordered_map<std::string, const char *> g_cubin_table;
+static std::unordered_map<std::string, const char *> g_ptx_table;
+static std::mutex g_kernel_table_lock;
+/* *INDENT-ON* */
+
 GST_DEBUG_CATEGORY_STATIC (gst_nv_jpeg_enc_debug);
 #define GST_CAT_DEFAULT gst_nv_jpeg_enc_debug
 
@@ -391,38 +405,6 @@ gst_nv_jpeg_enc_set_context (GstElement * element, GstContext * context)
   GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
 }
 
-#define KERNEL_MAIN_FUNC "gst_nv_jpec_enc_kernel"
-/* *INDENT-OFF* */
-const static gchar kernel_source[] =
-"extern \"C\" {\n"
-"__device__ inline unsigned char\n"
-"scale_to_uchar (float val)\n"
-"{\n"
-"  return (unsigned char) __float2int_rz (val * 255.0);\n"
-"}\n"
-"\n"
-"__global__ void\n"
-KERNEL_MAIN_FUNC "(cudaTextureObject_t uv_tex, unsigned char * out_u,\n"
-"    unsigned char * out_v, int width, int height, int stride)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos >= width || y_pos >= height)\n"
-"    return;\n"
-"  float x = 0;\n"
-"  float y = 0;\n"
-"  if (width > 1)\n"
-"    x = (float) x_pos / (width - 1);\n"
-"  if (height > 1)\n"
-"    y = (float) y_pos / (height - 1);\n"
-"  float2 uv = tex2D<float2> (uv_tex, x, y);\n"
-"  unsigned int pos = x_pos + (y_pos * stride);\n"
-"  out_u[pos] = scale_to_uchar (uv.x);\n"
-"  out_v[pos] = scale_to_uchar (uv.y);\n"
-"}\n"
-"}";
-/* *INDENT-ON* */
-
 static gboolean
 gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
 {
@@ -444,28 +426,81 @@ gst_nv_jpeg_enc_open (GstVideoEncoder * encoder)
   }
 
   if (!priv->module && klass->have_nvrtc) {
-    auto program = gst_cuda_nvrtc_compile_cubin (kernel_source,
-        klass->cuda_device_id);
-    if (!program)
-      program = gst_cuda_nvrtc_compile (kernel_source);
+    const gchar *program = nullptr;
+    auto precompiled = g_precompiled_ptx_table.find ("GstJpegEnc");
+    CUresult ret;
+    if (precompiled != g_precompiled_ptx_table.end ())
+      program = precompiled->second;
 
-    if (!program) {
-      GST_ERROR_OBJECT (self, "Couldn't compile kernel source");
-      gst_cuda_context_pop (nullptr);
-      return FALSE;
+    if (program) {
+      GST_DEBUG_OBJECT (self, "Precompiled PTX available");
+      ret = CuModuleLoadData (&priv->module, program);
+      if (ret != CUDA_SUCCESS) {
+        GST_WARNING_OBJECT (self, "Could not load module from precompiled PTX");
+        priv->module = nullptr;
+        program = nullptr;
+      }
     }
 
-    auto ret = CuModuleLoadData (&priv->module, program);
-    g_free (program);
+    if (!program) {
+      std::lock_guard < std::mutex > lk (g_kernel_table_lock);
+      std::string cubin_kernel_name =
+          "GstJpegEnc_device_" + std::to_string (klass->cuda_device_id);
 
-    if (!gst_cuda_result (ret)) {
+      auto cubin = g_cubin_table.find (cubin_kernel_name);
+      if (cubin == g_cubin_table.end ()) {
+        GST_DEBUG_OBJECT (self, "Building CUBIN");
+        program = gst_cuda_nvrtc_compile_cubin (GstNvJpegEncConvertMain_str,
+            klass->cuda_device_id);
+        if (program)
+          g_cubin_table[cubin_kernel_name] = program;
+      } else {
+        GST_DEBUG_OBJECT (self, "Found cached CUBIN");
+        program = cubin->second;
+      }
+
+      if (program) {
+        GST_DEBUG_OBJECT (self, "Loading CUBIN module");
+        ret = CuModuleLoadData (&priv->module, program);
+        if (ret != CUDA_SUCCESS) {
+          GST_WARNING_OBJECT (self, "Could not load module from cached CUBIN");
+          program = nullptr;
+          priv->module = nullptr;
+        }
+      }
+
+      if (!program) {
+        auto ptx = g_ptx_table.find ("GstJpegEnc");
+        if (ptx == g_ptx_table.end ()) {
+          GST_DEBUG_OBJECT (self, "Building PTX");
+          program = gst_cuda_nvrtc_compile (GstNvJpegEncConvertMain_str);
+          if (program)
+            g_ptx_table["GstJpegEnc"] = program;
+        } else {
+          GST_DEBUG_OBJECT (self, "Found cached PTX");
+          program = ptx->second;
+        }
+      }
+
+      if (program && !priv->module) {
+        GST_DEBUG_OBJECT (self, "Loading PTX module");
+        ret = CuModuleLoadData (&priv->module, program);
+        if (ret != CUDA_SUCCESS) {
+          GST_ERROR_OBJECT (self, "Could not load module from PTX");
+          program = nullptr;
+          priv->module = nullptr;
+        }
+      }
+    }
+
+    if (!priv->module) {
       GST_ERROR_OBJECT (self, "Couldn't load module");
       gst_cuda_context_pop (nullptr);
       return FALSE;
     }
 
     ret = CuModuleGetFunction (&priv->kernel_func, priv->module,
-        KERNEL_MAIN_FUNC);
+        "GstNvJpegEncConvertMain");
     if (!gst_cuda_result (ret)) {
       GST_ERROR_OBJECT (self, "Couldn't get kernel function");
       gst_cuda_context_pop (nullptr);
@@ -1013,6 +1048,10 @@ gst_nv_jpeg_enc_register (GstPlugin * plugin, GstCudaContext * context,
   g_object_get (context, "cuda-device-id", &cuda_device_id, nullptr);
 
   std::string format_string;
+#ifdef NVCODEC_CUDA_PRECOMPILED
+  have_nvrtc = TRUE;
+#endif
+
   if (have_nvrtc)
     format_string = "NV12, I420, Y42B, Y444";
   else
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu
new file mode 100644
index 0000000000..3eacb6abb8
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstnvjpegenc.cu
@@ -0,0 +1,78 @@
+/* GStreamer
+ * Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef __NVCC__
+__device__ inline unsigned char
+scale_to_uchar (float val)
+{
+  return (unsigned char) __float2int_rz (val * 255.0);
+}
+
+extern "C" {
+__global__ void
+GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
+    unsigned char * out_v, int width, int height, int stride)
+{
+  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
+  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
+  if (x_pos >= width || y_pos >= height)
+    return;
+  float x = 0;
+  float y = 0;
+  if (width > 1)
+    x = (float) x_pos / (width - 1);
+  if (height > 1)
+    y = (float) y_pos / (height - 1);
+  float2 uv = tex2D<float2> (uv_tex, x, y);
+  unsigned int pos = x_pos + (y_pos * stride);
+  out_u[pos] = scale_to_uchar (uv.x);
+  out_v[pos] = scale_to_uchar (uv.y);
+}
+}
+#else
+static const gchar *GstNvJpegEncConvertMain_str = R"(
+__device__ inline unsigned char
+scale_to_uchar (float val)
+{
+  return (unsigned char) __float2int_rz (val * 255.0);
+}
+
+extern "C" {
+__global__ void
+GstNvJpegEncConvertMain (cudaTextureObject_t uv_tex, unsigned char * out_u,
+    unsigned char * out_v, int width, int height, int stride)
+{
+  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;
+  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;
+  if (x_pos >= width || y_pos >= height)
+    return;
+  float x = 0;
+  float y = 0;
+  if (width > 1)
+    x = (float) x_pos / (width - 1);
+  if (height > 1)
+    y = (float) y_pos / (height - 1);
+  float2 uv = tex2D<float2> (uv_tex, x, y);
+  unsigned int pos = x_pos + (y_pos * stride);
+  out_u[pos] = scale_to_uchar (uv.x);
+  out_v[pos] = scale_to_uchar (uv.y);
+}
+}
+)";
+#endif
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build
index fd5500c2e2..87cda3b379 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/kernel/meson.build
@@ -1,5 +1,6 @@
 conv_source = files('gstcudaconverter.cu')
 conv_comm_source = files('gstcudaconverter-unpack.cu')
+jpegenc_source = files('gstnvjpegenc.cu')
 
 conv_input_formats = [
   'I420',
@@ -60,6 +61,7 @@ conv_output_formats = [
 header_collector = find_program('collect_ptx_headers.py')
 
 conv_precompiled = []
+jpegenc_precompiled = []
 opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
 arch_opt = get_option('nvcodec-nvcc-arch')
 if arch_opt != ''
@@ -96,7 +98,26 @@ conv_ptx_collection = custom_target('converter_ptx',
         '--output', '@OUTPUT@'
     ])
 
+ptx_name = 'GstJpegEnc.ptx'
+compiled_kernel = custom_target(ptx_name,
+    input : jpegenc_source,
+    output : ptx_name,
+    command : [nvcc] + opt_common + ['@INPUT@'])
+jpegenc_precompiled += [compiled_kernel]
+
+jpegenc_ptx_collection = custom_target('jpegenc_ptx',
+    input : jpegenc_precompiled,
+    output : 'jpegenc_ptx.h',
+    command : [header_collector,
+        '--input', meson.current_build_dir(),
+        '--prefix', 'GstJpegEnc',
+        '--name', 'g_precompiled_ptx_table',
+        '--output', '@OUTPUT@'
+    ])
+
 nvcodec_kernel_precompiled += [
   conv_precompiled,
   conv_ptx_collection,
+  jpegenc_precompiled,
+  jpegenc_ptx_collection,
 ]