hip: Add support for NVIDIA kernel precompile

... with "hip-nvidia-precompile" and "hip-nvcc-arch" build options

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8923>
2025-05-14 14:25:40 +09:00 · 2025-05-14 14:25:40 +09:00 · fc8f7c349b
commit fc8f7c349b
parent eb925e4212
5 changed files with 216 additions and 60 deletions
--- a/subprojects/gst-plugins-bad/meson.options
+++ b/subprojects/gst-plugins-bad/meson.options
@ -285,6 +285,8 @@ option('gpl', type: 'feature', value: 'disabled', yield: true,
 # HIP plugin options
 option('hip-amd-precompile', type : 'feature', value : 'disabled', description : 'Enable HIP kernel precompile for AMD')
 option('hip-hipcc-arch', type : 'string', value : '', description : 'GPU architectur for hipcc --offload-arch option')
+option('hip-nvidia-precompile', type : 'feature', value : 'disabled', description : 'Enable HIP kernel precompile for NVIDIA')
+option('hip-nvcc-arch', type : 'string', value : 'compute_52', description : 'GPU architecture for nvcc -arch option')

 # Common feature options
 option('examples', type : 'feature', value : 'auto', yield : true)
--- a/subprojects/gst-plugins-bad/sys/hip/gsthipconverter.cpp
+++ b/subprojects/gst-plugins-bad/sys/hip/gsthipconverter.cpp
@ -42,6 +42,12 @@
 static std::unordered_map<std::string, const unsigned char *> g_precompiled_hsaco_table;
 #endif

+/* With HIP_NVIDIA_PRECOMPILED the generated header provides
+ * g_precompiled_ptx_table; otherwise an empty table is declared so that the
+ * lookup code compiles unchanged and simply finds nothing. */
+#ifdef HIP_NVIDIA_PRECOMPILED
+#include "kernel/converter_ptx.h"
+#else
+static std::unordered_map<std::string, const char *> g_precompiled_ptx_table;
+#endif
+
 static std::unordered_map<std::string, const char *> g_ptx_table;
 static std::mutex g_kernel_table_lock;
 /* *INDENT-ON* */
@ -1348,17 +1354,24 @@ gst_hip_converter_setup (GstHipConverter * self)
  if (priv->vendor == GST_HIP_VENDOR_AMD) {
    auto kernel_name = kernel_name_base + "_amd";
    auto precompiled = g_precompiled_hsaco_table.find (kernel_name);
-    if (precompiled != g_precompiled_hsaco_table.end ()) {
+    if (precompiled != g_precompiled_hsaco_table.end ())
      program = (const gchar *) precompiled->second;
-      ret = HipModuleLoadData (priv->vendor, &priv->main_module, program);
-      if (ret != hipSuccess) {
-        GST_WARNING_OBJECT (self,
-            "Could not load module from hsaco, ret %d", ret);
-        program = nullptr;
-        priv->main_module = nullptr;
-      } else {
-        GST_DEBUG_OBJECT (self, "Loaded precompiled hsaco");
-      }
+  } else {
+    auto kernel_name = kernel_name_base + "_nvidia";
+    auto precompiled = g_precompiled_ptx_table.find (kernel_name);
+    if (precompiled != g_precompiled_ptx_table.end ())
+      program = precompiled->second;
+  }
+
+  if (program) {
+    ret = HipModuleLoadData (priv->vendor, &priv->main_module, program);
+    if (ret != hipSuccess) {
+      GST_WARNING_OBJECT (self,
+          "Could not load module from precompiled, ret %d", ret);
+      program = nullptr;
+      priv->main_module = nullptr;
+    } else {
+      GST_DEBUG_OBJECT (self, "Loaded precompiled kernel");
    }
  }

@ -1471,17 +1484,24 @@ gst_hip_converter_setup (GstHipConverter * self)
    if (priv->vendor == GST_HIP_VENDOR_AMD) {
      auto kernel_name = unpack_module_name_base + "_amd";
      auto precompiled = g_precompiled_hsaco_table.find (kernel_name);
-      if (precompiled != g_precompiled_hsaco_table.end ()) {
+      if (precompiled != g_precompiled_hsaco_table.end ())
        program = (const gchar *) precompiled->second;
-        ret = HipModuleLoadData (priv->vendor, &priv->unpack_module, program);
-        if (ret != hipSuccess) {
-          GST_WARNING_OBJECT (self,
-              "Could not load module from hsaco, ret %d", ret);
-          program = nullptr;
-          priv->unpack_module = nullptr;
-        } else {
-          GST_DEBUG_OBJECT (self, "Loaded precompiled hsaco");
-        }
+    } else {
+      auto kernel_name = unpack_module_name_base + "_nvidia";
+      auto precompiled = g_precompiled_ptx_table.find (kernel_name);
+      if (precompiled != g_precompiled_ptx_table.end ())
+        program = precompiled->second;
+    }
+
+    if (program) {
+      ret = HipModuleLoadData (priv->vendor, &priv->unpack_module, program);
+      if (ret != hipSuccess) {
+        GST_WARNING_OBJECT (self,
+            "Could not load module from precompiled, ret %d", ret);
+        program = nullptr;
+        priv->unpack_module = nullptr;
+      } else {
+        GST_DEBUG_OBJECT (self, "Loaded precompiled kernel");
      }
    }

--- a/subprojects/gst-plugins-bad/sys/hip/kernel/collect_ptx_headers.py
+++ b/subprojects/gst-plugins-bad/sys/hip/kernel/collect_ptx_headers.py
@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# GStreamer
+# Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+# Boston, MA 02110-1301, USA.
+
+import sys
+import os
+import argparse
+
+# Banner written at the very top of the generated aggregate header.
+start_header = """/*
+ * This file is autogenerated by collect_ptx_headers.py
+ */
+#pragma once
+
+"""
+
+# Written after the per-kernel #include lines: defines the MAKE_BYTECODE helper
+# (kernel name string -> g_<name> variable) and starts the map declaration.
+# main() appends the map variable name and " = {" right after this text.
+start_map = """
+#define MAKE_BYTECODE(name) { G_STRINGIFY (name), g_##name }
+static std::unordered_map<std::string, const char *>
+"""
+
+# Closes the map initializer and undefines the helper macro.
+end_map = """};
+#undef MAKE_BYTECODE
+"""
+
+def convert_ptx_to_header(ptx_file, header_file):
+    with open(ptx_file, 'r', encoding='utf8') as ptx:
+        ptx_content = ptx.read()
+
+    with open(header_file, 'w', newline='\n', encoding='utf8') as header:
+        header.write('#pragma once\n')
+        header.write('// This file is autogenerated by collect_ptx_headers.py\n')
+        header.write(f'static const char* g_{os.path.splitext(os.path.basename(ptx_file))[0]} = R"(\n')
+        header.write(ptx_content)
+        header.write(')";\n\n')
+
+
+def main(args):
+    parser = argparse.ArgumentParser(description='Read CUDA PTX from directory and make single header')
+    parser.add_argument("--input", help="the precompiled CUDA PTX directory")
+    parser.add_argument("--output", help="output header file location")
+    parser.add_argument("--prefix", help="CUDA PTX header filename prefix")
+    parser.add_argument("--name", help="Hash map variable name")
+
+    args = parser.parse_args(args)
+
+    ptx_files = [os.path.join(args.input, file) for file in os.listdir(args.input) if file.startswith(args.prefix) and file.endswith(".ptx") ]
+
+    with open(args.output, 'w', newline='\n', encoding='utf8') as f:
+        f.write(start_header)
+        for ptx_file in ptx_files:
+            header_file = os.path.splitext(ptx_file)[0] + '.h'
+            convert_ptx_to_header(ptx_file, header_file)
+            f.write("#include \"")
+            f.write(os.path.basename(header_file))
+            f.write("\"\n")
+        f.write(start_map)
+        f.write(args.name)
+        f.write(" = {\n")
+        for ptx_file in ptx_files:
+            f.write("  MAKE_BYTECODE ({}),\n".format(os.path.splitext(os.path.basename(ptx_file))[0]))
+        f.write(end_map)
+
+# Script entry point: forward the command line (minus the program name) to
+# main() and use its result as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
--- a/subprojects/gst-plugins-bad/sys/hip/kernel/meson.build
+++ b/subprojects/gst-plugins-bad/sys/hip/kernel/meson.build
@ -57,46 +57,92 @@ conv_output_formats = [
  'GBRA',
 ]

-amd_header_collector = find_program('collect_hsaco_headers.py')
+if have_hipcc
+  amd_header_collector = find_program('collect_hsaco_headers.py')
+  amd_conv_precompiled = []
+  amd_opt_common = ['-w', '--genco', '-c', '@INPUT@', '-o', '@OUTPUT@']
+  amd_arch_opt = get_option('hip-hipcc-arch')
+  if amd_arch_opt != ''
+    amd_opt_common += ['--offload-arch=' + amd_arch_opt]
+  endif

-amd_conv_precompiled = []
-amd_opt_common = ['-w', '--genco', '-c', '@INPUT@', '-o', '@OUTPUT@']
-amd_arch_opt = get_option('hip-hipcc-arch')
-if amd_arch_opt != ''
-  amd_opt_common += ['--offload-arch=' + amd_arch_opt]
+  foreach input_format : conv_input_formats
+    foreach output_format : conv_output_formats
+      hsaco_name = 'GstHipConverterMain_@0@_@1@_amd.hsaco'.format(input_format, output_format)
+      opts = amd_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
+              '-DOUTPUT=Output@0@'.format(output_format)]
+      compiled_kernel = custom_target(hsaco_name,
+          input : conv_source,
+          output : hsaco_name,
+          command : [hipcc] + opts)
+      amd_conv_precompiled += [compiled_kernel]
+    endforeach
+  endforeach
+
+  hsaco_name = 'GstHipConverterUnpack_amd.hsaco'
+  compiled_kernel = custom_target(hsaco_name,
+      input : conv_comm_source,
+      output : hsaco_name,
+      command : [hipcc] + amd_opt_common)
+  amd_conv_precompiled += [compiled_kernel]
+
+  amd_conv_hsaco_collection = custom_target('hip_converter_hsaco',
+      input : amd_conv_precompiled,
+      output : 'converter_hsaco.h',
+      command : [amd_header_collector,
+          '--input', meson.current_build_dir(),
+          '--prefix', 'GstHipConverter',
+          '--name', 'g_precompiled_hsaco_table',
+          '--output', '@OUTPUT@'
+      ])
+
+  hip_amd_precompiled += [
+    amd_conv_precompiled,
+    amd_conv_hsaco_collection,
+  ]
 endif

-foreach input_format : conv_input_formats
-  foreach output_format : conv_output_formats
-    hsaco_name = 'GstHipConverterMain_@0@_@1@_amd.hsaco'.format(input_format, output_format)
-    opts = amd_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
-            '-DOUTPUT=Output@0@'.format(output_format)]
-    compiled_kernel = custom_target(hsaco_name,
-        input : conv_source,
-        output : hsaco_name,
-        command : [hipcc] + opts)
-    amd_conv_precompiled += [compiled_kernel]
+if have_nvcc
+  nvidia_header_collector = find_program('collect_ptx_headers.py')
+  nvidia_conv_precompiled = []
+  nvidia_opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
+  nvidia_arch_opt = get_option('hip-nvcc-arch')
+  if nvidia_arch_opt != ''
+    nvidia_opt_common += ['-arch=' + nvidia_arch_opt]
+  endif
+
+  foreach input_format : conv_input_formats
+    foreach output_format : conv_output_formats
+      ptx_name = 'GstHipConverterMain_@0@_@1@_nvidia.ptx'.format(input_format, output_format)
+      opts = nvidia_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
+              '-DOUTPUT=Output@0@'.format(output_format), '@INPUT@']
+      compiled_kernel = custom_target(ptx_name,
+          input : conv_source,
+          output : ptx_name,
+          command : [nvcc] + opts)
+      nvidia_conv_precompiled += [compiled_kernel]
+    endforeach
  endforeach
-endforeach

-hsaco_name = 'GstHipConverterUnpack_amd.hsaco'
-compiled_kernel = custom_target(hsaco_name,
-    input : conv_comm_source,
-    output : hsaco_name,
-    command : [hipcc] + amd_opt_common)
-amd_conv_precompiled += [compiled_kernel]
+  ptx_name = 'GstHipConverterUnpack_nvidia.ptx'
+  compiled_kernel = custom_target(ptx_name,
+      input : conv_comm_source,
+      output : ptx_name,
+      command : [nvcc] + nvidia_opt_common + ['@INPUT@'])
+  nvidia_conv_precompiled += [compiled_kernel]

-amd_conv_hsaco_collection = custom_target('hip_converter_hsaco',
-    input : amd_conv_precompiled,
-    output : 'converter_hsaco.h',
-    command : [amd_header_collector,
-        '--input', meson.current_build_dir(),
-        '--prefix', 'GstHipConverter',
-        '--name', 'g_precompiled_hsaco_table',
-        '--output', '@OUTPUT@'
-    ])
+  nvidia_conv_ptx_collection = custom_target('hip_converter_ptx',
+      input : nvidia_conv_precompiled,
+      output : 'converter_ptx.h',
+      command : [nvidia_header_collector,
+          '--input', meson.current_build_dir(),
+          '--prefix', 'GstHipConverter',
+          '--name', 'g_precompiled_ptx_table',
+          '--output', '@OUTPUT@'
+      ])

-hip_kernel_amd_precompiled += [
-  amd_conv_precompiled,
-  amd_conv_hsaco_collection,
-]
+  hip_nvidia_precompiled += [
+    nvidia_conv_precompiled,
+    nvidia_conv_ptx_collection,
+  ]
+endif
--- a/subprojects/gst-plugins-bad/sys/hip/meson.build
+++ b/subprojects/gst-plugins-bad/sys/hip/meson.build
@ -27,7 +27,8 @@ extra_args = [
 ]

 extra_deps = []
-hip_kernel_amd_precompiled = []
+hip_amd_precompiled = []
+hip_nvidia_precompiled = []

 hip_option = get_option('hip')
 if hip_option.disabled()
@ -39,7 +40,9 @@ if host_system not in ['linux', 'windows']
 endif

 hip_precompile_amd_opt = get_option('hip-amd-precompile')
+hip_precompile_nvidia_opt = get_option('hip-nvidia-precompile')
 have_hipcc = false
+have_nvcc = false
 if not hip_precompile_amd_opt.disabled() and not meson.is_cross_build()
  if host_system == 'windows'
    hipcc = find_program('hipcc.bin', required: false)
@ -58,9 +61,15 @@ if not hip_precompile_amd_opt.disabled() and not meson.is_cross_build()
  have_hipcc = hipcc.found()
 endif

+if not hip_precompile_nvidia_opt.disabled() and not meson.is_cross_build()
+  nvcc = find_program('nvcc', required: hip_precompile_nvidia_opt)
+  have_nvcc = nvcc.found()
+endif
+
 hip_cdata = configuration_data()
-if have_hipcc
-  hip_cdata.set('HIP_AMD_PRECOMPILED', true)
+if have_hipcc or have_nvcc
+  hip_cdata.set('HIP_AMD_PRECOMPILED', have_hipcc)
+  hip_cdata.set('HIP_NVIDIA_PRECOMPILED', have_nvcc)
  subdir('kernel')
 endif

@ -75,7 +84,7 @@ configure_file(
 )

 hip_incdir = include_directories('./stub')
-gsthip = library('gsthip', hip_sources + hip_kernel_amd_precompiled,
+gsthip = library('gsthip', hip_sources + hip_amd_precompiled + hip_nvidia_precompiled,
  c_args : gst_plugins_bad_args + extra_args,
  cpp_args: gst_plugins_bad_args + extra_args,
  include_directories : [configinc, hip_incdir],