hip: Add support for NVIDIA kernel precompile

... with "hip-nvidia-precompile" and "hip-nvcc-arch" build options

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8923>
This commit is contained in:
Seungha Yang 2025-05-14 14:25:40 +09:00
parent eb925e4212
commit fc8f7c349b
5 changed files with 216 additions and 60 deletions

View File

@ -285,6 +285,8 @@ option('gpl', type: 'feature', value: 'disabled', yield: true,
# HIP plugin options
# Kernel precompile is opt-in per GPU vendor; the matching *-arch string selects
# the target GPU architecture handed to that vendor's compiler.
option('hip-amd-precompile', type : 'feature', value : 'disabled', description : 'Enable HIP kernel precompile for AMD')
option('hip-hipcc-arch', type : 'string', value : '', description : 'GPU architecture for hipcc --offload-arch option')
option('hip-nvidia-precompile', type : 'feature', value : 'disabled', description : 'Enable HIP kernel precompile for NVIDIA')
option('hip-nvcc-arch', type : 'string', value : 'compute_52', description : 'GPU architecture for nvcc -arch option')
# Common feature options
option('examples', type : 'feature', value : 'auto', yield : true)

View File

@ -42,6 +42,12 @@
static std::unordered_map<std::string, const unsigned char *> g_precompiled_hsaco_table;
#endif
#ifdef HIP_NVIDIA_PRECOMPILED
#include "kernel/converter_ptx.h"
#else
static std::unordered_map<std::string, const char *> g_precompiled_ptx_table;
#endif
static std::unordered_map<std::string, const char *> g_ptx_table;
static std::mutex g_kernel_table_lock;
/* *INDENT-ON* */
@ -1348,17 +1354,24 @@ gst_hip_converter_setup (GstHipConverter * self)
if (priv->vendor == GST_HIP_VENDOR_AMD) {
auto kernel_name = kernel_name_base + "_amd";
auto precompiled = g_precompiled_hsaco_table.find (kernel_name);
if (precompiled != g_precompiled_hsaco_table.end ()) {
if (precompiled != g_precompiled_hsaco_table.end ())
program = (const gchar *) precompiled->second;
ret = HipModuleLoadData (priv->vendor, &priv->main_module, program);
if (ret != hipSuccess) {
GST_WARNING_OBJECT (self,
"Could not load module from hsaco, ret %d", ret);
program = nullptr;
priv->main_module = nullptr;
} else {
GST_DEBUG_OBJECT (self, "Loaded precompiled hsaco");
}
} else {
auto kernel_name = kernel_name_base + "_nvidia";
auto precompiled = g_precompiled_ptx_table.find (kernel_name);
if (precompiled != g_precompiled_ptx_table.end ())
program = precompiled->second;
}
if (program) {
ret = HipModuleLoadData (priv->vendor, &priv->main_module, program);
if (ret != hipSuccess) {
GST_WARNING_OBJECT (self,
"Could not load module from precompiled, ret %d", ret);
program = nullptr;
priv->main_module = nullptr;
} else {
GST_DEBUG_OBJECT (self, "Loaded precompiled kernel");
}
}
@ -1471,17 +1484,24 @@ gst_hip_converter_setup (GstHipConverter * self)
if (priv->vendor == GST_HIP_VENDOR_AMD) {
auto kernel_name = unpack_module_name_base + "_amd";
auto precompiled = g_precompiled_hsaco_table.find (kernel_name);
if (precompiled != g_precompiled_hsaco_table.end ()) {
if (precompiled != g_precompiled_hsaco_table.end ())
program = (const gchar *) precompiled->second;
ret = HipModuleLoadData (priv->vendor, &priv->unpack_module, program);
if (ret != hipSuccess) {
GST_WARNING_OBJECT (self,
"Could not load module from hsaco, ret %d", ret);
program = nullptr;
priv->unpack_module = nullptr;
} else {
GST_DEBUG_OBJECT (self, "Loaded precompiled hsaco");
}
} else {
auto kernel_name = unpack_module_name_base + "_nvidia";
auto precompiled = g_precompiled_ptx_table.find (kernel_name);
if (precompiled != g_precompiled_ptx_table.end ())
program = precompiled->second;
}
if (program) {
ret = HipModuleLoadData (priv->vendor, &priv->unpack_module, program);
if (ret != hipSuccess) {
GST_WARNING_OBJECT (self,
"Could not load module from precompiled, ret %d", ret);
program = nullptr;
priv->unpack_module = nullptr;
} else {
GST_DEBUG_OBJECT (self, "Loaded precompiled kernel");
}
}

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python3
# GStreamer
# Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301, USA.
import sys
import os
import argparse
# Text written at the very top of the generated collection header: an
# "autogenerated" banner followed by the include guard.
start_header = """/*
* This file is autogenerated by collect_ptx_headers.py
*/
#pragma once
"""
# Text written after the per-kernel #include lines. It defines the
# MAKE_BYTECODE helper macro (maps a kernel name to its g_<name> string) and
# begins the declaration of the lookup table; main() appends the table's
# variable name and the opening " = {" right after this text.
start_map = """
#define MAKE_BYTECODE(name) { G_STRINGIFY (name), g_##name }
static std::unordered_map<std::string, const char *>
"""
# Text written last: closes the table initializer and undefines the helper
# macro so it does not leak into the including source file.
end_map = """};
#undef MAKE_BYTECODE
"""
def convert_ptx_to_header(ptx_file, header_file):
    """Wrap the text of one PTX file into a C++ header.

    The header declares ``static const char* g_<stem>`` where ``<stem>`` is
    the PTX file name without directory and extension, initialised with the
    PTX text as a C++ raw string literal.

    ptx_file: path of the PTX file to read (UTF-8).
    header_file: path of the header to create or overwrite (UTF-8, LF line
        endings).
    """
    symbol_stem = os.path.splitext(os.path.basename(ptx_file))[0]

    with open(ptx_file, 'r', encoding='utf8') as source:
        ptx_text = source.read()

    pieces = (
        '#pragma once\n',
        '// This file is autogenerated by collect_ptx_headers.py\n',
        f'static const char* g_{symbol_stem} = R"(\n',
        ptx_text,
        ')";\n\n',
    )
    with open(header_file, 'w', newline='\n', encoding='utf8') as target:
        target.write(''.join(pieces))
def main(args):
    """Generate per-kernel PTX headers and one collection header.

    For every ``<prefix>*.ptx`` file found in the input directory, a sibling
    ``.h`` file is written next to it via convert_ptx_to_header(). The output
    header then includes each of those files by base name and defines an
    unordered_map, named by ``--name``, from kernel name to PTX string.

    args: command line arguments without the program name. ``--input``,
        ``--output``, ``--prefix`` and ``--name`` are all mandatory; argparse
        exits with a usage error when one is missing.
    """
    parser = argparse.ArgumentParser(description='Read CUDA PTX from directory and make single header')
    parser.add_argument("--input", required=True, help="the precompiled CUDA PTX directory")
    parser.add_argument("--output", required=True, help="output header file location")
    parser.add_argument("--prefix", required=True, help="CUDA PTX header filename prefix")
    parser.add_argument("--name", required=True, help="Hash map variable name")
    opts = parser.parse_args(args)

    # os.listdir() returns entries in arbitrary order; sort so the generated
    # header is identical from one build to the next.
    ptx_files = sorted(
        os.path.join(opts.input, entry)
        for entry in os.listdir(opts.input)
        if entry.startswith(opts.prefix) and entry.endswith(".ptx")
    )

    with open(opts.output, 'w', newline='\n', encoding='utf8') as f:
        f.write(start_header)
        for ptx_file in ptx_files:
            header_file = os.path.splitext(ptx_file)[0] + '.h'
            convert_ptx_to_header(ptx_file, header_file)
            # Included by base name: the per-kernel headers sit next to the
            # PTX files in the input directory.
            f.write("#include \"")
            f.write(os.path.basename(header_file))
            f.write("\"\n")
        f.write(start_map)
        f.write(opts.name)
        f.write(" = {\n")
        for ptx_file in ptx_files:
            f.write(" MAKE_BYTECODE ({}),\n".format(os.path.splitext(os.path.basename(ptx_file))[0]))
        f.write(end_map)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

View File

@ -57,46 +57,92 @@ conv_output_formats = [
'GBRA',
]
amd_header_collector = find_program('collect_hsaco_headers.py')
if have_hipcc
amd_header_collector = find_program('collect_hsaco_headers.py')
amd_conv_precompiled = []
amd_opt_common = ['-w', '--genco', '-c', '@INPUT@', '-o', '@OUTPUT@']
amd_arch_opt = get_option('hip-hipcc-arch')
if amd_arch_opt != ''
amd_opt_common += ['--offload-arch=' + amd_arch_opt]
endif
amd_conv_precompiled = []
amd_opt_common = ['-w', '--genco', '-c', '@INPUT@', '-o', '@OUTPUT@']
amd_arch_opt = get_option('hip-hipcc-arch')
if amd_arch_opt != ''
amd_opt_common += ['--offload-arch=' + amd_arch_opt]
foreach input_format : conv_input_formats
foreach output_format : conv_output_formats
hsaco_name = 'GstHipConverterMain_@0@_@1@_amd.hsaco'.format(input_format, output_format)
opts = amd_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
'-DOUTPUT=Output@0@'.format(output_format)]
compiled_kernel = custom_target(hsaco_name,
input : conv_source,
output : hsaco_name,
command : [hipcc] + opts)
amd_conv_precompiled += [compiled_kernel]
endforeach
endforeach
hsaco_name = 'GstHipConverterUnpack_amd.hsaco'
compiled_kernel = custom_target(hsaco_name,
input : conv_comm_source,
output : hsaco_name,
command : [hipcc] + amd_opt_common)
amd_conv_precompiled += [compiled_kernel]
amd_conv_hsaco_collection = custom_target('hip_converter_hsaco',
input : amd_conv_precompiled,
output : 'converter_hsaco.h',
command : [amd_header_collector,
'--input', meson.current_build_dir(),
'--prefix', 'GstHipConverter',
'--name', 'g_precompiled_hsaco_table',
'--output', '@OUTPUT@'
])
hip_amd_precompiled += [
amd_conv_precompiled,
amd_conv_hsaco_collection,
]
endif
foreach input_format : conv_input_formats
foreach output_format : conv_output_formats
hsaco_name = 'GstHipConverterMain_@0@_@1@_amd.hsaco'.format(input_format, output_format)
opts = amd_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
'-DOUTPUT=Output@0@'.format(output_format)]
compiled_kernel = custom_target(hsaco_name,
input : conv_source,
output : hsaco_name,
command : [hipcc] + opts)
amd_conv_precompiled += [compiled_kernel]
if have_nvcc
nvidia_header_collector = find_program('collect_ptx_headers.py')
nvidia_conv_precompiled = []
nvidia_opt_common = ['-ptx', '-w', '-o', '@OUTPUT@']
nvidia_arch_opt = get_option('hip-nvcc-arch')
if nvidia_arch_opt != ''
nvidia_opt_common += ['-arch=' + nvidia_arch_opt]
endif
foreach input_format : conv_input_formats
foreach output_format : conv_output_formats
ptx_name = 'GstHipConverterMain_@0@_@1@_nvidia.ptx'.format(input_format, output_format)
opts = nvidia_opt_common + ['-DSAMPLER=Sample@0@'.format(input_format),
'-DOUTPUT=Output@0@'.format(output_format), '@INPUT@']
compiled_kernel = custom_target(ptx_name,
input : conv_source,
output : ptx_name,
command : [nvcc] + opts)
nvidia_conv_precompiled += [compiled_kernel]
endforeach
endforeach
endforeach
hsaco_name = 'GstHipConverterUnpack_amd.hsaco'
compiled_kernel = custom_target(hsaco_name,
input : conv_comm_source,
output : hsaco_name,
command : [hipcc] + amd_opt_common)
amd_conv_precompiled += [compiled_kernel]
ptx_name = 'GstHipConverterUnpack_nvidia.ptx'
compiled_kernel = custom_target(ptx_name,
input : conv_comm_source,
output : ptx_name,
command : [nvcc] + nvidia_opt_common + ['@INPUT@'])
nvidia_conv_precompiled += [compiled_kernel]
amd_conv_hsaco_collection = custom_target('hip_converter_hsaco',
input : amd_conv_precompiled,
output : 'converter_hsaco.h',
command : [amd_header_collector,
'--input', meson.current_build_dir(),
'--prefix', 'GstHipConverter',
'--name', 'g_precompiled_hsaco_table',
'--output', '@OUTPUT@'
])
nvidia_conv_ptx_collection = custom_target('hip_converter_ptx',
input : nvidia_conv_precompiled,
output : 'converter_ptx.h',
command : [nvidia_header_collector,
'--input', meson.current_build_dir(),
'--prefix', 'GstHipConverter',
'--name', 'g_precompiled_ptx_table',
'--output', '@OUTPUT@'
])
hip_kernel_amd_precompiled += [
amd_conv_precompiled,
amd_conv_hsaco_collection,
]
hip_nvidia_precompiled += [
nvidia_conv_precompiled,
nvidia_conv_ptx_collection,
]
endif

View File

@ -27,7 +27,8 @@ extra_args = [
]
extra_deps = []
hip_kernel_amd_precompiled = []
hip_amd_precompiled = []
hip_nvidia_precompiled = []
hip_option = get_option('hip')
if hip_option.disabled()
@ -39,7 +40,9 @@ if host_system not in ['linux', 'windows']
endif
hip_precompile_amd_opt = get_option('hip-amd-precompile')
hip_precompile_nvidia_opt = get_option('hip-nvidia-precompile')
have_hipcc = false
have_nvcc = false
if not hip_precompile_amd_opt.disabled() and not meson.is_cross_build()
if host_system == 'windows'
hipcc = find_program('hipcc.bin', required: false)
@ -58,9 +61,15 @@ if not hip_precompile_amd_opt.disabled() and not meson.is_cross_build()
have_hipcc = hipcc.found()
endif
if not hip_precompile_nvidia_opt.disabled() and not meson.is_cross_build()
nvcc = find_program('nvcc', required: hip_precompile_nvidia_opt)
have_nvcc = nvcc.found()
endif
hip_cdata = configuration_data()
if have_hipcc
hip_cdata.set('HIP_AMD_PRECOMPILED', true)
if have_hipcc or have_nvcc
hip_cdata.set('HIP_AMD_PRECOMPILED', have_hipcc)
hip_cdata.set('HIP_NVIDIA_PRECOMPILED', have_nvcc)
subdir('kernel')
endif
@ -75,7 +84,7 @@ configure_file(
)
hip_incdir = include_directories('./stub')
gsthip = library('gsthip', hip_sources + hip_kernel_amd_precompiled,
gsthip = library('gsthip', hip_sources + hip_amd_precompiled + hip_nvidia_precompiled,
c_args : gst_plugins_bad_args + extra_args,
cpp_args: gst_plugins_bad_args + extra_args,
include_directories : [configinc, hip_incdir],