/* GStreamer * Copyright (C) 2025 Seungha Yang * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301, USA. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "gsthip-config.h" #include "gsthip.h" #include "gsthiploader.h" #include #include #include #include #ifdef HAVE_GST_GL #include "gsthiploader-gl.h" #include #endif #ifndef GST_DISABLE_GST_DEBUG #define GST_CAT_DEFAULT ensure_debug_category() static GstDebugCategory * ensure_debug_category (void) { static GstDebugCategory *cat = nullptr; static std::once_flag once; std::call_once (once,[&] { cat = _gst_debug_category_new ("hiploader", 0, "hiploader"); }); return cat; } #endif /* *INDENT-OFF* */ struct GstHipFuncTableAmd { gboolean loaded = FALSE; hipError_t (*hipInit) (unsigned int flags); hipError_t (*hipDriverGetVersion) (int *driverVersion); hipError_t (*hipRuntimeGetVersion) (int *runtimeVersion); const char *(*hipGetErrorName) (hipError_t hip_error); const char *(*hipGetErrorString) (hipError_t hipError); hipError_t (*hipGetDeviceCount) (int *count); hipError_t (*hipGetDeviceProperties) (hipDeviceProp_t * prop, int deviceId); hipError_t (*hipDeviceGetAttribute) (int *pi, hipDeviceAttribute_t attr, int deviceId); hipError_t (*hipSetDevice) (int deviceId); hipError_t (*hipMalloc) (void **ptr, size_t size); hipError_t (*hipFree) (void *ptr); 
hipError_t (*hipHostMalloc) (void **ptr, size_t size, unsigned int flags); hipError_t (*hipHostFree) (void *ptr); hipError_t (*hipStreamCreate) (hipStream_t* stream); hipError_t (*hipStreamDestroy) (hipStream_t stream); hipError_t (*hipStreamSynchronize) (hipStream_t stream); hipError_t (*hipEventCreateWithFlags) (hipEvent_t* event, unsigned flags); hipError_t (*hipEventRecord) (hipEvent_t event, hipStream_t stream); hipError_t (*hipEventDestroy) (hipEvent_t event); hipError_t (*hipEventSynchronize) (hipEvent_t event); hipError_t (*hipEventQuery) (hipEvent_t event); hipError_t (*hipModuleLoadData) (hipModule_t * module, const void *image); hipError_t (*hipModuleUnload) (hipModule_t module); hipError_t (*hipModuleGetFunction) (hipFunction_t * function, hipModule_t module, const char *kname); hipError_t (*hipModuleLaunchKernel) (hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t stream, void **kernelParams, void **extra); hipError_t (*hipMemcpyParam2DAsync) (const hip_Memcpy2D * pCopy, hipStream_t stream); hipError_t (*hipMemsetD8Async) (hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream); hipError_t (*hipMemsetD16Async) (hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream); hipError_t (*hipMemsetD32Async) (hipDeviceptr_t dst, int value, size_t count, hipStream_t stream); hipError_t (*hipTexObjectCreate) (hipTextureObject_t * pTexObject, const HIP_RESOURCE_DESC * pResDesc, const HIP_TEXTURE_DESC * pTexDesc, const HIP_RESOURCE_VIEW_DESC * pResViewDesc); hipError_t (*hipTexObjectDestroy) (hipTextureObject_t texObject); hipError_t (*hipGraphicsMapResources) (int count, hipGraphicsResource_t* resources, hipStream_t stream); hipError_t (*hipGraphicsResourceGetMappedPointer) (void** devPtr, size_t* size, hipGraphicsResource_t resource); hipError_t (*hipGraphicsUnmapResources) (int 
count, hipGraphicsResource_t* resources, hipStream_t stream); hipError_t (*hipGraphicsUnregisterResource) (hipGraphicsResource_t resource); #ifdef HAVE_GST_GL hipError_t (*hipGLGetDevices) (unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount, hipGLDeviceList deviceList); hipError_t (*hipGraphicsGLRegisterBuffer) (hipGraphicsResource** resource, unsigned int buffer, unsigned int flags); #endif }; struct GstHipFuncTableCuda { gboolean loaded = FALSE; CUresult (CUDAAPI *cuInit) (unsigned int flags); CUresult (CUDAAPI *cuDriverGetVersion) (int *driverVersion); CUresult (CUDAAPI *cuDeviceGetAttribute) (int *pi, CUdevice_attribute attrib, CUdevice dev); CUresult (CUDAAPI *cuModuleLoadData) (CUmodule * module, const void *image); CUresult (CUDAAPI *cuModuleUnload) (CUmodule module); CUresult (CUDAAPI *cuModuleGetFunction) (CUfunction * function, CUmodule module, const char *kname); CUresult (CUDAAPI *cuLaunchKernel) (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream stream, void **kernelParams, void **extra); CUresult (CUDAAPI *cuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy, CUstream stream); CUresult (CUDAAPI *cuMemsetD8Async) (CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); CUresult (CUDAAPI *cuMemsetD16Async) (CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); CUresult (CUDAAPI *cuMemsetD32Async) (CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); CUresult (CUDAAPI *cuTexObjectCreate) (CUtexObject * pTexObject, const CUDA_RESOURCE_DESC * pResDesc, const CUDA_TEXTURE_DESC * pTexDesc, const CUDA_RESOURCE_VIEW_DESC * pResViewDesc); CUresult (CUDAAPI *cuTexObjectDestroy) (CUtexObject texObject); }; struct GstHipFuncTableCudaRt { gboolean loaded = FALSE; cudaError_t (CUDAAPI *cudaRuntimeGetVersion) (int *runtimeVersion); const char * (CUDAAPI 
*cudaGetErrorName) (cudaError_t error); const char * (CUDAAPI *cudaGetErrorString) (cudaError_t error); cudaError_t (CUDAAPI *cudaGetDeviceCount) (int *count); cudaError_t (CUDAAPI *cudaGetDeviceProperties) (struct cudaDeviceProp * prop, int device); cudaError_t (CUDAAPI *cudaDeviceGetAttribute) (int *value, enum cudaDeviceAttr attr, int device); cudaError_t (CUDAAPI *cudaSetDevice) (int device); cudaError_t (CUDAAPI *cudaMalloc) (void **ptr, size_t size); cudaError_t (CUDAAPI *cudaFree) (void *ptr); cudaError_t (CUDAAPI *cudaMallocHost) (void **ptr, size_t size, unsigned int flags); cudaError_t (CUDAAPI *cudaFreeHost) (void *ptr); cudaError_t (CUDAAPI *cudaStreamCreate) (cudaStream_t *pStream); cudaError_t (CUDAAPI *cudaStreamDestroy) (cudaStream_t stream); cudaError_t (CUDAAPI *cudaStreamSynchronize) (cudaStream_t stream); cudaError_t (CUDAAPI *cudaEventCreateWithFlags) (cudaEvent_t *event, unsigned int flags); cudaError_t (CUDAAPI *cudaEventRecord) (cudaEvent_t event, cudaStream_t stream); cudaError_t (CUDAAPI *cudaEventDestroy) (cudaEvent_t event); cudaError_t (CUDAAPI *cudaEventSynchronize)(cudaEvent_t event); cudaError_t (CUDAAPI *cudaEventQuery) (cudaEvent_t event); cudaError_t (CUDAAPI *cudaGraphicsMapResources) (int count, cudaGraphicsResource_t *resources, cudaStream_t stream); cudaError_t (CUDAAPI *cudaGraphicsResourceGetMappedPointer) (void **devPtr, size_t *size, cudaGraphicsResource_t resource); cudaError_t (CUDAAPI *cudaGraphicsUnmapResources) (int count, cudaGraphicsResource_t *resources, cudaStream_t stream); cudaError_t (CUDAAPI *cudaGraphicsUnregisterResource) (cudaGraphicsResource_t resource); #ifdef HAVE_GST_GL cudaError_t (CUDAAPI *cudaGLGetDevices) (unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList); cudaError_t (CUDAAPI *cudaGraphicsGLRegisterBuffer) (struct cudaGraphicsResource **resource, unsigned int buffer, unsigned int flags); #endif }; /* *INDENT-ON* */ static 
GstHipFuncTableAmd amd_ftable = { }; static GstHipFuncTableCuda cuda_ftable = { }; static GstHipFuncTableCudaRt cudart_ftable = { }; #define LOAD_SYMBOL(name) G_STMT_START { \ if (!g_module_symbol (module, G_STRINGIFY (name), (gpointer *) &table->name)) { \ GST_ERROR ("Failed to load '%s', %s", G_STRINGIFY (name), g_module_error()); \ g_module_close (module); \ return; \ } \ } G_STMT_END; static void load_amd_func_table (void) { GModule *module = nullptr; #ifndef G_OS_WIN32 module = g_module_open ("libamdhip64.so", G_MODULE_BIND_LAZY); if (!module) module = g_module_open ("/opt/rocm/lib/libamdhip64.so", G_MODULE_BIND_LAZY); #else /* Prefer hip dll in SDK */ auto hip_root = g_getenv ("HIP_PATH"); if (hip_root) { auto path = g_build_path (G_DIR_SEPARATOR_S, hip_root, "bin", nullptr); auto dir = g_dir_open (path, 0, nullptr); if (dir) { const gchar *name; while ((name = g_dir_read_name (dir))) { if (g_str_has_prefix (name, "amdhip64_") && g_str_has_suffix (name, ".dll")) { auto lib_path = g_build_filename (path, name, nullptr); module = g_module_open (lib_path, G_MODULE_BIND_LAZY); break; } } g_dir_close (dir); } g_free (path); } /* Try dll in System32 */ if (!module) module = g_module_open ("amdhip64_6.dll", G_MODULE_BIND_LAZY); #endif if (!module) { GST_INFO ("Couldn't open HIP library"); return; } auto table = &amd_ftable; LOAD_SYMBOL (hipInit); LOAD_SYMBOL (hipDriverGetVersion); LOAD_SYMBOL (hipRuntimeGetVersion); LOAD_SYMBOL (hipGetErrorName); LOAD_SYMBOL (hipGetErrorString); LOAD_SYMBOL (hipGetDeviceCount); LOAD_SYMBOL (hipGetDeviceProperties); LOAD_SYMBOL (hipDeviceGetAttribute); LOAD_SYMBOL (hipSetDevice); LOAD_SYMBOL (hipMalloc); LOAD_SYMBOL (hipFree); LOAD_SYMBOL (hipHostMalloc); LOAD_SYMBOL (hipHostFree); LOAD_SYMBOL (hipStreamCreate); LOAD_SYMBOL (hipStreamDestroy); LOAD_SYMBOL (hipStreamSynchronize); LOAD_SYMBOL (hipEventCreateWithFlags); LOAD_SYMBOL (hipEventRecord); LOAD_SYMBOL (hipEventDestroy); LOAD_SYMBOL (hipEventSynchronize); LOAD_SYMBOL 
(hipEventQuery); LOAD_SYMBOL (hipModuleLoadData); LOAD_SYMBOL (hipModuleUnload); LOAD_SYMBOL (hipModuleGetFunction); LOAD_SYMBOL (hipModuleLaunchKernel); LOAD_SYMBOL (hipMemcpyParam2DAsync); LOAD_SYMBOL (hipMemsetD8Async); LOAD_SYMBOL (hipMemsetD16Async); LOAD_SYMBOL (hipMemsetD32Async); LOAD_SYMBOL (hipTexObjectCreate); LOAD_SYMBOL (hipTexObjectDestroy); LOAD_SYMBOL (hipGraphicsMapResources); LOAD_SYMBOL (hipGraphicsResourceGetMappedPointer); LOAD_SYMBOL (hipGraphicsUnmapResources); LOAD_SYMBOL (hipGraphicsUnregisterResource); #ifdef HAVE_GST_GL LOAD_SYMBOL (hipGLGetDevices); LOAD_SYMBOL (hipGraphicsGLRegisterBuffer); #endif table->loaded = TRUE; } static void load_cuda_func_table (void) { GModule *module = nullptr; #ifndef G_OS_WIN32 module = g_module_open ("libcuda.so", G_MODULE_BIND_LAZY); #else module = g_module_open ("nvcuda.dll", G_MODULE_BIND_LAZY); #endif if (!module) { GST_INFO ("Couldn't open CUDA library"); return; } auto table = &cuda_ftable; LOAD_SYMBOL (cuInit); LOAD_SYMBOL (cuDriverGetVersion); LOAD_SYMBOL (cuModuleLoadData); LOAD_SYMBOL (cuModuleUnload); LOAD_SYMBOL (cuModuleGetFunction); LOAD_SYMBOL (cuLaunchKernel); LOAD_SYMBOL (cuMemcpy2DAsync); LOAD_SYMBOL (cuMemsetD8Async); LOAD_SYMBOL (cuMemsetD16Async); LOAD_SYMBOL (cuMemsetD32Async); LOAD_SYMBOL (cuTexObjectCreate); LOAD_SYMBOL (cuTexObjectDestroy); table->loaded = TRUE; } static void load_cudart_func_table (guint major_ver, guint minor_ver) { GModule *module = nullptr; auto module_name = g_getenv ("GST_HIP_CUDART_LIBNAME"); if (module_name) module = g_module_open (module_name, G_MODULE_BIND_LAZY); if (!module) { #ifndef G_OS_WIN32 module = g_module_open ("libcudart.so", G_MODULE_BIND_LAZY); #else auto lib_name = g_strdup_printf ("cudart64_%d.dll", major_ver); module = g_module_open (lib_name, G_MODULE_BIND_LAZY); g_free (lib_name); if (!module) { lib_name = g_strdup_printf ("cudart64_%d%d.dll", major_ver, minor_ver); module = g_module_open (lib_name, G_MODULE_BIND_LAZY); g_free (lib_name); 
} if (!module) { auto cuda_root = g_getenv ("CUDA_PATH"); if (cuda_root) { auto path = g_build_path (G_DIR_SEPARATOR_S, cuda_root, "bin", nullptr); auto dir = g_dir_open (path, 0, nullptr); if (dir) { const gchar *name; while ((name = g_dir_read_name (dir))) { if (g_str_has_prefix (name, "cudart64_") && g_str_has_suffix (name, ".dll")) { auto lib_path = g_build_filename (path, name, nullptr); module = g_module_open (lib_path, G_MODULE_BIND_LAZY); g_free (lib_path); break; } } g_dir_close (dir); } g_free (path); } } #endif } if (!module) { GST_INFO ("Couldn't open CUDA runtime library"); return; } auto table = &cudart_ftable; LOAD_SYMBOL (cudaRuntimeGetVersion); LOAD_SYMBOL (cudaGetErrorName); LOAD_SYMBOL (cudaGetErrorString); LOAD_SYMBOL (cudaGetDeviceCount); LOAD_SYMBOL (cudaGetDeviceProperties); LOAD_SYMBOL (cudaDeviceGetAttribute); LOAD_SYMBOL (cudaSetDevice); LOAD_SYMBOL (cudaMalloc); LOAD_SYMBOL (cudaFree); LOAD_SYMBOL (cudaMallocHost); LOAD_SYMBOL (cudaFreeHost); LOAD_SYMBOL (cudaStreamCreate); LOAD_SYMBOL (cudaStreamDestroy); LOAD_SYMBOL (cudaStreamSynchronize); LOAD_SYMBOL (cudaEventCreateWithFlags); LOAD_SYMBOL (cudaEventRecord); LOAD_SYMBOL (cudaEventDestroy); LOAD_SYMBOL (cudaEventSynchronize); LOAD_SYMBOL (cudaEventQuery); LOAD_SYMBOL (cudaGraphicsMapResources); LOAD_SYMBOL (cudaGraphicsResourceGetMappedPointer); LOAD_SYMBOL (cudaGraphicsUnmapResources); LOAD_SYMBOL (cudaGraphicsUnregisterResource); #ifdef HAVE_GST_GL LOAD_SYMBOL (cudaGLGetDevices); LOAD_SYMBOL (cudaGraphicsGLRegisterBuffer); #endif table->loaded = TRUE; } /* *INDENT-OFF* */ static gboolean gst_hip_load_library_amd (void) { static std::once_flag once; std::call_once (once,[]() { load_amd_func_table (); if (amd_ftable.loaded) { auto ret = amd_ftable.hipInit (0); if (ret != hipSuccess) amd_ftable.loaded = FALSE; } }); return amd_ftable.loaded; } static gboolean gst_hip_load_library_nvidia (void) { static std::once_flag once; std::call_once (once,[]() { load_cuda_func_table (); if 
(cuda_ftable.loaded) { auto ret = cuda_ftable.cuInit (0); if (ret != CUDA_SUCCESS) { cuda_ftable.loaded = FALSE; return; } int cuda_ver = 0; ret = cuda_ftable.cuDriverGetVersion (&cuda_ver); if (ret != CUDA_SUCCESS) return; int major_ver = cuda_ver / 1000; int minor_ver = (cuda_ver % 1000) / 10; load_cudart_func_table (major_ver, minor_ver); } }); if (!cuda_ftable.loaded || !cudart_ftable.loaded) return FALSE; return TRUE; } /* *INDENT-ON* */ gboolean gst_hip_load_library (GstHipVendor vendor) { switch (vendor) { case GST_HIP_VENDOR_AMD: return gst_hip_load_library_amd (); case GST_HIP_VENDOR_NVIDIA: return gst_hip_load_library_nvidia (); case GST_HIP_VENDOR_UNKNOWN: if (gst_hip_load_library_amd () || gst_hip_load_library_nvidia ()) return TRUE; break; } return FALSE; } #define CHECK_VENDOR(v) \ g_return_val_if_fail (vendor != GST_HIP_VENDOR_UNKNOWN, \ hipErrorNotInitialized); \ g_return_val_if_fail (gst_hip_load_library (vendor), hipErrorNotInitialized); hipError_t HipInit (GstHipVendor vendor, unsigned int flags) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipInit (flags); auto cuda_ret = cuda_ftable.cuInit (flags); return hipCUResultTohipError (cuda_ret); } hipError_t HipDriverGetVersion (GstHipVendor vendor, int *driverVersion) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipDriverGetVersion (driverVersion); auto cuda_ret = cuda_ftable.cuDriverGetVersion (driverVersion); return hipCUResultTohipError (cuda_ret); } hipError_t HipRuntimeGetVersion (GstHipVendor vendor, int *runtimeVersion) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipRuntimeGetVersion (runtimeVersion); auto cuda_ret = cudart_ftable.cudaRuntimeGetVersion (runtimeVersion); return hipCUDAErrorTohipError (cuda_ret); } const char * HipGetErrorName (GstHipVendor vendor, hipError_t hip_error) { g_return_val_if_fail (vendor != GST_HIP_VENDOR_UNKNOWN, nullptr); g_return_val_if_fail (gst_hip_load_library 
(vendor), nullptr); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGetErrorName (hip_error); auto cuda_ret = hipErrorToCudaError (hip_error); return cudart_ftable.cudaGetErrorName (cuda_ret); } const char * HipGetErrorString (GstHipVendor vendor, hipError_t hipError) { g_return_val_if_fail (vendor != GST_HIP_VENDOR_UNKNOWN, nullptr); g_return_val_if_fail (gst_hip_load_library (vendor), nullptr); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGetErrorString (hipError); auto cuda_ret = hipErrorToCudaError (hipError); return cudart_ftable.cudaGetErrorString (cuda_ret); } hipError_t HipGetDeviceCount (GstHipVendor vendor, int *count) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGetDeviceCount (count); auto cuda_ret = cudart_ftable.cudaGetDeviceCount (count); return hipCUDAErrorTohipError (cuda_ret); } hipError_t HipGetDeviceProperties (GstHipVendor vendor, hipDeviceProp_t * prop, int deviceId) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGetDeviceProperties (prop, deviceId); if (!prop) return hipErrorInvalidValue; struct cudaDeviceProp cdprop; auto cuda_ret = cudart_ftable.cudaGetDeviceProperties (&cdprop, deviceId); if (cuda_ret != cudaSuccess) return hipCUDAErrorTohipError (cuda_ret); strncpy (prop->name, cdprop.name, 256); strncpy (prop->uuid.bytes, cdprop.uuid.bytes, 16); strncpy (prop->luid, cdprop.luid, 8); prop->luidDeviceNodeMask = cdprop.luidDeviceNodeMask; prop->totalGlobalMem = cdprop.totalGlobalMem; prop->sharedMemPerBlock = cdprop.sharedMemPerBlock; prop->regsPerBlock = cdprop.regsPerBlock; prop->memPitch = cdprop.memPitch; prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock; prop->maxThreadsDim[0] = cdprop.maxThreadsDim[0]; prop->maxThreadsDim[1] = cdprop.maxThreadsDim[1]; prop->maxThreadsDim[2] = cdprop.maxThreadsDim[2]; prop->maxGridSize[0] = cdprop.maxGridSize[0]; prop->maxGridSize[1] = cdprop.maxGridSize[1]; prop->maxGridSize[2] = cdprop.maxGridSize[2]; 
prop->clockRate = cdprop.clockRate;
  prop->totalConstMem = cdprop.totalConstMem;
  prop->major = cdprop.major;
  prop->minor = cdprop.minor;
  prop->textureAlignment = cdprop.textureAlignment;
  prop->texturePitchAlignment = cdprop.texturePitchAlignment;
  prop->deviceOverlap = cdprop.deviceOverlap;
  prop->multiProcessorCount = cdprop.multiProcessorCount;
  prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
  prop->integrated = cdprop.integrated;
  prop->canMapHostMemory = cdprop.canMapHostMemory;
  prop->computeMode = cdprop.computeMode;
  prop->maxTexture1D = cdprop.maxTexture1D;
  prop->maxTexture1DMipmap = cdprop.maxTexture1DMipmap;
  prop->maxTexture1DLinear = cdprop.maxTexture1DLinear;
  prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
  prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
  prop->maxTexture2DMipmap[0] = cdprop.maxTexture2DMipmap[0];
  prop->maxTexture2DMipmap[1] = cdprop.maxTexture2DMipmap[1];
  prop->maxTexture2DLinear[0] = cdprop.maxTexture2DLinear[0];
  prop->maxTexture2DLinear[1] = cdprop.maxTexture2DLinear[1];
  prop->maxTexture2DLinear[2] = cdprop.maxTexture2DLinear[2];
  prop->maxTexture2DGather[0] = cdprop.maxTexture2DGather[0];
  prop->maxTexture2DGather[1] = cdprop.maxTexture2DGather[1];
  prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
  prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
  prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
  prop->maxTexture3DAlt[0] = cdprop.maxTexture3DAlt[0];
  prop->maxTexture3DAlt[1] = cdprop.maxTexture3DAlt[1];
  prop->maxTexture3DAlt[2] = cdprop.maxTexture3DAlt[2];
  prop->maxTextureCubemap = cdprop.maxTextureCubemap;
  prop->maxTexture1DLayered[0] = cdprop.maxTexture1DLayered[0];
  prop->maxTexture1DLayered[1] = cdprop.maxTexture1DLayered[1];
  prop->maxTexture2DLayered[0] = cdprop.maxTexture2DLayered[0];
  prop->maxTexture2DLayered[1] = cdprop.maxTexture2DLayered[1];
  prop->maxTexture2DLayered[2] = cdprop.maxTexture2DLayered[2];
  prop->maxTextureCubemapLayered[0] = cdprop.maxTextureCubemapLayered[0];
  prop->maxTextureCubemapLayered[1] = cdprop.maxTextureCubemapLayered[1];
  prop->maxSurface1D = cdprop.maxSurface1D;
  prop->maxSurface2D[0] = cdprop.maxSurface2D[0];
  prop->maxSurface2D[1] = cdprop.maxSurface2D[1];
  prop->maxSurface3D[0] = cdprop.maxSurface3D[0];
  prop->maxSurface3D[1] = cdprop.maxSurface3D[1];
  prop->maxSurface3D[2] = cdprop.maxSurface3D[2];
  prop->maxSurface1DLayered[0] = cdprop.maxSurface1DLayered[0];
  prop->maxSurface1DLayered[1] = cdprop.maxSurface1DLayered[1];
  prop->maxSurface2DLayered[0] = cdprop.maxSurface2DLayered[0];
  prop->maxSurface2DLayered[1] = cdprop.maxSurface2DLayered[1];
  prop->maxSurface2DLayered[2] = cdprop.maxSurface2DLayered[2];
  prop->maxSurfaceCubemap = cdprop.maxSurfaceCubemap;
  prop->maxSurfaceCubemapLayered[0] = cdprop.maxSurfaceCubemapLayered[0];
  prop->maxSurfaceCubemapLayered[1] = cdprop.maxSurfaceCubemapLayered[1];
  prop->surfaceAlignment = cdprop.surfaceAlignment;
  prop->concurrentKernels = cdprop.concurrentKernels;
  prop->ECCEnabled = cdprop.ECCEnabled;
  prop->pciBusID = cdprop.pciBusID;
  prop->pciDeviceID = cdprop.pciDeviceID;
  prop->pciDomainID = cdprop.pciDomainID;
  prop->tccDriver = cdprop.tccDriver;
  prop->asyncEngineCount = cdprop.asyncEngineCount;
  prop->unifiedAddressing = cdprop.unifiedAddressing;
  prop->memoryClockRate = cdprop.memoryClockRate;
  prop->memoryBusWidth = cdprop.memoryBusWidth;
  prop->l2CacheSize = cdprop.l2CacheSize;
  prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
  prop->streamPrioritiesSupported = cdprop.streamPrioritiesSupported;
  prop->globalL1CacheSupported = cdprop.globalL1CacheSupported;
  prop->localL1CacheSupported = cdprop.localL1CacheSupported;
  prop->sharedMemPerMultiprocessor = cdprop.sharedMemPerMultiprocessor;
  prop->regsPerMultiprocessor = cdprop.regsPerMultiprocessor;
  prop->managedMemory = cdprop.managedMemory;
  prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
  prop->multiGpuBoardGroupID = cdprop.multiGpuBoardGroupID;
  prop->hostNativeAtomicSupported = cdprop.hostNativeAtomicSupported;
  prop->singleToDoublePrecisionPerfRatio = cdprop.singleToDoublePrecisionPerfRatio;
  prop->pageableMemoryAccess = cdprop.pageableMemoryAccess;
  prop->concurrentManagedAccess = cdprop.concurrentManagedAccess;
  prop->computePreemptionSupported = cdprop.computePreemptionSupported;
  prop->canUseHostPointerForRegisteredMem = cdprop.canUseHostPointerForRegisteredMem;
  prop->cooperativeLaunch = cdprop.cooperativeLaunch;
  prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
  prop->sharedMemPerBlockOptin = cdprop.sharedMemPerBlockOptin;
  prop->pageableMemoryAccessUsesHostPageTables = cdprop.pageableMemoryAccessUsesHostPageTables;
  prop->directManagedMemAccessFromHost = cdprop.directManagedMemAccessFromHost;
  prop->accessPolicyMaxWindowSize = cdprop.accessPolicyMaxWindowSize;
  prop->maxBlocksPerMultiProcessor = cdprop.maxBlocksPerMultiProcessor;
  prop->persistingL2CacheMaxSize = cdprop.persistingL2CacheMaxSize;
  prop->reservedSharedMemPerBlock = cdprop.reservedSharedMemPerBlock;
  prop->warpSize = cdprop.warpSize;
  prop->clusterLaunch = cdprop.clusterLaunch;
  /* The next two fields are renamed between CUDA and HIP (Cuda <-> Hip) */
  prop->deferredMappingHipArraySupported = cdprop.deferredMappingCudaArraySupported;
  prop->gpuDirectRDMAFlushWritesOptions = cdprop.gpuDirectRDMAFlushWritesOptions;
  prop->gpuDirectRDMASupported = cdprop.gpuDirectRDMASupported;
  prop->gpuDirectRDMAWritesOrdering = cdprop.gpuDirectRDMAWritesOrdering;
  prop->hostRegisterReadOnlySupported = cdprop.hostRegisterReadOnlySupported;
  prop->hostRegisterSupported = cdprop.hostRegisterSupported;
  prop->ipcEventSupported = cdprop.ipcEventSupported;
  prop->memoryPoolSupportedHandleTypes = cdprop.memoryPoolSupportedHandleTypes;
  prop->memoryPoolsSupported = cdprop.memoryPoolsSupported;
  prop->sparseHipArraySupported = cdprop.sparseCudaArraySupported;
  prop->timelineSemaphoreInteropSupported = cdprop.timelineSemaphoreInteropSupported;
  prop->unifiedFunctionPointers = cdprop.unifiedFunctionPointers;

  return hipSuccess;
}

/* On NVIDIA, translates the HIP attribute enum to the CUDA runtime enum and
 * queries it (continued on the following lines) */
hipError_t
HipDeviceGetAttribute (GstHipVendor vendor, int *pi, hipDeviceAttribute_t attr,
    int deviceId)
{
CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipDeviceGetAttribute (pi, attr, deviceId);

  /* hipDeviceAttribute_t -> cudaDeviceAttr translation; unmapped attributes
   * fall through to the default: and report invalid-value */
  enum cudaDeviceAttr cdattr;
  switch (attr) {
    case hipDeviceAttributeMaxThreadsPerBlock:
      cdattr = cudaDevAttrMaxThreadsPerBlock;
      break;
    case hipDeviceAttributeMaxBlockDimX:
      cdattr = cudaDevAttrMaxBlockDimX;
      break;
    case hipDeviceAttributeMaxBlockDimY:
      cdattr = cudaDevAttrMaxBlockDimY;
      break;
    case hipDeviceAttributeMaxBlockDimZ:
      cdattr = cudaDevAttrMaxBlockDimZ;
      break;
    case hipDeviceAttributeMaxGridDimX:
      cdattr = cudaDevAttrMaxGridDimX;
      break;
    case hipDeviceAttributeMaxGridDimY:
      cdattr = cudaDevAttrMaxGridDimY;
      break;
    case hipDeviceAttributeMaxGridDimZ:
      cdattr = cudaDevAttrMaxGridDimZ;
      break;
    case hipDeviceAttributeMaxSharedMemoryPerBlock:
      cdattr = cudaDevAttrMaxSharedMemoryPerBlock;
      break;
    case hipDeviceAttributeTotalConstantMemory:
      cdattr = cudaDevAttrTotalConstantMemory;
      break;
    case hipDeviceAttributeWarpSize:
      cdattr = cudaDevAttrWarpSize;
      break;
    case hipDeviceAttributeMaxRegistersPerBlock:
      cdattr = cudaDevAttrMaxRegistersPerBlock;
      break;
    case hipDeviceAttributeClockRate:
      cdattr = cudaDevAttrClockRate;
      break;
    case hipDeviceAttributeMemoryClockRate:
      cdattr = cudaDevAttrMemoryClockRate;
      break;
    case hipDeviceAttributeMemoryBusWidth:
      cdattr = cudaDevAttrGlobalMemoryBusWidth;
      break;
    case hipDeviceAttributeMultiprocessorCount:
      cdattr = cudaDevAttrMultiProcessorCount;
      break;
    case hipDeviceAttributeComputeMode:
      cdattr = cudaDevAttrComputeMode;
      break;
    case hipDeviceAttributeL2CacheSize:
      cdattr = cudaDevAttrL2CacheSize;
      break;
    case hipDeviceAttributeMaxThreadsPerMultiProcessor:
      cdattr = cudaDevAttrMaxThreadsPerMultiProcessor;
      break;
    case hipDeviceAttributeComputeCapabilityMajor:
      cdattr = cudaDevAttrComputeCapabilityMajor;
      break;
    case hipDeviceAttributeComputeCapabilityMinor:
      cdattr = cudaDevAttrComputeCapabilityMinor;
      break;
    case hipDeviceAttributeConcurrentKernels:
      cdattr = cudaDevAttrConcurrentKernels;
      break;
    case hipDeviceAttributePciBusId:
      cdattr = cudaDevAttrPciBusId;
      break;
    case hipDeviceAttributePciDeviceId:
      cdattr = cudaDevAttrPciDeviceId;
      break;
    case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
      cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor;
      break;
    case hipDeviceAttributeIsMultiGpuBoard:
      cdattr = cudaDevAttrIsMultiGpuBoard;
      break;
    case hipDeviceAttributeIntegrated:
      cdattr = cudaDevAttrIntegrated;
      break;
    case hipDeviceAttributeMaxTexture1DWidth:
      cdattr = cudaDevAttrMaxTexture1DWidth;
      break;
    case hipDeviceAttributeMaxTexture2DWidth:
      cdattr = cudaDevAttrMaxTexture2DWidth;
      break;
    case hipDeviceAttributeMaxTexture2DHeight:
      cdattr = cudaDevAttrMaxTexture2DHeight;
      break;
    case hipDeviceAttributeMaxTexture3DWidth:
      cdattr = cudaDevAttrMaxTexture3DWidth;
      break;
    case hipDeviceAttributeMaxTexture3DHeight:
      cdattr = cudaDevAttrMaxTexture3DHeight;
      break;
    case hipDeviceAttributeMaxTexture3DDepth:
      cdattr = cudaDevAttrMaxTexture3DDepth;
      break;
    case hipDeviceAttributeMaxPitch:
      cdattr = cudaDevAttrMaxPitch;
      break;
    case hipDeviceAttributeTextureAlignment:
      cdattr = cudaDevAttrTextureAlignment;
      break;
    case hipDeviceAttributeTexturePitchAlignment:
      cdattr = cudaDevAttrTexturePitchAlignment;
      break;
    case hipDeviceAttributeKernelExecTimeout:
      cdattr = cudaDevAttrKernelExecTimeout;
      break;
    case hipDeviceAttributeCanMapHostMemory:
      cdattr = cudaDevAttrCanMapHostMemory;
      break;
    case hipDeviceAttributeEccEnabled:
      cdattr = cudaDevAttrEccEnabled;
      break;
    case hipDeviceAttributeCooperativeLaunch:
      cdattr = cudaDevAttrCooperativeLaunch;
      break;
    case hipDeviceAttributeCooperativeMultiDeviceLaunch:
      cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
      break;
    case hipDeviceAttributeHostRegisterSupported:
      cdattr = cudaDevAttrHostRegisterSupported;
      break;
    case hipDeviceAttributeConcurrentManagedAccess:
      cdattr = cudaDevAttrConcurrentManagedAccess;
      break;
    case hipDeviceAttributeManagedMemory:
      cdattr = cudaDevAttrManagedMemory;
      break;
    case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
      cdattr = cudaDevAttrPageableMemoryAccessUsesHostPageTables;
      break;
    case hipDeviceAttributePageableMemoryAccess:
      cdattr = cudaDevAttrPageableMemoryAccess;
      break;
    case hipDeviceAttributeDirectManagedMemAccessFromHost:
      cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
      break;
    case hipDeviceAttributeGlobalL1CacheSupported:
      cdattr = cudaDevAttrGlobalL1CacheSupported;
      break;
    case hipDeviceAttributeMaxBlocksPerMultiProcessor:
      cdattr = cudaDevAttrMaxBlocksPerMultiprocessor;
      break;
    case hipDeviceAttributeMultiGpuBoardGroupID:
      cdattr = cudaDevAttrMultiGpuBoardGroupID;
      break;
    case hipDeviceAttributeReservedSharedMemPerBlock:
      cdattr = cudaDevAttrReservedSharedMemoryPerBlock;
      break;
    case hipDeviceAttributeSingleToDoublePrecisionPerfRatio:
      cdattr = cudaDevAttrSingleToDoublePrecisionPerfRatio;
      break;
    case hipDeviceAttributeStreamPrioritiesSupported:
      cdattr = cudaDevAttrStreamPrioritiesSupported;
      break;
    case hipDeviceAttributeSurfaceAlignment:
      cdattr = cudaDevAttrSurfaceAlignment;
      break;
    case hipDeviceAttributeTccDriver:
      cdattr = cudaDevAttrTccDriver;
      break;
    case hipDeviceAttributeUnifiedAddressing:
      cdattr = cudaDevAttrUnifiedAddressing;
      break;
    case hipDeviceAttributeMemoryPoolsSupported:
      cdattr = cudaDevAttrMemoryPoolsSupported;
      break;
    case hipDeviceAttributeVirtualMemoryManagementSupported:
    {
      /* No cudaDeviceAttr equivalent — answered via the driver API instead */
      auto cuda_ret = cuda_ftable.cuDeviceGetAttribute (pi,
          CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, deviceId);
      return hipCUResultTohipError (cuda_ret);
    }
    case hipDeviceAttributeAccessPolicyMaxWindowSize:
      cdattr = cudaDevAttrMaxAccessPolicyWindowSize;
      break;
    case hipDeviceAttributeAsyncEngineCount:
      cdattr = cudaDevAttrAsyncEngineCount;
      break;
    case hipDeviceAttributeCanUseHostPointerForRegisteredMem:
      cdattr = cudaDevAttrCanUseHostPointerForRegisteredMem;
      break;
    case hipDeviceAttributeComputePreemptionSupported:
      cdattr = cudaDevAttrComputePreemptionSupported;
      break;
    case hipDeviceAttributeHostNativeAtomicSupported:
      cdattr = cudaDevAttrHostNativeAtomicSupported;
      break;
    default:
      return
hipCUDAErrorTohipError (cudaErrorInvalidValue);
  }

  auto cuda_ret = cudart_ftable.cudaDeviceGetAttribute (pi, cdattr, deviceId);
  return hipCUDAErrorTohipError (cuda_ret);
}

/* The wrappers below all follow the same shape: validate vendor, dispatch to
 * the AMD table directly, or to the CUDA driver/runtime table with the
 * result translated back to a hipError_t. */

hipError_t
HipSetDevice (GstHipVendor vendor, int deviceId)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipSetDevice (deviceId);

  auto cuda_ret = cudart_ftable.cudaSetDevice (deviceId);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipMalloc (GstHipVendor vendor, void **ptr, size_t size)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipMalloc (ptr, size);

  auto cuda_ret = cudart_ftable.cudaMalloc (ptr, size);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipFree (GstHipVendor vendor, void *ptr)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipFree (ptr);

  auto cuda_ret = cudart_ftable.cudaFree (ptr);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipHostMalloc (GstHipVendor vendor, void **ptr, size_t size,
    unsigned int flags)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipHostMalloc (ptr, size, flags);

  auto cuda_ret = cudart_ftable.cudaMallocHost (ptr, size, flags);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipHostFree (GstHipVendor vendor, void *ptr)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipHostFree (ptr);

  auto cuda_ret = cudart_ftable.cudaFreeHost (ptr);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipStreamCreate (GstHipVendor vendor, hipStream_t * stream)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipStreamCreate (stream);

  auto cuda_ret = cudart_ftable.cudaStreamCreate ((cudaStream_t *) stream);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipStreamDestroy (GstHipVendor vendor, hipStream_t stream)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipStreamDestroy (stream);

  auto cuda_ret = cudart_ftable.cudaStreamDestroy (stream);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipStreamSynchronize (GstHipVendor vendor, hipStream_t stream)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipStreamSynchronize (stream);

  auto cuda_ret = cudart_ftable.cudaStreamSynchronize (stream);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipEventCreateWithFlags (GstHipVendor vendor, hipEvent_t * event,
    unsigned flags)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipEventCreateWithFlags (event, flags);

  auto cuda_ret =
      cudart_ftable.cudaEventCreateWithFlags ((cudaEvent_t *) event, flags);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipEventRecord (GstHipVendor vendor, hipEvent_t event, hipStream_t stream)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipEventRecord (event, stream);

  auto cuda_ret = cudart_ftable.cudaEventRecord ((cudaEvent_t) event, stream);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipEventDestroy (GstHipVendor vendor, hipEvent_t event)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipEventDestroy (event);

  auto cuda_ret = cudart_ftable.cudaEventDestroy ((cudaEvent_t) event);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipEventSynchronize (GstHipVendor vendor, hipEvent_t event)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipEventSynchronize (event);

  auto cuda_ret = cudart_ftable.cudaEventSynchronize ((cudaEvent_t) event);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipEventQuery (GstHipVendor vendor, hipEvent_t event)
{
  CHECK_VENDOR (vendor);

  if (vendor == GST_HIP_VENDOR_AMD)
    return amd_ftable.hipEventQuery (event);

  auto cuda_ret = cudart_ftable.cudaEventQuery ((cudaEvent_t) event);
  return hipCUDAErrorTohipError (cuda_ret);
}

hipError_t
HipModuleLoadData (GstHipVendor vendor, hipModule_t * module,
    const void *image)
{
  CHECK_VENDOR
(vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipModuleLoadData (module, image); auto cuda_ret = cuda_ftable.cuModuleLoadData ((CUmodule *) module, image); return hipCUResultTohipError (cuda_ret); } hipError_t HipModuleUnload (GstHipVendor vendor, hipModule_t module) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipModuleUnload (module); auto cuda_ret = cuda_ftable.cuModuleUnload ((CUmodule) module); return hipCUResultTohipError (cuda_ret); } hipError_t HipModuleGetFunction (GstHipVendor vendor, hipFunction_t * function, hipModule_t module, const char *kname) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipModuleGetFunction (function, module, kname); auto cuda_ret = cuda_ftable.cuModuleGetFunction ((CUfunction *) function, (CUmodule) module, kname); return hipCUResultTohipError (cuda_ret); } hipError_t HipModuleLaunchKernel (GstHipVendor vendor, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t stream, void **kernelParams, void **extra) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipModuleLaunchKernel (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra); auto cuda_ret = cuda_ftable.cuLaunchKernel ((CUfunction) f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, (CUstream) stream, kernelParams, extra); return hipCUResultTohipError (cuda_ret); } hipError_t HipMemcpyParam2DAsync (GstHipVendor vendor, const hip_Memcpy2D * pCopy, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipMemcpyParam2DAsync (pCopy, stream); CUresult cuda_ret; if (!pCopy) { cuda_ret = cuda_ftable.cuMemcpy2DAsync (nullptr, (CUstream) stream); } else { CUDA_MEMCPY2D cudaCopy = { }; hipMemcpy2DTocudaMemcpy2D 
(cudaCopy, pCopy); cuda_ret = cuda_ftable.cuMemcpy2DAsync (&cudaCopy, (CUstream) stream); } return hipCUResultTohipError (cuda_ret); } hipError_t HipMemsetD8Async (GstHipVendor vendor, hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipMemsetD8Async (dest, value, count, stream); auto cuda_ret = cuda_ftable.cuMemsetD8Async ((CUdeviceptr) dest, value, count, (CUstream) stream); return hipCUResultTohipError (cuda_ret); } hipError_t HipMemsetD16Async (GstHipVendor vendor, hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipMemsetD16Async (dest, value, count, stream); auto cuda_ret = cuda_ftable.cuMemsetD16Async ((CUdeviceptr) dest, value, count, (CUstream) stream); return hipCUResultTohipError (cuda_ret); } hipError_t HipMemsetD32Async (GstHipVendor vendor, hipDeviceptr_t dst, int value, size_t count, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipMemsetD32Async (dst, value, count, stream); auto cuda_ret = cuda_ftable.cuMemsetD32Async ((CUdeviceptr) dst, value, count, (CUstream) stream); return hipCUResultTohipError (cuda_ret); } hipError_t HipTexObjectCreate (GstHipVendor vendor, hipTextureObject_t * pTexObject, const HIP_RESOURCE_DESC * pResDesc, const HIP_TEXTURE_DESC * pTexDesc, const HIP_RESOURCE_VIEW_DESC * pResViewDesc) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipTexObjectCreate (pTexObject, pResDesc, pTexDesc, pResViewDesc); auto cuda_ret = cuda_ftable.cuTexObjectCreate ((CUtexObject *) pTexObject, (const CUDA_RESOURCE_DESC *) pResDesc, (const CUDA_TEXTURE_DESC *) pTexDesc, (const CUDA_RESOURCE_VIEW_DESC *) pResViewDesc); return hipCUResultTohipError (cuda_ret); } hipError_t HipTexObjectDestroy (GstHipVendor vendor, hipTextureObject_t texObject) { CHECK_VENDOR 
(vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipTexObjectDestroy (texObject); auto cuda_ret = cuda_ftable.cuTexObjectDestroy ((CUtexObject) texObject); return hipCUResultTohipError (cuda_ret); } hipError_t HipGraphicsMapResources (GstHipVendor vendor, int count, hipGraphicsResource_t * resources, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGraphicsMapResources (count, resources, stream); auto cuda_ret = cudart_ftable.cudaGraphicsMapResources (count, (cudaGraphicsResource_t *) resources, stream); return hipCUDAErrorTohipError (cuda_ret); } hipError_t HipGraphicsResourceGetMappedPointer (GstHipVendor vendor, void **devPtr, size_t *size, hipGraphicsResource_t resource) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) { return amd_ftable.hipGraphicsResourceGetMappedPointer (devPtr, size, resource); } auto cuda_ret = cudart_ftable.cudaGraphicsResourceGetMappedPointer (devPtr, size, (cudaGraphicsResource_t) resource); return hipCUDAErrorTohipError (cuda_ret); } hipError_t HipGraphicsUnmapResources (GstHipVendor vendor, int count, hipGraphicsResource_t * resources, hipStream_t stream) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGraphicsUnmapResources (count, resources, stream); auto cuda_ret = cudart_ftable.cudaGraphicsUnmapResources (count, (cudaGraphicsResource_t *) resources, stream); return hipCUDAErrorTohipError (cuda_ret); } hipError_t HipGraphicsUnregisterResource (GstHipVendor vendor, hipGraphicsResource_t resource) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGraphicsUnregisterResource (resource); auto cuda_ret = cudart_ftable.cudaGraphicsUnregisterResource ((cudaGraphicsResource_t) resource); return hipCUDAErrorTohipError (cuda_ret); } #ifdef HAVE_GST_GL hipError_t HipGLGetDevices (GstHipVendor vendor, unsigned int *pHipDeviceCount, int *pHipDevices, unsigned int hipDeviceCount, hipGLDeviceList deviceList) { 
CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) { return amd_ftable.hipGLGetDevices (pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList); } auto cuda_ret = cudart_ftable.cudaGLGetDevices (pHipDeviceCount, pHipDevices, hipDeviceCount, (enum cudaGLDeviceList) deviceList); return hipCUDAErrorTohipError (cuda_ret); } hipError_t HipGraphicsGLRegisterBuffer (GstHipVendor vendor, hipGraphicsResource ** resource, unsigned int buffer, unsigned int flags) { CHECK_VENDOR (vendor); if (vendor == GST_HIP_VENDOR_AMD) return amd_ftable.hipGraphicsGLRegisterBuffer (resource, buffer, flags); auto cuda_ret = cudart_ftable.cudaGraphicsGLRegisterBuffer ((struct cudaGraphicsResource **) resource, buffer, flags); return hipCUDAErrorTohipError (cuda_ret); } #endif