From d17e8707c9fb438878e3b1261a268fe34e49e491 Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Wed, 19 Feb 2025 17:46:34 +0900
Subject: [PATCH] nvencoder: Add extern-cuda-bufferpool property

Add a new property to support application-allocated GstCudaMemory.

CUDA memory alloc/free is a global device synchronization point,
as if launching a CUDA kernel on the default CUDA stream. To avoid the
global synchronization, we added stream-ordered allocation support,
which allocates CUDA memory asynchronously.
However, NVENC does not allow registering stream-ordered
allocated memory. Thus the encoder was allocating normal CUDA
memory whenever the input CUDA memory was of the stream-ordered type.

In this commit, the newly introduced property allows the application
to provide the encoder with a GstCudaBufferPool. The application can
preallocate a sufficient amount of CUDA memory in advance
to avoid global device synchronization while the pipeline is running.

For now, this pool is used only if the input CUDA memory is allocated
via stream-ordered allocation.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8516>
---
 .../docs/plugins/gst_plugins_cache.json       |  11 ++
 .../sys/nvcodec/gstnvencobject.cpp            |  51 +++++--
 .../sys/nvcodec/gstnvencobject.h              |   9 ++
 .../sys/nvcodec/gstnvencoder.cpp              | 139 +++++++++++++++---
 4 files changed, 181 insertions(+), 29 deletions(-)

diff --git a/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json b/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json
index f3bc8f59a1..81b646a63e 100644
--- a/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json
+++ b/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json
@@ -234329,6 +234329,17 @@
                         "readable": true,
                         "type": "GstNvEncoderSeiInsertMode",
                         "writable": true
+                    },
+                    "extern-cuda-bufferpool": {
+                        "blurb": "GstCudaBufferPool prepared by application",
+                        "conditionally-available": false,
+                        "construct": false,
+                        "construct-only": false,
+                        "controllable": false,
+                        "mutable": "ready",
+                        "readable": true,
+                        "type": "GstObject",
+                        "writable": true
                     }
                 }
             },
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp
index d98c0856a8..d2b2762239 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp
@@ -722,8 +722,8 @@ GstNvEncObject::DeviceUnlock ()
 }
 
 NVENCSTATUS
-GstNvEncObject::acquireResourceCuda (GstMemory * mem,
-    GstNvEncResource ** resource)
+GstNvEncObject::acquireResourceCuda (GstMemory * mem, guint width, guint height,
+      guint stride, GstNvEncResource ** resource)
 {
   GstNvEncResource *res;
   GstCudaMemory *cmem;
@@ -732,11 +732,6 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
   NVENCSTATUS status;
   GstMapInfo info;
 
-  if (!gst_is_cuda_memory (mem)) {
-    GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
-    return NV_ENC_ERR_INVALID_CALL;
-  }
-
   cmem = GST_CUDA_MEMORY_CAST (mem);
 
   res = (GstNvEncResource *) gst_cuda_memory_get_token_data (cmem,
@@ -761,9 +756,9 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
 
   new_resource.version = gst_nvenc_get_register_resource_version ();
   new_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
-  new_resource.width = cmem->info.width;
-  new_resource.height = cmem->info.height;
-  new_resource.pitch = cmem->info.stride[0];
+  new_resource.width = width;
+  new_resource.height = height;
+  new_resource.pitch = stride;
   new_resource.resourceToRegister = info.data;
   new_resource.bufferFormat = buffer_format_;
 
@@ -895,7 +890,17 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
   } else
 #endif
   {
-    status = acquireResourceCuda (mem, resource);
+    if (!gst_is_cuda_memory (mem)) {
+      GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
+      return NV_ENC_ERR_INVALID_CALL;
+    }
+
+    auto cmem = GST_CUDA_MEMORY_CAST (mem);
+    auto width = cmem->info.width;
+    auto height = cmem->info.height;
+    auto stride = cmem->info.stride[0];
+
+    status = acquireResourceCuda (mem, width, height, stride, resource);
   }
 
   if (status == NV_ENC_SUCCESS) {
@@ -908,6 +913,30 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
   return status;
 }
 
+NVENCSTATUS
+GstNvEncObject::AcquireResourceWithSize (GstMemory * mem,
+  guint width, guint height, guint stride, GstNvEncResource ** resource)
+{
+  NVENCSTATUS status;
+  std::lock_guard <std::recursive_mutex> lk (resource_lock_);
+
+  if (!gst_is_cuda_memory (mem)) {
+    GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
+    return NV_ENC_ERR_INVALID_CALL;
+  }
+
+  status = acquireResourceCuda (mem, width, height, stride, resource);
+
+  if (status == NV_ENC_SUCCESS) {
+    GST_TRACE_ID (id_.c_str (), "Returning resource %u, "
+        "resource queue size %u (active %u)",
+        (*resource)->seq_num, (guint) resource_queue_.size (),
+        (guint) active_resource_queue_.size ());
+  }
+
+  return status;
+}
+
 GstFlowReturn
 GstNvEncObject::AcquireTask (GstNvEncTask ** task, bool force)
 {
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h
index 248c689869..7291766a32 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h
@@ -187,6 +187,12 @@ public:
   NVENCSTATUS   AcquireResource (GstMemory * mem,
                                  GstNvEncResource ** resource);
 
+  NVENCSTATUS   AcquireResourceWithSize (GstMemory * mem,
+                                         guint width,
+                                         guint height,
+                                         guint stride,
+                                         GstNvEncResource ** resource);
+
   GstFlowReturn AcquireTask (GstNvEncTask ** task,
                              bool force);
 
@@ -208,6 +214,9 @@ private:
   void releaseTaskUnlocked (GstNvEncTask * task);
 
   NVENCSTATUS acquireResourceCuda (GstMemory * mem,
+                                   guint width,
+                                   guint height,
+                                   guint stride,
                                    GstNvEncResource ** resource);
 
 #ifdef G_OS_WIN32
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp
index 75a895bb16..67f1b9526d 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp
@@ -63,6 +63,7 @@ enum
 {
   PROP_0,
   PROP_CC_INSERT,
+  PROP_EXTERN_POOL,
 };
 
 #define DEFAULT_CC_INSERT GST_NV_ENCODER_SEI_INSERT
@@ -75,6 +76,11 @@ struct _GstNvEncoderPrivate
     memset (&config, 0, sizeof (NV_ENC_CONFIG));
   }
 
+   ~_GstNvEncoderPrivate ()
+  {
+    gst_clear_object (&extern_pool);
+  }
+
   GstCudaContext *context = nullptr;
   GstCudaStream *stream = nullptr;
 
@@ -121,8 +127,11 @@ struct _GstNvEncoderPrivate
 
   std::atomic < GstFlowReturn > last_flow;
 
+  GstVideoInfo extern_pool_info;
+
   /* properties */
   GstNvEncoderSeiInsertMode cc_insert = DEFAULT_CC_INSERT;
+  GstBufferPool *extern_pool = nullptr;
 };
 
 /**
@@ -184,6 +193,25 @@ gst_nv_encoder_class_init (GstNvEncoderClass * klass)
           GST_TYPE_NV_ENCODER_SEI_INSERT_MODE, DEFAULT_CC_INSERT,
           (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
 
+  /**
+   * GstNvEncoder:extern-cuda-bufferpool:
+   *
+   * GstCudaBufferPool prepared by application. Application can pass
+   * a buffer pool instance prepared in advance, to avoid
+   * global device synchronization caused by CUDA memory allocation.
+   *
+   * The buffer pool should be configured with stream-ordered-allocation disabled.
+   *
+   * Since: 1.26
+   */
+  g_object_class_install_property (object_class, PROP_EXTERN_POOL,
+      g_param_spec_object ("extern-cuda-bufferpool", "Extern CUDA Buffer Pool",
+          "GstCudaBufferPool prepared by application",
+          GST_TYPE_OBJECT,
+          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS |
+              GST_PARAM_MUTABLE_READY)));
+
+
   element_class->set_context = GST_DEBUG_FUNCPTR (gst_nv_encoder_set_context);
 
   videoenc_class->open = GST_DEBUG_FUNCPTR (gst_nv_encoder_open);
@@ -247,6 +275,31 @@ gst_nv_encoder_set_property (GObject * object, guint prop_id,
     case PROP_CC_INSERT:
       priv->cc_insert = (GstNvEncoderSeiInsertMode) g_value_get_enum (value);
       break;
+    case PROP_EXTERN_POOL:
+      gst_clear_object (&priv->extern_pool);
+      priv->extern_pool = (GstBufferPool *) g_value_dup_object (value);
+      if (priv->extern_pool) {
+        if (!GST_IS_CUDA_BUFFER_POOL (priv->extern_pool)) {
+          GST_ERROR_OBJECT (self, "Not a CUDA buffer pool");
+          gst_clear_object (&priv->extern_pool);
+        } else if (!gst_buffer_pool_set_active (priv->extern_pool, TRUE)) {
+          GST_ERROR_OBJECT (self, "Set active failed");
+          gst_clear_object (&priv->extern_pool);
+        } else {
+          auto config = gst_buffer_pool_get_config (priv->extern_pool);
+          GstCaps *caps;
+          gst_buffer_pool_config_get_params (config,
+              &caps, nullptr, nullptr, nullptr);
+          auto is_valid = gst_video_info_from_caps (&priv->extern_pool_info,
+              caps);
+          gst_structure_free (config);
+          if (!is_valid) {
+            GST_ERROR_OBJECT (self, "Invalid buffer pool");
+            gst_clear_object (&priv->extern_pool);
+          }
+        }
+      }
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -264,6 +317,9 @@ gst_nv_encoder_get_property (GObject * object, guint prop_id, GValue * value,
     case PROP_CC_INSERT:
       g_value_set_enum (value, priv->cc_insert);
       break;
+    case PROP_EXTERN_POOL:
+      g_value_set_object (value, priv->extern_pool);
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -782,7 +838,8 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query)
     gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
 
     /* Encoder does not seem to support stream ordered allocation */
-    gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
+    if (!priv->extern_pool)
+      gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
   }
 
   if (!gst_buffer_pool_set_config (pool, config)) {
@@ -1555,6 +1612,9 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
   GstCudaStream *stream;
   GstNvEncResource *resource = nullptr;
   const GstVideoInfo *info = &priv->input_state->info;
+  gboolean sync_done = FALSE;
+  guint out_stride = 0;
+  gboolean is_extern_mem = FALSE;
 
   mem = gst_buffer_peek_memory (buffer, 0);
 
@@ -1648,21 +1708,20 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
     return gst_nv_encoder_copy_system (self, info, buffer, task);
   }
 
+  out_stride = cmem->info.stride[0];
+
   if (gst_cuda_memory_is_stream_ordered (mem)) {
     GstBuffer *copy = nullptr;
-    GstVideoFrame in_frame, out_frame;
+    GstVideoFrame in_frame;
     CUDA_MEMCPY2D copy_params = { };
+    GstMemory *out_mem;
+    GstMapInfo out_map;
+    guint8 *out_data;
 
     stream = gst_cuda_memory_get_stream (cmem);
 
     GST_LOG_OBJECT (self, "Stream ordered allocation needs memory copy");
 
-    gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
-    if (!copy) {
-      GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
-      return GST_FLOW_ERROR;
-    }
-
     if (!gst_video_frame_map (&in_frame, info, buffer,
             (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) {
       GST_ERROR_OBJECT (self, "Couldn't map input buffer");
@@ -1670,14 +1729,52 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
       return GST_FLOW_ERROR;
     }
 
-    if (!gst_video_frame_map (&out_frame, info, copy,
-            (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
+    if (priv->extern_pool) {
+      auto cuda_pool = GST_CUDA_BUFFER_POOL (priv->extern_pool);
+      if (cuda_pool->context == priv->context) {
+        gst_buffer_pool_acquire_buffer (priv->extern_pool, &copy, nullptr);
+        if (copy) {
+          auto copy_mem = gst_buffer_peek_memory (copy, 0);
+          if (gst_cuda_memory_is_stream_ordered (copy_mem)) {
+            GST_LOG_OBJECT (self, "External pool uses stream ordered alloc");
+            gst_clear_buffer (&copy);
+          } else if (gst_memory_get_sizes (mem, nullptr, nullptr) >
+              gst_memory_get_sizes (copy_mem, nullptr, nullptr)) {
+            GST_LOG_OBJECT (self, "Too small extern pool buffer");
+            gst_clear_buffer (&copy);
+          } else {
+            is_extern_mem = TRUE;
+          }
+        }
+      }
+    }
+
+    if (!copy)
+      gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
+
+    if (!copy) {
+      GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
+      /* in_frame was mapped above and must be released on this early exit */
+      gst_video_frame_unmap (&in_frame);
+      return GST_FLOW_ERROR;
+    }
+
+    out_mem = gst_buffer_peek_memory (copy, 0);
+
+    if (!gst_memory_map (out_mem,
+            &out_map, (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
       GST_ERROR_OBJECT (self, "Couldn't map output buffer");
       gst_video_frame_unmap (&in_frame);
       gst_buffer_unref (copy);
       return GST_FLOW_ERROR;
     }
 
+    out_data = (guint8 *) out_map.data;
+    if (is_extern_mem)
+      out_stride = in_frame.info.stride[0];
+    else
+      out_stride = GST_CUDA_MEMORY_CAST (out_mem)->info.stride[0];
+
     for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (&in_frame); i++) {
       copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
       copy_params.srcDevice = (CUdeviceptr)
@@ -1685,9 +1780,8 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
       copy_params.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&in_frame, i);
 
       copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-      copy_params.dstDevice = (CUdeviceptr)
-          GST_VIDEO_FRAME_PLANE_DATA (&out_frame, i);
-      copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&out_frame, i);
+      copy_params.dstDevice = (CUdeviceptr) out_data;
+      copy_params.dstPitch = out_stride;
 
       copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
           GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
@@ -1698,18 +1792,22 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
       if (!gst_cuda_result (cuda_ret)) {
         GST_ERROR_OBJECT (self, "Copy failed");
         gst_video_frame_unmap (&in_frame);
-        gst_video_frame_unmap (&out_frame);
+        gst_memory_unmap (out_mem, &out_map);
 
         gst_buffer_unref (copy);
         return GST_FLOW_ERROR;
       }
+
+      out_data += GST_VIDEO_INFO_COMP_HEIGHT (info, i) * out_stride;
     }
 
     gst_video_frame_unmap (&in_frame);
-    gst_video_frame_unmap (&out_frame);
+    gst_memory_unmap (out_mem, &out_map);
 
-    if (stream && stream != priv->stream)
+    if (stream && stream != priv->stream) {
       CuStreamSynchronize (gst_cuda_stream_get_handle (stream));
+      sync_done = TRUE;
+    }
 
     buffer = copy;
     mem = gst_buffer_peek_memory (copy, 0);
@@ -1718,7 +1816,12 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
     buffer = gst_buffer_ref (buffer);
   }
 
-  status = object->AcquireResource (mem, &resource);
+  if (is_extern_mem) {
+    status = object->AcquireResourceWithSize (mem, info->width, info->height,
+        out_stride, &resource);
+  } else {
+    status = object->AcquireResource (mem, &resource);
+  }
 
   if (status != NV_ENC_SUCCESS) {
     GST_ERROR_OBJECT (self, "Failed to get resource, status %"
@@ -1729,7 +1832,7 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
   }
 
   stream = gst_cuda_memory_get_stream (cmem);
-  if (stream != priv->stream) {
+  if (stream != priv->stream && !sync_done) {
     /* different stream, needs sync */
     gst_cuda_memory_sync (cmem);
   }