From b266aa5e65635d76e984be2aa4f7421a25cca8eb Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Thu, 29 Aug 2024 20:15:20 +0900
Subject: [PATCH] cuda: Add support for stream ordered allocation

Default CUDA memory allocation will cause implicit global
synchronization. This stream ordered allocation can avoid it
since memory allocation and free operations are asynchronous
and executed in the associated cuda stream context

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/7427>
---
 .../gst-libs/gst/cuda/gstcuda-private.h       |   3 +
 .../gst-libs/gst/cuda/gstcudabufferpool.cpp   |  53 ++++++-
 .../gst-libs/gst/cuda/gstcudabufferpool.h     |   7 +
 .../gst-libs/gst/cuda/gstcudamemory.cpp       | 136 ++++++++++++++----
 .../gst-libs/gst/cuda/gstcudamemory.h         |  14 ++
 5 files changed, 182 insertions(+), 31 deletions(-)

diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
index f2d8761518..8799941ecb 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
@@ -56,6 +56,9 @@ void          gst_cuda_memory_set_from_fixed_pool (GstMemory * mem);
 GST_CUDA_API
 gboolean      gst_cuda_memory_is_from_fixed_pool (GstMemory * mem);
 
+GST_CUDA_API
+gboolean      gst_cuda_memory_is_stream_ordered (GstMemory * mem);
+
 gpointer      gst_cuda_get_win32_handle_metadata (void);
 
 G_END_DECLS
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
index d933a0fad5..a5efd1fedc 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
@@ -133,8 +133,14 @@ gst_cuda_buffer_pool_set_config (GstBufferPool * pool, GstStructure * config)
     priv->alloc = gst_cuda_pool_allocator_new_for_virtual_memory (self->context,
         priv->stream, &info, &priv->prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
   } else {
-    priv->alloc = gst_cuda_pool_allocator_new (self->context, priv->stream,
-        &info);
+    gboolean stream_ordered =
+        gst_buffer_pool_config_get_cuda_stream_ordered_alloc (config);
+    GstStructure *alloc_config = gst_structure_new ("alloc-config",
+        GST_CUDA_ALLOCATOR_OPT_STREAM_ORDERED, G_TYPE_BOOLEAN, stream_ordered,
+        nullptr);
+
+    priv->alloc = gst_cuda_pool_allocator_new_full (self->context, priv->stream,
+        &info, alloc_config);
   }
 
   if (!priv->alloc) {
@@ -334,6 +340,49 @@ gst_buffer_pool_config_set_cuda_alloc_method (GstStructure * config,
       GST_TYPE_CUDA_MEMORY_ALLOC_METHOD, method, nullptr);
 }
 
+/**
+ * gst_buffer_pool_config_get_cuda_stream_ordered_alloc:
+ * @config: a buffer pool config
+ *
+ * Returns: %TRUE if stream ordered allocation is enabled
+ *
+ * Since: 1.26
+ */
+gboolean
+gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config)
+{
+  gboolean stream_ordered = FALSE;
+
+  g_return_val_if_fail (config, FALSE);
+
+  if (!gst_structure_get_boolean (config,
+          "cuda-stream-ordered-alloc", &stream_ordered)) {
+    /* Disable stream-ordered alloc by default */
+    return FALSE;
+  }
+
+  return stream_ordered;
+}
+
+/**
+ * gst_buffer_pool_config_set_cuda_stream_ordered_alloc:
+ * @config: a buffer pool config
+ * @stream_ordered: whether stream ordered allocation is allowed
+ *
+ * Sets stream ordered allocation option
+ *
+ * Since: 1.26
+ */
+void
+gst_buffer_pool_config_set_cuda_stream_ordered_alloc (GstStructure * config,
+    gboolean stream_ordered)
+{
+  g_return_if_fail (config);
+
+  gst_structure_set (config, "cuda-stream-ordered-alloc", G_TYPE_BOOLEAN,
+      stream_ordered, nullptr);
+}
+
 static void
 gst_cuda_buffer_pool_dispose (GObject * object)
 {
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
index 1ae7b5dbcf..2c42130338 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
@@ -86,5 +86,12 @@ GST_CUDA_API
 void                     gst_buffer_pool_config_set_cuda_alloc_method (GstStructure * config,
                                                                        GstCudaMemoryAllocMethod method);
 
+GST_CUDA_API
+gboolean        gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config);
+
+GST_CUDA_API
+void            gst_buffer_pool_config_set_cuda_stream_ordered_alloc (GstStructure * config,
+                                                                      gboolean stream_ordered);
+
 G_END_DECLS
 
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.cpp
index dfe4c01b65..8c4a98a3a0 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.cpp
@@ -115,8 +115,8 @@ struct _GstCudaMemoryPrivate
   CUtexObject texture[GST_VIDEO_MAX_PLANES][2];
 
   gboolean saw_io = FALSE;
-
   gboolean from_fixed_pool = FALSE;
+  gboolean stream_ordred_alloc = FALSE;
 
   std::map < gint64, std::unique_ptr < GstCudaMemoryTokenData >> token_map;
 
@@ -280,10 +280,19 @@ gst_cuda_allocator_update_info (const GstVideoInfo * reference,
   return TRUE;
 }
 
+static size_t
+do_align (size_t value, size_t align)
+{
+  if (align == 0)
+    return value;
+
+  return ((value + align - 1) / align) * align;
+}
+
 static GstMemory *
 gst_cuda_allocator_alloc_internal (GstCudaAllocator * self,
     GstCudaContext * context, GstCudaStream * stream, const GstVideoInfo * info,
-    guint width_in_bytes, guint alloc_height)
+    guint width_in_bytes, guint alloc_height, gboolean stream_ordered)
 {
   GstCudaMemoryPrivate *priv;
   GstCudaMemory *mem;
@@ -291,12 +300,26 @@ gst_cuda_allocator_alloc_internal (GstCudaAllocator * self,
   gboolean ret = FALSE;
   gsize pitch;
   GstVideoInfo alloc_info;
+  gint texture_align = gst_cuda_context_get_texture_alignment (context);
+  auto stream_handle = gst_cuda_stream_get_handle (stream);
 
   if (!gst_cuda_context_push (context))
     return nullptr;
 
-  ret = gst_cuda_result (CuMemAllocPitch (&data, &pitch, width_in_bytes,
-          alloc_height, 16));
+  if (stream_ordered) {
+    g_assert (stream_handle);
+
+    pitch = do_align (width_in_bytes, texture_align);
+    ret = gst_cuda_result (CuMemAllocAsync (&data, pitch * alloc_height,
+            stream_handle));
+
+    if (ret)
+      ret = gst_cuda_result (CuStreamSynchronize (stream_handle));
+  } else {
+    ret = gst_cuda_result (CuMemAllocPitch (&data, &pitch, width_in_bytes,
+            alloc_height, 16));
+  }
+
   gst_cuda_context_pop (nullptr);
 
   if (!ret) {
@@ -307,7 +330,12 @@ gst_cuda_allocator_alloc_internal (GstCudaAllocator * self,
   if (!gst_cuda_allocator_update_info (info, pitch, alloc_height, &alloc_info)) {
     GST_ERROR_OBJECT (self, "Couldn't calculate aligned info");
     gst_cuda_context_push (context);
-    CuMemFree (data);
+
+    if (stream_ordered)
+      CuMemFreeAsync (data, stream_handle);
+    else
+      CuMemFree (data);
+
     gst_cuda_context_pop (nullptr);
     return nullptr;
   }
@@ -321,7 +349,8 @@ gst_cuda_allocator_alloc_internal (GstCudaAllocator * self,
   priv->pitch = pitch;
   priv->width_in_bytes = width_in_bytes;
   priv->height = alloc_height;
-  priv->texture_align = gst_cuda_context_get_texture_alignment (context);
+  priv->texture_align = texture_align;
+  priv->stream_ordred_alloc = stream_ordered;
   if (stream)
     priv->stream = gst_cuda_stream_ref (stream);
 
@@ -370,7 +399,12 @@ gst_cuda_allocator_free (GstAllocator * allocator, GstMemory * memory)
 #endif
       }
     } else {
-      gst_cuda_result (CuMemFree (priv->data));
+      if (priv->stream_ordred_alloc) {
+        gst_cuda_result (CuMemFreeAsync (priv->data,
+                gst_cuda_stream_get_handle (priv->stream)));
+      } else {
+        gst_cuda_result (CuMemFree (priv->data));
+      }
     }
   }
 
@@ -570,7 +604,8 @@ cuda_mem_copy (GstMemory * mem, gssize offset, gssize size)
 
   if (!copy) {
     copy = gst_cuda_allocator_alloc_internal (self, context, stream,
-        &src_mem->info, src_mem->priv->width_in_bytes, src_mem->priv->height);
+        &src_mem->info, src_mem->priv->width_in_bytes, src_mem->priv->height,
+        FALSE);
   }
 
   if (!copy) {
@@ -1113,7 +1148,7 @@ gst_cuda_allocator_alloc (GstCudaAllocator * allocator,
   alloc_height = gst_cuda_allocator_calculate_alloc_height (info);
 
   return gst_cuda_allocator_alloc_internal (allocator, context, stream,
-      info, info->stride[0], alloc_height);
+      info, info->stride[0], alloc_height, FALSE);
 }
 
 /**
@@ -1235,13 +1270,17 @@ gst_cuda_memory_is_from_fixed_pool (GstMemory * mem)
   return cmem->priv->from_fixed_pool;
 }
 
-static size_t
-do_align (size_t value, size_t align)
+gboolean
+gst_cuda_memory_is_stream_ordered (GstMemory * mem)
 {
-  if (align == 0)
-    return value;
+  GstCudaMemory *cmem;
 
-  return ((value + align - 1) / align) * align;
+  if (!gst_is_cuda_memory (mem))
+    return FALSE;
+
+  cmem = GST_CUDA_MEMORY_CAST (mem);
+
+  return cmem->priv->stream_ordred_alloc;
 }
 
 /**
@@ -1401,6 +1440,7 @@ struct _GstCudaPoolAllocatorPrivate
   guint outstanding;
   guint cur_mems;
   gboolean flushing;
+  gboolean stream_ordered_alloc;
 };
 
 static void gst_cuda_pool_allocator_finalize (GObject * object);
@@ -1700,10 +1740,16 @@ gst_cuda_pool_allocator_alloc (GstCudaPoolAllocator * self, GstMemory ** mem)
     new_mem = gst_cuda_allocator_virtual_alloc (nullptr,
         self->context, self->stream, &self->info, &priv->prop,
         priv->granularity_flags);
+  } else if (priv->stream_ordered_alloc && self->stream) {
+    auto allocator = (GstCudaAllocator *) _gst_cuda_allocator;
+    new_mem = gst_cuda_allocator_alloc_internal (allocator,
+        self->context, self->stream, &self->info, self->info.stride[0],
+        gst_cuda_allocator_calculate_alloc_height (&self->info), TRUE);
   } else {
     new_mem = gst_cuda_allocator_alloc (nullptr,
         self->context, self->stream, &self->info);
   }
+
   if (!new_mem) {
     GST_ERROR_OBJECT (self, "Failed to allocate new memory");
     g_atomic_int_add (&priv->cur_mems, -1);
@@ -1813,21 +1859,7 @@ GstCudaPoolAllocator *
 gst_cuda_pool_allocator_new (GstCudaContext * context, GstCudaStream * stream,
     const GstVideoInfo * info)
 {
-  GstCudaPoolAllocator *self;
-
-  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), nullptr);
-  g_return_val_if_fail (!stream || GST_IS_CUDA_STREAM (stream), nullptr);
-
-  self = (GstCudaPoolAllocator *)
-      g_object_new (GST_TYPE_CUDA_POOL_ALLOCATOR, nullptr);
-  gst_object_ref_sink (self);
-
-  self->context = (GstCudaContext *) gst_object_ref (context);
-  if (stream)
-    self->stream = gst_cuda_stream_ref (stream);
-  self->info = *info;
-
-  return self;
+  return gst_cuda_pool_allocator_new_full (context, stream, info, nullptr);
 }
 
 /**
@@ -1873,6 +1905,52 @@ gst_cuda_pool_allocator_new_for_virtual_memory (GstCudaContext * context,
   return self;
 }
 
+/**
+ * gst_cuda_pool_allocator_new_full:
+ * @context: a #GstCudaContext
+ * @stream: (allow-none): a #GstCudaStream
+ * @info: a #GstVideoInfo
+ * @config: (transfer full) (allow-none): a #GstStructure with configuration options
+ *
+ * Creates a new #GstCudaPoolAllocator instance with given @config
+ *
+ * Returns: (transfer full): a new #GstCudaPoolAllocator instance
+ *
+ * Since: 1.26
+ */
+GstCudaPoolAllocator *
+gst_cuda_pool_allocator_new_full (GstCudaContext * context,
+    GstCudaStream * stream, const GstVideoInfo * info, GstStructure * config)
+{
+  GstCudaPoolAllocator *self;
+
+  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), nullptr);
+  g_return_val_if_fail (!stream || GST_IS_CUDA_STREAM (stream), nullptr);
+
+  self = (GstCudaPoolAllocator *)
+      g_object_new (GST_TYPE_CUDA_POOL_ALLOCATOR, nullptr);
+  gst_object_ref_sink (self);
+
+  self->context = (GstCudaContext *) gst_object_ref (context);
+  if (stream)
+    self->stream = gst_cuda_stream_ref (stream);
+  self->info = *info;
+
+  if (config) {
+    gboolean allow_stream_ordered = FALSE;
+    if (stream && gst_structure_get_boolean (config,
+            GST_CUDA_ALLOCATOR_OPT_STREAM_ORDERED, &allow_stream_ordered) &&
+        allow_stream_ordered) {
+      g_object_get (context,
+          "stream-ordered-alloc", &self->priv->stream_ordered_alloc, nullptr);
+    }
+
+    gst_structure_free (config);
+  }
+
+  return self;
+}
+
 /**
  * gst_cuda_pool_allocator_acquire_memory:
  * @allocator: a #GstCudaPoolAllocator
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.h
index e3a8eb9912..361c2bcf6d 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudamemory.h
@@ -174,6 +174,15 @@ typedef enum
 GST_CUDA_API
 GType gst_cuda_memory_alloc_method_get_type (void);
 
+/**
+ * GST_CUDA_ALLOCATOR_OPT_STREAM_ORDERED:
+ *
+ * #G_TYPE_BOOLEAN Allows stream ordered allocation. Default is %FALSE
+ *
+ * Since: 1.26
+ */
+#define GST_CUDA_ALLOCATOR_OPT_STREAM_ORDERED "GstCudaAllocator.stream-ordered"
+
 /**
  * GstCudaMemory:
  *
@@ -339,6 +348,11 @@ GstCudaPoolAllocator * gst_cuda_pool_allocator_new_for_virtual_memory (GstCudaCo
                                                                        CUmemAllocationGranularity_flags granularity_flags);
 
 GST_CUDA_API
+GstCudaPoolAllocator * gst_cuda_pool_allocator_new_full (GstCudaContext * context,
+                                                         GstCudaStream * stream,
+                                                         const GstVideoInfo * info,
+                                                         GstStructure * config);
+GST_CUDA_API
 GstFlowReturn          gst_cuda_pool_allocator_acquire_memory (GstCudaPoolAllocator * allocator,
                                                                GstMemory ** memory);