From cdaa798ac783d618638e839d5ae5d8bbe5e9bd4f Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Tue, 10 Sep 2024 19:29:44 +0900
Subject: [PATCH] cuda: Add methods to enable stream ordered allocation

Add a prefer-stream-ordered-alloc property to GstCudaContext.
If the stream ordered allocation buffer pool option is not configured
and this property is enabled, the buffer pool will enable stream
ordered allocation. Otherwise it will follow the default behavior.

If the GST_CUDA_ENABLE_STREAM_ORDERED_ALLOC environment variable is set,
the default behavior is to enable stream ordered allocation.
Otherwise the synchronous alloc/free methods will be used.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/7427>
---
 .../gst-plugins-bad/docs/libs/cuda/index.md   |  9 ++++
 .../gst-libs/gst/cuda/gstcudabufferpool.cpp   | 46 ++++++++++++++++---
 .../gst-libs/gst/cuda/gstcudabufferpool.h     |  3 +-
 .../gst-libs/gst/cuda/gstcudacontext.cpp      | 29 ++++++++++++
 4 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/subprojects/gst-plugins-bad/docs/libs/cuda/index.md b/subprojects/gst-plugins-bad/docs/libs/cuda/index.md
index 5e529291b8..abfe9b061e 100644
--- a/subprojects/gst-plugins-bad/docs/libs/cuda/index.md
+++ b/subprojects/gst-plugins-bad/docs/libs/cuda/index.md
@@ -23,3 +23,12 @@ Example: `GST_CUDA_CRITICAL_ERRORS=2,700`
 As a result of the above example, if `CUDA_ERROR_OUT_OF_MEMORY(2)` or
 `CUDA_ERROR_ILLEGAL_ADDRESS(700)` error is detected in GStreamer CUDA library,
 the process will be aborted.
+
+
+**`GST_CUDA_ENABLE_STREAM_ORDERED_ALLOC`. (Since: 1.26)**
+
+As of 1.26, the GStreamer CUDA library supports stream ordered CUDA allocation
+(e.g., `cuMemAllocAsync`). This allocation method is disabled by default
+unless it is explicitly requested via the buffer pool option.
+Setting this environment variable (to any value) changes the default behavior
+so that stream ordered allocation is used when the buffer pool option is not set.
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
index a5efd1fedc..49ea29ae80 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.cpp
@@ -86,6 +86,19 @@ gst_cuda_buffer_pool_update_alloc_prop (GstCudaBufferPool * self)
   return TRUE;
 }
 
+static gboolean
+default_stream_ordered_alloc_enabled (void)
+{  /* Returns the environment-controlled default for stream ordered allocation */
+  static gboolean enabled = FALSE;  /* shared across calls; FALSE when the env is unset */
+  GST_CUDA_CALL_ONCE_BEGIN {  /* NOTE(review): macro defined elsewhere, presumably a run-once guard */
+    if (g_getenv ("GST_CUDA_ENABLE_STREAM_ORDERED_ALLOC"))  /* presence only; value is ignored */
+      enabled = TRUE;
+  }
+  GST_CUDA_CALL_ONCE_END;
+
+  return enabled;
+}
+
 static gboolean
 gst_cuda_buffer_pool_set_config (GstBufferPool * pool, GstStructure * config)
 {
@@ -133,8 +146,25 @@ gst_cuda_buffer_pool_set_config (GstBufferPool * pool, GstStructure * config)
     priv->alloc = gst_cuda_pool_allocator_new_for_virtual_memory (self->context,
         priv->stream, &info, &priv->prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
   } else {
-    gboolean stream_ordered =
-        gst_buffer_pool_config_get_cuda_stream_ordered_alloc (config);
+    gboolean stream_ordered = FALSE;
+    if (!gst_buffer_pool_config_get_cuda_stream_ordered_alloc (config,
+            &stream_ordered)) {
+      gboolean prefer_stream_ordered = FALSE;
+      g_object_get (self->context, "prefer-stream-ordered-alloc",
+          &prefer_stream_ordered, nullptr);
+      if (prefer_stream_ordered) {
+        GST_DEBUG_OBJECT (self, "Stream ordered alloc was enabled in context");
+        stream_ordered = TRUE;
+      } else {
+        stream_ordered = default_stream_ordered_alloc_enabled ();
+        GST_DEBUG_OBJECT (self, "Use default stream ordered alloc: %d",
+            stream_ordered);
+      }
+    } else {
+      GST_DEBUG_OBJECT (self,
+          "stream ordered alloc by config: %d", stream_ordered);
+    }
+
     GstStructure *alloc_config = gst_structure_new ("alloc-config",
         GST_CUDA_ALLOCATOR_OPT_STREAM_ORDERED, G_TYPE_BOOLEAN, stream_ordered,
         nullptr);
@@ -343,13 +373,15 @@ gst_buffer_pool_config_set_cuda_alloc_method (GstStructure * config,
 /**
  * gst_buffer_pool_config_get_cuda_stream_ordered_alloc:
  * @config: a buffer pool config
+ * @enabled: (out): whether stream ordered allocation was requested or not
  *
- * Returns: %TRUE if stream ordered allocation is enabled
+ * Returns: %TRUE if the stream ordered allocation option was specified
  *
  * Since: 1.26
  */
 gboolean
-gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config)
+gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config,
+    gboolean * enabled)
 {
   gboolean stream_ordered = FALSE;
 
@@ -357,11 +389,13 @@ gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config)
 
   if (!gst_structure_get_boolean (config,
           "cuda-stream-ordered-alloc", &stream_ordered)) {
-    /* Disable stream-ordered alloc by default */
     return FALSE;
   }
 
-  return stream_ordered;
+  if (enabled)
+    *enabled = stream_ordered;
+
+  return TRUE;
 }
 
 /**
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
index 2c42130338..8dc8f54499 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudabufferpool.h
@@ -87,7 +87,8 @@ void                     gst_buffer_pool_config_set_cuda_alloc_method (GstStruct
                                                                        GstCudaMemoryAllocMethod method);
 
 GST_CUDA_API
-gboolean        gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config);
+gboolean        gst_buffer_pool_config_get_cuda_stream_ordered_alloc (GstStructure * config,
+                                                                      gboolean * enabled);
 
 GST_CUDA_API
 void            gst_buffer_pool_config_set_cuda_stream_ordered_alloc (GstStructure * config,
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
index ab6e0afa09..e5cc64925e 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
@@ -55,6 +55,7 @@ enum
   PROP_VIRTUAL_MEMORY,
   PROP_OS_HANDLE,
   PROP_STREAM_ORDERED_ALLOC,
+  PROP_PREFER_STREAM_ORDERED_ALLLOC,
 };
 
 struct _GstCudaContextPrivate
@@ -66,11 +67,14 @@ struct _GstCudaContextPrivate
   gboolean virtual_memory_supported;
   gboolean os_handle_supported;
   gboolean stream_ordered_alloc_supported;
+  gboolean prefer_stream_ordered_alloc;
 
   gint tex_align;
 
   GHashTable *accessible_peer;
   gboolean owns_context;
+
+  GMutex lock;
 };
 
 #define gst_cuda_context_parent_class parent_class
@@ -152,6 +156,17 @@ gst_cuda_context_class_init (GstCudaContextClass * klass)
           "Device supports stream ordered allocation", FALSE,
           (GParamFlags) (G_PARAM_READABLE | G_PARAM_STATIC_STRINGS)));
 
+  /**
+   * GstCudaContext:prefer-stream-ordered-alloc:
+   *
+   * Since: 1.26
+   */
+  g_object_class_install_property (gobject_class,
+      PROP_PREFER_STREAM_ORDERED_ALLLOC,
+      g_param_spec_boolean ("prefer-stream-ordered-alloc",
+          "Prefer Stream Ordered Alloc", "Prefers stream ordered allocation",
+          FALSE, (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
+
   gst_cuda_memory_init_once ();
 }
 
@@ -162,6 +177,7 @@ gst_cuda_context_init (GstCudaContext * context)
       gst_cuda_context_get_instance_private (context);
 
   priv->accessible_peer = g_hash_table_new (g_direct_hash, g_direct_equal);
+  g_mutex_init (&priv->lock);
 
   context->priv = priv;
 }
@@ -173,10 +189,16 @@ gst_cuda_context_set_property (GObject * object, guint prop_id,
   GstCudaContext *context = GST_CUDA_CONTEXT (object);
   GstCudaContextPrivate *priv = context->priv;
 
+
   switch (prop_id) {
     case PROP_DEVICE_ID:
       priv->device_id = g_value_get_uint (value);
       break;
+    case PROP_PREFER_STREAM_ORDERED_ALLLOC:
+      g_mutex_lock (&priv->lock);
+      priv->prefer_stream_ordered_alloc = g_value_get_boolean (value);
+      g_mutex_unlock (&priv->lock);
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -206,6 +228,11 @@ gst_cuda_context_get_property (GObject * object, guint prop_id,
     case PROP_STREAM_ORDERED_ALLOC:
       g_value_set_boolean (value, priv->stream_ordered_alloc_supported);
       break;
+    case PROP_PREFER_STREAM_ORDERED_ALLLOC:
+      g_mutex_lock (&priv->lock);
+      g_value_set_boolean (value, priv->prefer_stream_ordered_alloc);
+      g_mutex_unlock (&priv->lock);
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -419,6 +446,8 @@ gst_cuda_context_finalize (GObject * object)
     gst_cuda_result (CuCtxDestroy (priv->context));
   }
 
+  g_mutex_clear (&priv->lock);
+
   G_OBJECT_CLASS (parent_class)->finalize (object);
 }