From a2df44da7d88c55e1c3dec9b0291b027646ec22b Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Wed, 12 Jun 2024 01:02:39 +0900 Subject: [PATCH] d3d12: Workaround for Intel iGPU decoder crash An Intel GPU driver crash was observed when multiple decoders are configured in a process. It might be caused by frequent command queue alloc/free or by too many in-flight decoding commands. In order to make the command queue persistent and limit the number of in-flight command lists, the device now holds global decoding command queues. Part-of: --- .../gst-libs/gst/d3d12/gstd3d12-private.h | 22 +++ .../gst/d3d12/gstd3d12commandqueue.cpp | 2 +- .../gst/d3d12/gstd3d12device-private.h | 20 +++ .../gst-libs/gst/d3d12/gstd3d12device.cpp | 125 +++++++++++++++++- .../sys/d3d12/gstd3d12decoder.cpp | 63 ++++----- 5 files changed, 199 insertions(+), 33 deletions(-) diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12-private.h index fa78c20fb9..83034ffecc 100644 --- a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12-private.h +++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12-private.h @@ -89,4 +89,26 @@ private: GstD3D12Device *device_; }; +class GstD3D12DeviceDecoderLockGuard +{ +public: + explicit GstD3D12DeviceDecoderLockGuard(GstD3D12Device * device) : device_ (device) + { + if (device_) + gst_d3d12_device_decoder_lock (device_); + } + + ~GstD3D12DeviceDecoderLockGuard() + { + if (device_) + gst_d3d12_device_decoder_unlock (device_); + } + + GstD3D12DeviceDecoderLockGuard(const GstD3D12DeviceDecoderLockGuard&) = delete; + GstD3D12DeviceDecoderLockGuard& operator=(const GstD3D12DeviceDecoderLockGuard&) = delete; + +private: + GstD3D12Device *device_; +}; + #endif /* __cplusplus */ diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12commandqueue.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12commandqueue.cpp index a2eae45ed9..df85ddaf0a 100644 --- 
a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12commandqueue.cpp +++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12commandqueue.cpp @@ -164,7 +164,7 @@ gst_d3d12_command_queue_new (ID3D12Device * device, ComPtr < ID3D12CommandQueue > cq; auto hr = device->CreateCommandQueue (desc, IID_PPV_ARGS (&cq)); if (FAILED (hr)) { - GST_ERROR ("Couldn't create command queue, hr: 0x%x", (guint) hr); + GST_WARNING ("Couldn't create command queue, hr: 0x%x", (guint) hr); return nullptr; } diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device-private.h index d7e176463a..651b2c4b9a 100644 --- a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device-private.h +++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device-private.h @@ -25,6 +25,14 @@ G_BEGIN_DECLS +enum GstD3D12WAFlags +{ + GST_D3D12_WA_NONE = 0, + GST_D3D12_WA_DECODER_RACE = (1 << 0), +}; + +DEFINE_ENUM_FLAG_OPERATORS (GstD3D12WAFlags); + struct GstD3D12CopyTextureRegionArgs { D3D12_TEXTURE_COPY_LOCATION dst; @@ -71,5 +79,17 @@ void gst_d3d12_device_11on12_unlock (GstD3D12Device * device); GST_D3D12_API void gst_d3d12_device_check_device_removed (GstD3D12Device * device); +GST_D3D12_API +GstD3D12CommandQueue * gst_d3d12_device_get_decode_queue (GstD3D12Device * device); + +GST_D3D12_API +void gst_d3d12_device_decoder_lock (GstD3D12Device * device); + +GST_D3D12_API +void gst_d3d12_device_decoder_unlock (GstD3D12Device * device); + +GST_D3D12_API +GstD3D12WAFlags gst_d3d12_device_get_workaround_flags (GstD3D12Device * device); + G_END_DECLS diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device.cpp index 04b8a2faca..bf0e0bd126 100644 --- a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device.cpp +++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12device.cpp @@ -126,6 +126,8 @@ struct 
DeviceInner gst_clear_object (&direct_queue); gst_clear_object (&copy_queue); + for (guint i = 0; i < num_decode_queue; i++) + gst_clear_object (&decode_queue[i]); gst_clear_object (&direct_ca_pool); gst_clear_object (&direct_cl_pool); @@ -154,6 +156,9 @@ struct DeviceInner if (copy_queue) gst_d3d12_command_queue_drain (copy_queue); + + for (guint i = 0; i < num_decode_queue; i++) + gst_d3d12_command_queue_drain (decode_queue[i]); } void ReportLiveObjects () @@ -230,6 +235,11 @@ struct DeviceInner GstD3D12CommandQueue *direct_queue = nullptr; GstD3D12CommandQueue *copy_queue = nullptr; + GstD3D12CommandQueue *decode_queue[2] = { nullptr, }; + guint num_decode_queue = 0; + guint decode_queue_index = 0; + std::recursive_mutex decoder_lock; + GstD3D12WAFlags wa_flags = GST_D3D12_WA_NONE; GstD3D12CommandListPool *direct_cl_pool = nullptr; GstD3D12CommandAllocatorPool *direct_ca_pool = nullptr; @@ -239,6 +249,8 @@ struct DeviceInner GstD3D12FenceDataPool *fence_data_pool = nullptr; + D3D12_FEATURE_DATA_ARCHITECTURE feature_data_arch = { }; + guint rtv_inc_size; guint adapter_index = 0; @@ -961,6 +973,24 @@ gst_d3d12_device_find_adapter (const GstD3D12DeviceConstructData * data, return E_FAIL; } +static gboolean +is_intel_gen11_or_older (UINT vendor_id, D3D_FEATURE_LEVEL feature_level, + const std::string & description) +{ + if (vendor_id != 0x8086) + return FALSE; + + /* Arc GPU supports feature level 12.2 and iGPU Xe does 12.1 */ + if (feature_level <= D3D_FEATURE_LEVEL_12_0) + return TRUE; + + /* gen 11 is UHD xxx, older ones are HD xxx */ + if (description.find ("HD") != std::string::npos) + return TRUE; + + return FALSE; +} + static GstD3D12Device * gst_d3d12_device_new_internal (const GstD3D12DeviceConstructData * data) { @@ -970,6 +1000,13 @@ gst_d3d12_device_new_internal (const GstD3D12DeviceConstructData * data) HRESULT hr; UINT factory_flags = 0; guint index = 0; + const D3D_FEATURE_LEVEL feature_levels[] = { + D3D_FEATURE_LEVEL_11_0, + D3D_FEATURE_LEVEL_11_1, + 
D3D_FEATURE_LEVEL_12_0, + D3D_FEATURE_LEVEL_12_1, + D3D_FEATURE_LEVEL_12_2, + }; gst_d3d12_device_enable_debug (); gst_d3d12_device_enable_dred (); @@ -1014,16 +1051,31 @@ gst_d3d12_device_new_internal (const GstD3D12DeviceConstructData * data) priv->device_id = desc.DeviceId; priv->adapter_index = index; + device->CheckFeatureSupport (D3D12_FEATURE_ARCHITECTURE, + &priv->feature_data_arch, sizeof (D3D12_FEATURE_DATA_ARCHITECTURE)); + + D3D12_FEATURE_DATA_FEATURE_LEVELS flevel = { }; + flevel.NumFeatureLevels = G_N_ELEMENTS (feature_levels); + flevel.pFeatureLevelsRequested = feature_levels; + device->CheckFeatureSupport (D3D12_FEATURE_FEATURE_LEVELS, + &flevel, sizeof (flevel)); + std::wstring_convert < std::codecvt_utf8 < wchar_t >, wchar_t >converter; priv->description = converter.to_bytes (desc.Description); GST_INFO_OBJECT (self, "adapter index %d: D3D12 device vendor-id: 0x%04x, device-id: 0x%04x, " - "Flags: 0x%x, adapter-luid: %" G_GINT64_FORMAT ", %s", + "Flags: 0x%x, adapter-luid: %" G_GINT64_FORMAT ", is-UMA: %d, " + "feature-level: 0x%x, %s", priv->adapter_index, desc.VendorId, desc.DeviceId, desc.Flags, - priv->adapter_luid, priv->description.c_str ()); + priv->adapter_luid, priv->feature_data_arch.UMA, + flevel.MaxSupportedFeatureLevel, priv->description.c_str ()); gst_d3d12_device_setup_format_table (self); + if (priv->feature_data_arch.UMA && is_intel_gen11_or_older (priv->vendor_id, + flevel.MaxSupportedFeatureLevel, priv->description)) { + priv->wa_flags |= GST_D3D12_WA_DECODER_RACE; + } if (gst_d3d12_device_enable_debug ()) { ComPtr < ID3D12InfoQueue > info_queue; @@ -1071,6 +1123,30 @@ gst_d3d12_device_new_internal (const GstD3D12DeviceConstructData * data) priv->fence_data_pool = gst_d3d12_fence_data_pool_new (); + { + ComPtr < ID3D12VideoDevice > video_device; + auto hr = device.As (&video_device); + if (SUCCEEDED (hr)) { + queue_desc.Type = D3D12_COMMAND_LIST_TYPE_VIDEO_DECODE; + for (guint i = 0; i < G_N_ELEMENTS (priv->decode_queue); i++) { 
+ priv->decode_queue[i] = gst_d3d12_command_queue_new (device.Get (), + &queue_desc, D3D12_FENCE_FLAG_NONE, 8); + if (!priv->decode_queue[i]) + break; + + GST_OBJECT_FLAG_SET (priv->decode_queue[i], + GST_OBJECT_FLAG_MAY_BE_LEAKED); + priv->num_decode_queue++; + + /* XXX: Old Intel iGPU crashes with multiple decode queues */ + if ((priv->wa_flags & GST_D3D12_WA_DECODER_RACE) == + GST_D3D12_WA_DECODER_RACE) { + break; + } + } + } + } + GST_OBJECT_FLAG_SET (priv->direct_queue, GST_OBJECT_FLAG_MAY_BE_LEAKED); GST_OBJECT_FLAG_SET (priv->direct_cl_pool, GST_OBJECT_FLAG_MAY_BE_LEAKED); GST_OBJECT_FLAG_SET (priv->direct_ca_pool, GST_OBJECT_FLAG_MAY_BE_LEAKED); @@ -1823,3 +1899,48 @@ gst_d3d12_device_check_device_removed (GstD3D12Device * device) manager->OnDeviceRemoved (priv->adapter_luid); } } + +GstD3D12CommandQueue * +gst_d3d12_device_get_decode_queue (GstD3D12Device * device) +{ + g_return_val_if_fail (GST_IS_D3D12_DEVICE (device), nullptr); + auto priv = device->priv->inner; + + if (!priv->num_decode_queue) + return nullptr; + + std::lock_guard < std::mutex > lk (priv->lock); + auto queue = priv->decode_queue[priv->decode_queue_index]; + priv->decode_queue_index++; + priv->decode_queue_index %= priv->num_decode_queue; + + return queue; +} + +void +gst_d3d12_device_decoder_lock (GstD3D12Device * device) +{ + g_return_if_fail (GST_IS_D3D12_DEVICE (device)); + + auto priv = device->priv->inner; + if ((priv->wa_flags & GST_D3D12_WA_DECODER_RACE) == GST_D3D12_WA_DECODER_RACE) + priv->decoder_lock.lock (); +} + +void +gst_d3d12_device_decoder_unlock (GstD3D12Device * device) +{ + g_return_if_fail (GST_IS_D3D12_DEVICE (device)); + + auto priv = device->priv->inner; + if ((priv->wa_flags & GST_D3D12_WA_DECODER_RACE) == GST_D3D12_WA_DECODER_RACE) + priv->decoder_lock.unlock (); +} + +GstD3D12WAFlags +gst_d3d12_device_get_workaround_flags (GstD3D12Device * device) +{ + g_return_val_if_fail (GST_IS_D3D12_DEVICE (device), GST_D3D12_WA_NONE); + + return 
device->priv->inner->wa_flags; +} diff --git a/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12decoder.cpp b/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12decoder.cpp index dd058ddaa6..7244e75a82 100644 --- a/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12decoder.cpp +++ b/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12decoder.cpp @@ -226,15 +226,14 @@ struct DecoderCmdData { CloseHandle (event_handle); gst_clear_object (&ca_pool); - gst_clear_object (&queue); } ComPtr<ID3D12Device> device; - ComPtr<ID3D12VideoDevice> video_device; ComPtr<ID3D12VideoDecodeCommandList> cl; GstD3D12CommandQueue *queue = nullptr; GstD3D12CommandAllocatorPool *ca_pool = nullptr; + bool need_full_drain = false; /* Fence to wait at command record thread */ HANDLE event_handle; @@ -441,11 +440,7 @@ gst_d3d12_decoder_open (GstD3D12Decoder * decoder, GstElement * element) return FALSE; } - D3D12_COMMAND_QUEUE_DESC desc = { }; - desc.Type = D3D12_COMMAND_LIST_TYPE_VIDEO_DECODE; - desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; - cmd->queue = gst_d3d12_command_queue_new (cmd->device.Get (), &desc, - D3D12_FENCE_FLAG_NONE, ASYNC_DEPTH * 2); + cmd->queue = gst_d3d12_device_get_decode_queue (decoder->device); if (!cmd->queue) { GST_ERROR_OBJECT (element, "Couldn't create command queue"); return FALSE; @@ -454,6 +449,10 @@ gst_d3d12_decoder_open (GstD3D12Decoder * decoder, GstElement * element) cmd->ca_pool = gst_d3d12_command_allocator_pool_new (cmd->device.Get (), D3D12_COMMAND_LIST_TYPE_VIDEO_DECODE); + auto flags = gst_d3d12_device_get_workaround_flags (decoder->device); + if ((flags & GST_D3D12_WA_DECODER_RACE) == GST_D3D12_WA_DECODER_RACE) + cmd->need_full_drain = true; + priv->cmd = std::move (cmd); priv->flushing = false; @@ -511,14 +510,12 @@ gst_d3d12_decoder_close (GstD3D12Decoder * decoder) GST_DEBUG_OBJECT (decoder, "Close"); - if (priv->cmd) { - gst_d3d12_command_queue_fence_wait (priv->cmd->queue, priv->cmd->fence_val, - priv->cmd->event_handle); + { + GstD3D12DeviceDecoderLockGuard lk (decoder->device); + priv->session = nullptr; + priv->cmd = 
nullptr; } - priv->session = nullptr; - priv->cmd = nullptr; - gst_clear_object (&decoder->device); return TRUE; @@ -540,6 +537,13 @@ gst_d3d12_decoder_configure (GstD3D12Decoder * decoder, GST_FLOW_ERROR); g_return_val_if_fail (dpb_size > 0, GST_FLOW_ERROR); + if (!decoder->device) { + GST_ERROR_OBJECT (decoder, "Device was not configured"); + return GST_FLOW_ERROR; + } + + GstD3D12DeviceDecoderLockGuard dlk (decoder->device); + GstD3D12Format device_format; auto priv = decoder->priv; HRESULT hr; @@ -800,8 +804,12 @@ gst_d3d12_decoder_stop (GstD3D12Decoder * decoder) priv->flushing = true; if (priv->cmd) { - gst_d3d12_command_queue_fence_wait (priv->cmd->queue, priv->cmd->fence_val, - priv->cmd->event_handle); + if (priv->cmd->need_full_drain) { + gst_d3d12_command_queue_drain (priv->cmd->queue); + } else { + gst_d3d12_command_queue_fence_wait (priv->cmd->queue, + priv->cmd->fence_val, priv->cmd->event_handle); + } } if (priv->output_thread && priv->session) { @@ -814,6 +822,7 @@ gst_d3d12_decoder_stop (GstD3D12Decoder * decoder) g_clear_pointer (&priv->output_thread, g_thread_join); priv->flushing = false; + GstD3D12DeviceDecoderLockGuard lk (decoder->device); priv->session = nullptr; return TRUE; @@ -1112,8 +1121,8 @@ gst_d3d12_decoder_end_picture (GstD3D12Decoder * decoder, memset (&in_args, 0, sizeof (D3D12_VIDEO_DECODE_INPUT_STREAM_ARGUMENTS)); memset (&out_args, 0, sizeof (D3D12_VIDEO_DECODE_OUTPUT_STREAM_ARGUMENTS)); + GstD3D12DeviceDecoderLockGuard dlk (decoder->device); auto ca = gst_d3d12_command_allocator_get_handle (gst_ca); - hr = ca->Reset (); if (!gst_d3d12_result (hr, decoder->device)) { GST_ERROR_OBJECT (decoder, "Couldn't reset command allocator"); @@ -1299,17 +1308,6 @@ gst_d3d12_decoder_end_picture (GstD3D12Decoder * decoder, } decoder_pic->fence_val = priv->cmd->fence_val; - auto fence_handle = - gst_d3d12_command_queue_get_fence_handle (priv->cmd->queue); - dmem = (GstD3D12Memory *) gst_buffer_peek_memory (decoder_pic->buffer, 0); - 
gst_d3d12_memory_set_external_fence (dmem, - fence_handle, priv->cmd->fence_val); - if (decoder_pic->output_buffer) { - dmem = (GstD3D12Memory *) - gst_buffer_peek_memory (decoder_pic->output_buffer, 0); - gst_d3d12_memory_set_external_fence (dmem, - fence_handle, priv->cmd->fence_val); - } GstD3D12FenceData *fence_data; gst_d3d12_fence_data_pool_acquire (priv->fence_data_pool, &fence_data); @@ -1540,10 +1538,8 @@ gst_d3d12_decoder_process_output (GstD3D12Decoder * self, gst_buffer_ref (buffer)); } - auto fence_handle = - gst_d3d12_command_queue_get_fence_handle (priv->cmd->queue); gst_d3d12_device_copy_texture_region (self->device, copy_args.size (), - copy_args.data (), fence_data, fence_handle, decoder_pic->fence_val, + copy_args.data (), fence_data, nullptr, decoder_pic->fence_val, queue_type, &copy_fence_val); if (!out_resource) { @@ -1616,6 +1612,8 @@ gst_d3d12_decoder_output_loop (GstD3D12Decoder * self) GST_DEBUG_OBJECT (self, "Entering output thread"); + auto event_handle = CreateEventEx (nullptr, nullptr, 0, EVENT_ALL_ACCESS); + while (true) { DecoderOutputData output_data; { @@ -1636,6 +1634,9 @@ gst_d3d12_decoder_output_loop (GstD3D12Decoder * self) auto decoder_pic = get_decoder_picture (output_data.picture); g_assert (decoder_pic); + gst_d3d12_command_queue_fence_wait (priv->cmd->queue, + decoder_pic->fence_val, event_handle); + if (priv->flushing) { GST_DEBUG_OBJECT (self, "Drop framem, we are flushing"); gst_codec_picture_unref (output_data.picture); @@ -1660,6 +1661,8 @@ gst_d3d12_decoder_output_loop (GstD3D12Decoder * self) GST_DEBUG_OBJECT (self, "Leaving output thread"); + CloseHandle (event_handle); + return nullptr; }