fastsamtensordecoder: Add FastSAM tensor decoder
Co-authored-by: Vineet Suryan <vineet.suryan@collabora.com>
This commit is contained in:
parent
6db576f033
commit
b7f964929c
@ -248041,6 +248041,114 @@
|
||||
"tensordecoders": {
|
||||
"description": "Tensor decoders elements",
|
||||
"elements": {
|
||||
"fastsamtensordecoder": {
|
||||
"author": "Daniel Morin <daniel.morin@collabora.com>",
|
||||
"description": "Decode tensors output from the inference of FastSAM model (segmentation) on video frames. The original repository of the FastSAM is located at https://github.com/CASIA-IVA-Lab/FastSAM. For easy experimentation a strawberry segmentation model based on FastSAM architecture in Onnx format can be found at https://col.la/gstonnxmodelseg . This model already has tensors name embedded matching default values of tensors-name-masks and tensors-name-logits properties. It's also possible to embed tensor-ids into any model based on FastSAM architecture to allow this tensor-decoder to decode tensors. This process is described in the Readme of this repository: https://col.la/gstonnxmodels",
|
||||
"hierarchy": [
|
||||
"GstFastSAMTensorDecoder",
|
||||
"GstBaseTransform",
|
||||
"GstElement",
|
||||
"GstObject",
|
||||
"GInitiallyUnowned",
|
||||
"GObject"
|
||||
],
|
||||
"klass": "TensorDecoder/Video",
|
||||
"pad-templates": {
|
||||
"sink": {
|
||||
"caps": "video/x-raw:\n",
|
||||
"direction": "sink",
|
||||
"presence": "always"
|
||||
},
|
||||
"src": {
|
||||
"caps": "video/x-raw:\n",
|
||||
"direction": "src",
|
||||
"presence": "always"
|
||||
}
|
||||
},
|
||||
"properties": {
|
||||
"box-confidence-threshold": {
|
||||
"blurb": "Boxes with a location confidence level inferior to this threshold will be excluded",
|
||||
"conditionally-available": false,
|
||||
"construct": false,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "0.4",
|
||||
"max": "1",
|
||||
"min": "0",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "gfloat",
|
||||
"writable": true
|
||||
},
|
||||
"class-confidence-threshold": {
|
||||
"blurb": "Classes with a confidence level inferior to this threshold will be excluded",
|
||||
"conditionally-available": false,
|
||||
"construct": false,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "0.4",
|
||||
"max": "1",
|
||||
"min": "0",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "gfloat",
|
||||
"writable": true
|
||||
},
|
||||
"iou-threshold": {
|
||||
"blurb": "Maximum intersection-over-union between bounding boxes to consider them distinct.",
|
||||
"conditionally-available": false,
|
||||
"construct": false,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "0.7",
|
||||
"max": "1",
|
||||
"min": "0",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "gfloat",
|
||||
"writable": true
|
||||
},
|
||||
"max-detections": {
|
||||
"blurb": "Maximum object/masks detections.",
|
||||
"conditionally-available": false,
|
||||
"construct": false,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "100",
|
||||
"max": "-1",
|
||||
"min": "0",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "guint",
|
||||
"writable": true
|
||||
},
|
||||
"tensors-name-logits": {
|
||||
"blurb": "Name that identify FastSAM logits tensors.",
|
||||
"conditionally-available": false,
|
||||
"construct": true,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "Gst.Model.FastSAM.Segmentation.Logits",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "gchararray",
|
||||
"writable": true
|
||||
},
|
||||
"tensors-name-masks": {
|
||||
"blurb": "Name that identify FastSAM mask tensors.",
|
||||
"conditionally-available": false,
|
||||
"construct": true,
|
||||
"construct-only": false,
|
||||
"controllable": false,
|
||||
"default": "Gst.Model.FastSAM.Segmentation.Masks",
|
||||
"mutable": "null",
|
||||
"readable": true,
|
||||
"type": "gchararray",
|
||||
"writable": true
|
||||
}
|
||||
},
|
||||
"rank": "primary"
|
||||
},
|
||||
"ssdobjectdetector": {
|
||||
"author": "Aaron Boxer <aaron.boxer@collabora.com>, Marcus Edel <marcus.edel@collabora.com>",
|
||||
"description": "Apply tensor output from inference to detect objects in video frames",
|
||||
|
@ -0,0 +1,990 @@
|
||||
/*
|
||||
* GStreamer gstreamer-fastsamtensordecoder
|
||||
* Copyright (C) 2024 Collabora Ltd.
|
||||
* Authors: Daniel Morin <daniel.morin@collabora.com>
|
||||
* Vineet Suryan <vineet.suryan@collabora.com>
|
||||
*
|
||||
* gstfastsamtensordecoder.c
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Library General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Library General Public
|
||||
* License along with this library; if not, write to the
|
||||
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
||||
* Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
/**
|
||||
 * SECTION:element-fastsamtensordecoder
|
||||
* @short_description: Decode tensors from a FastSAM detection and segmentation
|
||||
* neural network.
|
||||
*
|
||||
*
|
||||
* This element can parse per-buffer inference tensors meta data generated by an upstream
|
||||
* inference element
|
||||
*
|
||||
*
|
||||
* ## Example launch command:
|
||||
*
|
||||
* Test image file, model file and labels file can be found here :
|
||||
* https://gitlab.collabora.com/gstreamer/onnx-models
|
||||
*
|
||||
* GST_DEBUG=fastsamtensordecoder \
|
||||
* gst-launch-1.0 multifilesrc location=strawberry_crops.jpg ! decodebin \
|
||||
* ! videoconvertscale add-borders=1 ! onnxinference execution-provider=cpu
|
||||
* model-file=segmentation.onnx input-image-format=chw input-tensor-offset=0 \
|
||||
* input-tensor-scale=255.0 ! fastsamtensordecoder \
|
||||
* class-confidence-threshold=0.8 iou-threshold=0.7 max-detections=100
|
||||
* ! objectdetectionoverlay object-detection-outline-color=0xFF0000FF
|
||||
* draw-labels=true ! segmentationoverlay hint-maximum-segment-type=50 \
|
||||
* ! videoconvert ! ximagesink
|
||||
*
|
||||
*/
|
||||
|
||||
/* Fix: was "HAVE_CONFI_H" (typo), so config.h was never included. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "gstfastsamtensordecoder.h"

#include <gst/analytics/analytics.h>

#include <math.h>
#include <string.h>             /* memset() used in instance init */
|
||||
|
||||
#define GST_MODEL_FASTSAM_SEGMENTATION_MASK \
|
||||
"Gst.Model.FastSAM.Segmentation.Masks"
|
||||
#define GST_MODEL_FASTSAM_SEGMENTATION_LOGITS \
|
||||
"Gst.Model.FastSAM.Segmentation.Logits"
|
||||
|
||||
GST_DEBUG_CATEGORY_STATIC (fastsam_tensor_decoder_debug);
|
||||
#define GST_CAT_DEFAULT fastsam_tensor_decoder_debug
|
||||
|
||||
GST_ELEMENT_REGISTER_DEFINE (fastsam_tensor_decoder, "fastsamtensordecoder",
|
||||
GST_RANK_PRIMARY, GST_TYPE_FASTSAM_TENSOR_DECODER);
|
||||
|
||||
/* GstFastSAMTensorDecoder properties, see properties description in
|
||||
* gst_fastsam_tensor_decoder_class_init for more details. */
|
||||
enum
|
||||
{
|
||||
PROP_0,
|
||||
PROP_BOX_CONFI_THRESH,
|
||||
PROP_CLS_CONFI_THRESH,
|
||||
PROP_IOU_THRESH,
|
||||
PROP_MAX_DETECTION,
|
||||
PROP_MASK_TENSOR_NAME,
|
||||
PROP_LOGITS_TENSOR_NAME
|
||||
};
|
||||
|
||||
/* For debug purpose */
|
||||
typedef struct _DebugCandidates
|
||||
{
|
||||
GstFastSAMTensorDecoder *self;
|
||||
gsize fields; /* Fields count do debug */
|
||||
gsize offset; /* Fields offset */
|
||||
gsize start; /* First field index to debug */
|
||||
} DebugCandidates;
|
||||
|
||||
/* Default properties value */
|
||||
static const gfloat DEFAULT_BOX_CONFI_THRESH = 0.4f;
|
||||
static const gfloat DEFAULT_CLS_CONFI_THRESH = 0.4f;
|
||||
static const gfloat DEFAULT_IOU_THRESH = 0.7f;
|
||||
static const gsize DEFAULT_MAX_DETECTION = 100;
|
||||
|
||||
/* Global variable storing class for OD. Generally OD has class
|
||||
* and we need to provide one but this class is just a placeholder.*/
|
||||
GQuark OOI_CLASS_ID;
|
||||
|
||||
/* Tensor-ids are defined by a string that is converted to a quark,
|
||||
* which is just an integer value using a hash function. For efficiency
|
||||
* we compare on the quark (hash value). Since tensor-id never change we
|
||||
* just calculate the hash once during initialization and store the value in
|
||||
* these variables. */
|
||||
GQuark GST_MODEL_FASTSAM_SEGMENTATION_MASKS_ID;
|
||||
GQuark GST_MODEL_FASTSAM_SEGMENTATION_LOGITS_ID;
|
||||
|
||||
/* GStreamer element srcpad template. Template of a srcpad that can receive
|
||||
* any raw video. */
|
||||
static GstStaticPadTemplate gst_fastsam_tensor_decoder_src_template =
|
||||
GST_STATIC_PAD_TEMPLATE ("src",
|
||||
GST_PAD_SRC,
|
||||
GST_PAD_ALWAYS,
|
||||
GST_STATIC_CAPS ("video/x-raw"));
|
||||
|
||||
/* GStreamer element sinkpad template. Template of a sinkpad that can receive
|
||||
* any raw video. */
|
||||
static GstStaticPadTemplate gst_fastsam_tensor_decoder_sink_template =
|
||||
GST_STATIC_PAD_TEMPLATE ("sink",
|
||||
GST_PAD_SINK,
|
||||
GST_PAD_ALWAYS,
|
||||
GST_STATIC_CAPS ("video/x-raw"));
|
||||
|
||||
/* Prototypes */
|
||||
static void gst_fastsam_tensor_decoder_set_property (GObject * object,
|
||||
guint prop_id, const GValue * value, GParamSpec * pspec);
|
||||
static void gst_fastsam_tensor_decoder_get_property (GObject * object,
|
||||
guint prop_id, GValue * value, GParamSpec * pspec);
|
||||
|
||||
static void gst_fastsam_tensor_decoder_finalize (GObject * object);
|
||||
|
||||
static GstFlowReturn gst_fastsam_tensor_decoder_transform_ip (GstBaseTransform *
|
||||
trans, GstBuffer * buf);
|
||||
static gboolean gst_fastsam_tensor_decoder_set_caps (GstBaseTransform * trans,
|
||||
GstCaps * incaps, GstCaps * outcaps);
|
||||
static void gst_fastsam_tensor_decoder_decode_masks_f32 (GstFastSAMTensorDecoder
|
||||
* self, GstTensor * masks_tensor, GstTensor * logits_tensor,
|
||||
GstAnalyticsRelationMeta * rmeta);
|
||||
|
||||
G_DEFINE_TYPE (GstFastSAMTensorDecoder, gst_fastsam_tensor_decoder,
|
||||
GST_TYPE_BASE_TRANSFORM);
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_class_init (GstFastSAMTensorDecoderClass * klass)
|
||||
{
|
||||
GObjectClass *gobject_class = (GObjectClass *) klass;
|
||||
GstElementClass *element_class = (GstElementClass *) klass;
|
||||
GstBaseTransformClass *basetransform_class = (GstBaseTransformClass *) klass;
|
||||
|
||||
/* Define GstFastSAMTensorDecoder debug category. */
|
||||
GST_DEBUG_CATEGORY_INIT (fastsam_tensor_decoder_debug, "fastsamtensordecoder",
|
||||
0, "Tensor decoder for FastSAM segmentation N.N.");
|
||||
|
||||
/* Set GObject vmethod to get and set property */
|
||||
gobject_class->set_property = gst_fastsam_tensor_decoder_set_property;
|
||||
gobject_class->get_property = gst_fastsam_tensor_decoder_get_property;
|
||||
|
||||
/* Set GObject vmethod finalize */
|
||||
gobject_class->finalize = gst_fastsam_tensor_decoder_finalize;
|
||||
|
||||
/* Define GstFastSAMTensorDecoder properties using GObject properties
|
||||
* interface.*/
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_BOX_CONFI_THRESH,
|
||||
g_param_spec_float ("box-confidence-threshold",
|
||||
"Box location confidence threshold",
|
||||
"Boxes with a location confidence level inferior to this threshold "
|
||||
"will be excluded",
|
||||
0.0, 1.0, DEFAULT_BOX_CONFI_THRESH,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_CLS_CONFI_THRESH,
|
||||
g_param_spec_float ("class-confidence-threshold",
|
||||
"Class confidence threshold",
|
||||
"Classes with a confidence level inferior to this threshold "
|
||||
"will be excluded",
|
||||
0.0, 1.0, DEFAULT_CLS_CONFI_THRESH,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_IOU_THRESH,
|
||||
g_param_spec_float ("iou-threshold",
|
||||
"Maximum IOU threshold",
|
||||
"Maximum intersection-over-union between bounding boxes to "
|
||||
"consider them distinct.",
|
||||
0.0, 1.0, DEFAULT_IOU_THRESH,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_MAX_DETECTION,
|
||||
g_param_spec_uint ("max-detections",
|
||||
"Maximum object/masks detections.",
|
||||
"Maximum object/masks detections.",
|
||||
0, G_MAXUINT, DEFAULT_MAX_DETECTION,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_MASK_TENSOR_NAME,
|
||||
g_param_spec_string ("tensors-name-masks",
|
||||
"Mask tensors name",
|
||||
"Name that identify FastSAM mask tensors.",
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_MASK,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_CONSTRUCT |
|
||||
G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
g_object_class_install_property (G_OBJECT_CLASS (klass),
|
||||
PROP_LOGITS_TENSOR_NAME,
|
||||
g_param_spec_string ("tensors-name-logits",
|
||||
"Logits tensors name",
|
||||
"Name that identify FastSAM logits tensors.",
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_LOGITS,
|
||||
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_CONSTRUCT |
|
||||
G_PARAM_STATIC_STRINGS)));
|
||||
|
||||
/* Element description. */
|
||||
gst_element_class_set_static_metadata (element_class, "fastsamtensordecoder",
|
||||
"TensorDecoder/Video",
|
||||
"Decode tensors output from the inference of FastSAM model (segmentation)"
|
||||
" on video frames. The original repository of the FastSAM is located at"
|
||||
" https://github.com/CASIA-IVA-Lab/FastSAM. For easy experimentation a"
|
||||
" strawberry segmentation model based on FastSAM architecture in Onnx "
|
||||
" format can be found at https://col.la/gstonnxmodelseg . This model "
|
||||
"already has tensors name embedded matching default "
|
||||
"values of tensors-masks-name and tensors-logits-name properties. It's "
|
||||
"also possible to embed tensor-ids into any model based on FastSAM "
|
||||
"architecture to allow this tensor-decoder to decode tensors. This "
|
||||
"process is described in the Readme of this repository: "
|
||||
"https://col.la/gstonnxmodels",
|
||||
"Daniel Morin <daniel.morin@collabora.com>");
|
||||
|
||||
/* Add pads to element base on pad template defined earlier */
|
||||
gst_element_class_add_pad_template (element_class,
|
||||
gst_static_pad_template_get (&gst_fastsam_tensor_decoder_src_template));
|
||||
gst_element_class_add_pad_template (element_class,
|
||||
gst_static_pad_template_get (&gst_fastsam_tensor_decoder_sink_template));
|
||||
|
||||
/* Set GstBaseTransform vmethod transform_ip. This methode is called
|
||||
* by the srcpad when it receive buffer. ip stand for in-place meaning the
|
||||
* buffer remain unchanged by the element. Tensor-decoder only monitor
|
||||
* buffer it receive for a meta attach to the buffer that is a GstTensorMeta
|
||||
* and has a tensor-id can be handled by GstFastSAMTensorDecoder. */
|
||||
basetransform_class->transform_ip =
|
||||
GST_DEBUG_FUNCPTR (gst_fastsam_tensor_decoder_transform_ip);
|
||||
|
||||
/* Set GstBaseTransform set_caps vmethod. This will be called once the
|
||||
* capability negotiation has been completed. We will be able to extract
|
||||
* resolution from this callback. */
|
||||
basetransform_class->set_caps =
|
||||
GST_DEBUG_FUNCPTR (gst_fastsam_tensor_decoder_set_caps);
|
||||
|
||||
/* Calculate the class id placeholder (also a quark) that will be set on all
|
||||
* OD analytics-meta. */
|
||||
OOI_CLASS_ID = g_quark_from_static_string ("FastSAM-None");
|
||||
|
||||
/* Calculate the FastSAM Mask tensor-id */
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_MASKS_ID =
|
||||
g_quark_from_static_string (GST_MODEL_FASTSAM_SEGMENTATION_MASK);
|
||||
|
||||
/* Calculate the FastSAM Logits tensor-id */
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_LOGITS_ID =
|
||||
g_quark_from_static_string (GST_MODEL_FASTSAM_SEGMENTATION_LOGITS);
|
||||
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_init (GstFastSAMTensorDecoder * self)
|
||||
{
|
||||
/* GstFastSAMTensorDecoder instance initialization */
|
||||
self->box_confi_thresh = DEFAULT_BOX_CONFI_THRESH;
|
||||
self->cls_confi_thresh = DEFAULT_CLS_CONFI_THRESH;
|
||||
self->iou_thresh = DEFAULT_IOU_THRESH;
|
||||
self->max_detection = DEFAULT_MAX_DETECTION;
|
||||
self->sel_candidates = NULL;
|
||||
self->selected = NULL;
|
||||
self->mask_w = 256;
|
||||
self->mask_h = 256;
|
||||
self->mask_length = self->mask_w * self->mask_h;
|
||||
memset (&self->mask_roi, 0, sizeof (BBox));
|
||||
self->mask_pool = NULL;
|
||||
gst_base_transform_set_passthrough (GST_BASE_TRANSFORM (self), FALSE);
|
||||
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_finalize (GObject * object)
|
||||
{
|
||||
GstFastSAMTensorDecoder *self = GST_FASTSAM_TENSOR_DECODER (object);
|
||||
|
||||
if (self->sel_candidates) {
|
||||
g_ptr_array_unref (g_steal_pointer (&self->sel_candidates));
|
||||
}
|
||||
|
||||
if (self->selected) {
|
||||
g_ptr_array_unref (g_steal_pointer (&self->selected));
|
||||
}
|
||||
|
||||
if (self->mask_pool) {
|
||||
gst_object_unref (self->mask_pool);
|
||||
}
|
||||
|
||||
G_OBJECT_CLASS (gst_fastsam_tensor_decoder_parent_class)->finalize (object);
|
||||
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_set_property (GObject * object, guint prop_id,
|
||||
const GValue * value, GParamSpec * pspec)
|
||||
{
|
||||
GstFastSAMTensorDecoder *self = GST_FASTSAM_TENSOR_DECODER (object);
|
||||
|
||||
switch (prop_id) {
|
||||
case PROP_BOX_CONFI_THRESH:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->box_confi_thresh = g_value_get_float (value);
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_CLS_CONFI_THRESH:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->cls_confi_thresh = g_value_get_float (value);
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_IOU_THRESH:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->iou_thresh = g_value_get_float (value);
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_MAX_DETECTION:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->max_detection = g_value_get_uint (value);
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_MASK_TENSOR_NAME:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->mask_tensor_id = g_quark_from_string (g_value_get_string (value));
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_LOGITS_TENSOR_NAME:
|
||||
GST_OBJECT_LOCK (self);
|
||||
self->logits_tensor_id = g_quark_from_string (g_value_get_string (value));
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
default:
|
||||
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_get_property (GObject * object, guint prop_id,
|
||||
GValue * value, GParamSpec * pspec)
|
||||
{
|
||||
GstFastSAMTensorDecoder *self = GST_FASTSAM_TENSOR_DECODER (object);
|
||||
|
||||
switch (prop_id) {
|
||||
case PROP_BOX_CONFI_THRESH:
|
||||
g_value_set_float (value, self->box_confi_thresh);
|
||||
break;
|
||||
case PROP_CLS_CONFI_THRESH:
|
||||
g_value_set_float (value, self->cls_confi_thresh);
|
||||
break;
|
||||
case PROP_IOU_THRESH:
|
||||
g_value_set_float (value, self->iou_thresh);
|
||||
break;
|
||||
case PROP_MAX_DETECTION:
|
||||
g_value_set_uint (value, self->max_detection);
|
||||
break;
|
||||
case PROP_MASK_TENSOR_NAME:
|
||||
GST_OBJECT_LOCK (self);
|
||||
g_value_set_string (value, g_quark_to_string (self->mask_tensor_id));
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
case PROP_LOGITS_TENSOR_NAME:
|
||||
GST_OBJECT_LOCK (self);
|
||||
g_value_set_string (value, g_quark_to_string (self->logits_tensor_id));
|
||||
GST_OBJECT_UNLOCK (self);
|
||||
break;
|
||||
default:
|
||||
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* gst_fastsam_tensor_decoder_get_tensor_meta
|
||||
* @buf:in: buffer
|
||||
* @mask_tensor:out: Mask tensor
|
||||
* @logits_tensor:out: Logits tensor
|
||||
* @return: TRUE if buf has mask and logits tensor attach to it.
|
||||
* Retrieve FastSAM masks and logits tensors from buffer.
|
||||
*/
|
||||
static gboolean
|
||||
gst_fastsam_tensor_decoder_get_tensor_meta (GstFastSAMTensorDecoder * self,
|
||||
GstBuffer * buf, GstTensor ** mask_tensor, GstTensor ** logits_tensor)
|
||||
{
|
||||
GstTensorMeta *tensor_meta;
|
||||
gint mask_tensor_idx, logits_tensor_idx;
|
||||
|
||||
g_return_val_if_fail (mask_tensor != NULL, FALSE);
|
||||
g_return_val_if_fail (logits_tensor != NULL, FALSE);
|
||||
|
||||
*mask_tensor = NULL;
|
||||
*logits_tensor = NULL;
|
||||
|
||||
/* Retrieve all TensorMeta attach the buffer */
|
||||
tensor_meta = gst_buffer_get_tensor_meta (buf);
|
||||
if (!tensor_meta) {
|
||||
GST_LOG_OBJECT (self, "No tensor meta");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
GST_LOG_OBJECT (self, "Num tensors %zu", tensor_meta->num_tensors);
|
||||
|
||||
/* Retrieve the index of the tensor that has a tensor-id matching
|
||||
* GST_MODEL_FASTSAM_SEGMENTATION_MASKS_ID in the GstTensorMeta. */
|
||||
mask_tensor_idx = gst_tensor_meta_get_index_from_id (tensor_meta,
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_MASKS_ID);
|
||||
|
||||
/* Retrieve the index of the tensor that has a tensor-id matching
|
||||
* GST_MODEL_FASTSAM_SEGMENTATION_LOGITS_ID in the GstTensorMeta. */
|
||||
logits_tensor_idx = gst_tensor_meta_get_index_from_id (tensor_meta,
|
||||
GST_MODEL_FASTSAM_SEGMENTATION_LOGITS_ID);
|
||||
|
||||
if (mask_tensor_idx >= 0 && logits_tensor_idx >= 0) {
|
||||
GST_LOG_OBJECT (self, "Masks tensor id: %d", mask_tensor_idx);
|
||||
GST_LOG_OBJECT (self, "Masks tensor id: %d", logits_tensor_idx);
|
||||
|
||||
*mask_tensor = tensor_meta->tensors[mask_tensor_idx];
|
||||
*logits_tensor = tensor_meta->tensors[logits_tensor_idx];
|
||||
|
||||
return TRUE;
|
||||
} else {
|
||||
GST_INFO_OBJECT (self, "Couldn't find mask or logits tensor, skipping");
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* gst_fastsam_tensor_decoder_set_caps:
|
||||
*
|
||||
* Callback on caps negociation completed. We use it here to retrieve
|
||||
* video resolution. See GstBaseTransform for more details.
|
||||
*/
|
||||
static gboolean
|
||||
gst_fastsam_tensor_decoder_set_caps (GstBaseTransform * trans, GstCaps * incaps,
|
||||
GstCaps * outcaps)
|
||||
{
|
||||
GstFastSAMTensorDecoder *self = GST_FASTSAM_TENSOR_DECODER (trans);
|
||||
|
||||
if (!gst_video_info_from_caps (&self->video_info, incaps)) {
|
||||
GST_ERROR_OBJECT (self, "Failed to parse caps");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (gst_base_transform_is_passthrough (trans)) {
|
||||
GST_ERROR_OBJECT (self, "Failed. Can't handle passthrough");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* The masks need to be cropped to fit the SAR of the image. */
|
||||
/* TODO: We're reconstructing the transformation that was done on the
|
||||
* original image based on the assumption that the complete image without
|
||||
* deformation would be analyzed. This assumption is not alway true and
|
||||
* we should try to find a way to convey this transformation information
|
||||
* and retrieve from here to know the transformation that need to be done
|
||||
* on the mask.*/
|
||||
|
||||
if (self->video_info.width > self->video_info.height) {
|
||||
self->bb2mask_gain = ((gfloat) self->mask_w) / self->video_info.width;
|
||||
self->mask_roi.x = 0;
|
||||
self->mask_roi.w = self->mask_w;
|
||||
self->mask_roi.h = ((gfloat) self->bb2mask_gain) * self->video_info.height;
|
||||
self->mask_roi.y = (self->mask_h - self->mask_roi.h) / 2;
|
||||
} else {
|
||||
self->bb2mask_gain = ((gfloat) self->mask_h) / self->video_info.height;
|
||||
self->mask_roi.y = 0;
|
||||
self->mask_roi.h = self->mask_h;
|
||||
self->mask_roi.w = self->bb2mask_gain * self->video_info.width;
|
||||
self->mask_roi.x = (self->mask_w - self->mask_roi.w) / 2;
|
||||
}
|
||||
|
||||
if (self->mask_pool == NULL) {
|
||||
GstVideoInfo minfo;
|
||||
GstCaps *caps;
|
||||
gst_video_info_init (&minfo);
|
||||
gst_video_info_set_format (&minfo, GST_VIDEO_FORMAT_GRAY8, 256, 256);
|
||||
caps = gst_video_info_to_caps (&minfo);
|
||||
self->mask_pool = gst_video_buffer_pool_new ();
|
||||
GstStructure *config = gst_buffer_pool_get_config (self->mask_pool);
|
||||
gst_buffer_pool_config_set_params (config, caps, self->mask_length, 0, 0);
|
||||
gst_buffer_pool_config_add_option (config,
|
||||
GST_BUFFER_POOL_OPTION_VIDEO_META);
|
||||
g_return_val_if_fail (gst_buffer_pool_set_config (self->mask_pool, config),
|
||||
FALSE);
|
||||
g_return_val_if_fail (gst_buffer_pool_set_active (self->mask_pool, TRUE),
|
||||
FALSE);
|
||||
gst_caps_unref (caps);
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* gst_fastsam_tensor_decoder_transform_ip:
|
||||
* @trans: Instance
|
||||
* @buf:inout: Buffer containing media and where tensors can be attached
|
||||
* @return: Flow errors
|
||||
* Decode FastSAM tensors, post-process tensors and store decoded information
|
||||
* into an analytics-meta that is attached to the buffer before been pushed
|
||||
* downstream.
|
||||
*/
|
||||
static GstFlowReturn
|
||||
gst_fastsam_tensor_decoder_transform_ip (GstBaseTransform * trans,
|
||||
GstBuffer * buf)
|
||||
{
|
||||
GstFastSAMTensorDecoder *self = GST_FASTSAM_TENSOR_DECODER (trans);
|
||||
GstTensor *masks_tensor, *logits_tensor;
|
||||
GstAnalyticsRelationMeta *rmeta;
|
||||
gsize mask_w, mask_h;
|
||||
|
||||
if (!gst_fastsam_tensor_decoder_get_tensor_meta (self, buf, &masks_tensor,
|
||||
&logits_tensor))
|
||||
return GST_FLOW_OK;
|
||||
|
||||
if (masks_tensor->num_dims < 3) {
|
||||
GST_ELEMENT_ERROR (self, STREAM, DECODE, (NULL),
|
||||
("Masks tensor must have at least 3 dimensions,"
|
||||
"but only has %zu", masks_tensor->num_dims));
|
||||
return GST_FLOW_ERROR;
|
||||
}
|
||||
|
||||
if (logits_tensor->num_dims != 4) {
|
||||
GST_ELEMENT_ERROR (self, STREAM, DECODE, (NULL),
|
||||
("Logits tensor must have 4 dimensions but has %zu",
|
||||
masks_tensor->num_dims));
|
||||
return GST_FLOW_ERROR;
|
||||
}
|
||||
|
||||
mask_w = logits_tensor->dims[2];
|
||||
mask_h = logits_tensor->dims[3];
|
||||
|
||||
/* The masks need to be cropped to fit the SAR of the image. */
|
||||
/* TODO: We're reconstructing the transformation that was done on the
|
||||
* original image based on the assumption that the complete image without
|
||||
* deformation would be analyzed. This assumption is not alway true and
|
||||
* we should try to find a way to convey this transformation information
|
||||
* and retrieve from here to know the transformation that need to be done
|
||||
* on the mask.*/
|
||||
|
||||
if (self->mask_w != mask_w || self->mask_h != mask_h) {
|
||||
self->mask_w = mask_w;
|
||||
self->mask_h = mask_h;
|
||||
self->mask_length = mask_w * mask_h;
|
||||
|
||||
if (self->video_info.width > self->video_info.height) {
|
||||
self->bb2mask_gain = ((gfloat) self->mask_w) / self->video_info.width;
|
||||
self->mask_roi.x = 0;
|
||||
self->mask_roi.w = self->mask_w;
|
||||
self->mask_roi.h =
|
||||
((gfloat) self->bb2mask_gain) * self->video_info.height;
|
||||
self->mask_roi.y = (self->mask_h - self->mask_roi.h) / 2;
|
||||
} else {
|
||||
self->bb2mask_gain = ((gfloat) self->mask_h) / self->video_info.height;
|
||||
self->mask_roi.y = 0;
|
||||
self->mask_roi.h = self->mask_h;
|
||||
self->mask_roi.w = self->bb2mask_gain * self->video_info.width;
|
||||
self->mask_roi.x = (self->mask_w - self->mask_roi.w) / 2;
|
||||
}
|
||||
|
||||
if (self->mask_pool) {
|
||||
gst_buffer_pool_set_active (self->mask_pool, FALSE);
|
||||
g_clear_object (&self->mask_pool);
|
||||
}
|
||||
}
|
||||
|
||||
if (self->mask_pool == NULL) {
|
||||
GstVideoInfo minfo;
|
||||
GstCaps *caps;
|
||||
gst_video_info_init (&minfo);
|
||||
gst_video_info_set_format (&minfo, GST_VIDEO_FORMAT_GRAY8, self->mask_w,
|
||||
self->mask_h);
|
||||
caps = gst_video_info_to_caps (&minfo);;
|
||||
self->mask_pool = gst_video_buffer_pool_new ();
|
||||
|
||||
GstStructure *config = gst_buffer_pool_get_config (self->mask_pool);
|
||||
gst_buffer_pool_config_set_params (config, caps, self->mask_length, 0, 0);
|
||||
gst_buffer_pool_config_add_option (config,
|
||||
GST_BUFFER_POOL_OPTION_VIDEO_META);
|
||||
gst_buffer_pool_set_config (self->mask_pool, config);
|
||||
gst_buffer_pool_set_active (self->mask_pool, TRUE);
|
||||
gst_caps_unref (caps);
|
||||
}
|
||||
|
||||
|
||||
static GstAnalyticsRelationMetaInitParams rmeta_init_params = {
|
||||
.initial_buf_size = 1024,
|
||||
.initial_relation_order = 10
|
||||
};
|
||||
|
||||
/* Retrieve or attach an analytics-relation-meta to the buffer.
|
||||
* Analytics-relation-meta are container that can reveive multiple
|
||||
* analytics-meta, like OD and Segmentation. The following call will only
|
||||
* retrieve an analytics-relation-meta if it exist or create one if it
|
||||
* does not exist. */
|
||||
rmeta = gst_buffer_add_analytics_relation_meta_full (buf, &rmeta_init_params);
|
||||
g_return_val_if_fail (rmeta != NULL, GST_FLOW_ERROR);
|
||||
|
||||
/* Decode masks_tensor and attach the information in a structured way
|
||||
* to rmeta.
|
||||
* TODO: I think we need to send both tensors masks and logits
|
||||
* to gst_fastsam_tensor_decoder_decode_masks_f32 since both are
|
||||
* required simultanously to extract the segmentation. If this is the case
|
||||
* we probably should rename gst_fastsam_tensor_decoder_decode_masks_f32 to
|
||||
* gst_fastsam_tensor_decoder_decode_f32. */
|
||||
gst_fastsam_tensor_decoder_decode_masks_f32 (self, masks_tensor,
|
||||
logits_tensor, rmeta);
|
||||
|
||||
return GST_FLOW_OK;
|
||||
}
|
||||
|
||||
/* Evaluate if there's an intersection between segement s1 and s2 */
|
||||
static guint
|
||||
linear_intersection (guint s1_min, guint s1_max, guint s2_min, guint s2_max)
|
||||
{
|
||||
guint tmp;
|
||||
if (s1_max > s2_min && s2_max > s1_min) {
|
||||
if (s1_min > s2_min) {
|
||||
tmp = (s2_max > s1_max) ? s1_max : s2_max;
|
||||
return tmp - s1_min;
|
||||
} else {
|
||||
tmp = (s1_max > s2_max) ? s2_max : s1_max;
|
||||
return tmp - s2_min;
|
||||
}
|
||||
}
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
/* Intersection-over-Union of two axis-aligned boxes given as (x, y, w, h).
 * Returns 0.0f when the boxes do not overlap.
 *
 * Rationale: a linear (1D) intersection is much cheaper than a full 2D
 * test. Project both boxes on the X axis first; if those segments do not
 * overlap, the boxes cannot intersect in 2D and we are done. Only when the
 * X projections overlap do we test the Y projections, and only when both
 * overlap do we compute the actual IoU. */
static gfloat
iou (guint bb1_x, guint bb1_y, guint bb1_w, guint bb1_h,
    guint bb2_x, guint bb2_y, guint bb2_w, guint bb2_h)
{
  const guint overlap_x =
      linear_intersection (bb1_x, bb1_x + bb1_w, bb2_x, bb2_x + bb2_w);

  if (overlap_x == 0)
    return 0.0f;

  const guint overlap_y =
      linear_intersection (bb1_y, bb1_y + bb1_h, bb2_y, bb2_y + bb2_h);

  if (overlap_y == 0)
    return 0.0f;

  const guint area1 = bb1_w * bb1_h;
  const guint area2 = bb2_w * bb2_h;
  const guint inter = overlap_x * overlap_y;
  const guint total = area1 + area2 - inter;

  return total == 0 ? 0.0f : ((gfloat) inter) / total;
}
|
||||
|
||||
/* Convert a candidate's centre-based box (cx, cy, w, h) into a top-left
 * based BBox. offset[] holds the per-field positions into the planar
 * candidate data: [0]=x, [1]=y, [2]=w, [3]=h. Width and height get +0.5
 * so the implicit float-to-integer conversion rounds to nearest. */
static void
gst_fastsam_tensor_decoder_convert_bbox (gfloat * candidate, gsize * offset,
    BBox * bbox)
{
  const gfloat width = candidate[offset[2]];
  const gfloat height = candidate[offset[3]];

  bbox->x = candidate[offset[0]] - (width / 2);
  bbox->y = candidate[offset[1]] - (height / 2);
  bbox->w = width + 0.5;
  bbox->h = height + 0.5;
}
|
||||
|
||||
/* Calculate iou between boundingbox of candidate c1 and c2
|
||||
*/
|
||||
static gfloat
|
||||
gst_fastsam_tensor_decoder_iou (gfloat * c1, gfloat * c2, gsize * offset,
|
||||
BBox * bb1, BBox * bb2)
|
||||
{
|
||||
gst_fastsam_tensor_decoder_convert_bbox (c1, offset, bb1);
|
||||
gst_fastsam_tensor_decoder_convert_bbox (c2, offset, bb2);
|
||||
return iou (bb1->x, bb1->y, bb1->w, bb1->h, bb2->x, bb2->y, bb2->w, bb2->h);
|
||||
}
|
||||
|
||||
/* GCompareDataFunc for sorting candidate pointers in descending order of
 * the field located `offset` floats past each candidate pointer (in
 * practice the class-confidence field). c1/c2 point at GPtrArray
 * elements, i.e. they are gfloat ** in disguise. */
static gint
gst_fastsam_tensor_decoder_sort_candidates (gconstpointer c1, gconstpointer c2,
    gpointer offset)
{
  const gsize field = GPOINTER_TO_SIZE (offset);
  const gfloat a = (*((const gfloat **) c1))[field];
  const gfloat b = (*((const gfloat **) c2))[field];

  if (a < b)
    return 1;
  if (a > b)
    return -1;
  return 0;
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_debug_print_candidate (gpointer candidate_,
|
||||
gpointer data)
|
||||
{
|
||||
DebugCandidates *ctx = data;
|
||||
const gfloat *candidate = candidate_;
|
||||
|
||||
for (gsize i = ctx->start; i < ctx->fields + ctx->start; i++) {
|
||||
GST_TRACE_OBJECT (ctx->self, "Field %lu: %f", i,
|
||||
*(candidate + (i * ctx->offset)));
|
||||
}
|
||||
}
|
||||
|
||||
static float
|
||||
sigmoid (float x)
|
||||
{
|
||||
/* Check for positive overflow */
|
||||
if (x > 0) {
|
||||
double exp_neg_x = exp (-x);
|
||||
return 1.0 / (1.0 + exp_neg_x);
|
||||
}
|
||||
/* Check for negative overflow and improve stability for negative x */
|
||||
else {
|
||||
double exp_x = exp (x);
|
||||
return exp_x / (1.0 + exp_x);
|
||||
}
|
||||
}
|
||||
|
||||
static gboolean
|
||||
gst_fastsam_tensor_decoder_decode_valid_bb (GstFastSAMTensorDecoder * self,
|
||||
gfloat x, gfloat y, gfloat w, gfloat h)
|
||||
{
|
||||
if (x > (GST_VIDEO_INFO_WIDTH (&self->video_info)))
|
||||
return FALSE;
|
||||
if (y > (GST_VIDEO_INFO_HEIGHT (&self->video_info)))
|
||||
return FALSE;
|
||||
if (x < -(gfloat) (GST_VIDEO_INFO_WIDTH (&self->video_info) / 2.0))
|
||||
return FALSE;
|
||||
if (y < -(gfloat) (GST_VIDEO_INFO_HEIGHT (&self->video_info) / 2.0))
|
||||
return FALSE;
|
||||
if (w <= 0)
|
||||
return FALSE;
|
||||
if (h <= 0)
|
||||
return FALSE;
|
||||
if (w > (GST_VIDEO_INFO_WIDTH (&self->video_info)))
|
||||
return FALSE;
|
||||
if (h > (GST_VIDEO_INFO_HEIGHT (&self->video_info)))
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void
|
||||
gst_fastsam_tensor_decoder_decode_masks_f32 (GstFastSAMTensorDecoder * self,
|
||||
GstTensor * masks_tensor, GstTensor * logits_tensor,
|
||||
GstAnalyticsRelationMeta * rmeta)
|
||||
{
|
||||
/*guint batch_size = masks_tensor->dims[0]; */
|
||||
/*guint num_masks = masks_tensor->dims[1]; */
|
||||
GstMapInfo map_info_masks, map_info_logits, out_mask_info;
|
||||
gfloat *candidate, **candidates, iou, *data_logits;
|
||||
gboolean rv, keep;
|
||||
gsize offset, x_offset, y_offset, w_offset, h_offset, c_offset, offsets[4];
|
||||
gsize m0_offset;
|
||||
GPtrArray *sel_candidates = self->sel_candidates, *selected = self->selected;
|
||||
BBox bb1, bb2, bb_mask;
|
||||
GstAnalyticsODMtd od_mtd;
|
||||
GstAnalyticsSegmentationMtd seg_mtd;
|
||||
guint8 *mask_data;
|
||||
|
||||
/* Retrieve memory at index 0 and map it in READ mode */
|
||||
rv = gst_buffer_map (masks_tensor->data, &map_info_masks, GST_MAP_READ);
|
||||
g_assert (rv);
|
||||
|
||||
/* Retrieve memory at index 0 from logits_tensor in READ mode */
|
||||
rv = gst_buffer_map (logits_tensor->data, &map_info_logits, GST_MAP_READ);
|
||||
g_assert (rv);
|
||||
data_logits = (gfloat *) map_info_logits.data;
|
||||
|
||||
GST_LOG_OBJECT (self, "Mask Tensor shape dims %zu", masks_tensor->num_dims);
|
||||
|
||||
/* Trace masks tensor dimensions */
|
||||
if (gst_debug_category_get_threshold (GST_CAT_DEFAULT) >= GST_LEVEL_TRACE) {
|
||||
for (gsize i = 0; i < masks_tensor->num_dims; i++) {
|
||||
GST_TRACE_OBJECT (self, "Masks Tensor dim %zu: %zu", i,
|
||||
masks_tensor->dims[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocated array to store selected candidates */
|
||||
if (sel_candidates == NULL) {
|
||||
/* Number of candidates can be large, keep the array to avoid frequent
|
||||
* allocation */
|
||||
sel_candidates = g_ptr_array_new_full (masks_tensor->dims[2], NULL);
|
||||
self->sel_candidates = sel_candidates;
|
||||
selected = g_ptr_array_new_full (masks_tensor->dims[2], NULL);
|
||||
self->selected = selected;
|
||||
} else {
|
||||
/* Reset lengths when we re-use arrays */
|
||||
sel_candidates->len = 0;
|
||||
selected->len = 0;
|
||||
}
|
||||
|
||||
/* masks_tensor->dims[2] contain the number of candidates. Let's call the
|
||||
* number of candidates C. We store this value in offset as we use it
|
||||
* calculate the offset of candidate fields. The variable #data_masks above point
|
||||
* at the masks tensor data, but candidates data is organize like a plane.
|
||||
* Candidates bbox X coord fields from 0 to C start at the begining of the
|
||||
* tensor data and are continguous in memory, followed by all candidates
|
||||
* field Y, followed by field W, ... followed by field class confidence level,
|
||||
* ..., followed by all candidates mask0, ..., followed by all candidates
|
||||
* mask31. Bellow we pre-calculate each field offset relative to the
|
||||
* candidate pointer (pointer to field X), which will allow us to easily
|
||||
* access each candiates field.
|
||||
* */
|
||||
offset = masks_tensor->dims[2];
|
||||
x_offset = 0;
|
||||
y_offset = offset;
|
||||
w_offset = 2 * offset;
|
||||
h_offset = 3 * offset;
|
||||
c_offset = 4 * offset;
|
||||
m0_offset = 5 * offset;
|
||||
offsets[0] = x_offset;
|
||||
offsets[1] = y_offset;
|
||||
offsets[2] = w_offset;
|
||||
offsets[3] = h_offset;
|
||||
|
||||
#define MASK_X(candidate, index) candidate[m0_offset + (index * offset)]
|
||||
#define BB_X(candidate) candidate[x_offset]
|
||||
#define BB_Y(candidate) candidate[y_offset]
|
||||
#define BB_W(candidate) candidate[w_offset]
|
||||
#define BB_H(candidate) candidate[h_offset]
|
||||
|
||||
candidate = (gfloat *) map_info_masks.data;;
|
||||
for (gsize c_idx = 0; c_idx < masks_tensor->dims[2]; c_idx++) {
|
||||
/* FastSAM only has one class, but this confidence level is still used
|
||||
* to evaluate the relevance of the candidate. Here we filter candidates
|
||||
* based on their class confidence level.*/
|
||||
if (candidate[c_offset] > self->cls_confi_thresh &&
|
||||
gst_fastsam_tensor_decoder_decode_valid_bb (self,
|
||||
BB_X (candidate), BB_Y (candidate), BB_W (candidate),
|
||||
BB_H (candidate))) {
|
||||
g_ptr_array_add (sel_candidates, candidate);
|
||||
GST_TRACE_OBJECT (self,
|
||||
"%lu: x,y=(%f;%f) w,h=(%f;%f), s=%f c=%f",
|
||||
c_idx,
|
||||
candidate[x_offset],
|
||||
candidate[y_offset],
|
||||
candidate[w_offset],
|
||||
candidate[h_offset],
|
||||
candidate[w_offset] * candidate[h_offset], candidate[c_offset]);
|
||||
}
|
||||
|
||||
/* Pointer arithmetic, going to the next candidate. This is the candidate
|
||||
* pointer that is now incremented to the next candidate which is also
|
||||
* the field X of the next candidate.*/
|
||||
candidate += 1;
|
||||
}
|
||||
|
||||
GST_LOG_OBJECT (self, "Selected candidates count: %u", sel_candidates->len);
|
||||
|
||||
/* We sort the remaining candidates because, in the next selection phase we
|
||||
* have a maximum and we want to make sure that considered only the candidates
|
||||
* with the highest class confidence level before potentially reaching the
|
||||
* maximum.*/
|
||||
g_ptr_array_sort_with_data (sel_candidates,
|
||||
gst_fastsam_tensor_decoder_sort_candidates, GSIZE_TO_POINTER (c_offset));
|
||||
|
||||
if (gst_debug_category_get_threshold (GST_CAT_DEFAULT) >= GST_LEVEL_TRACE) {
|
||||
/* For debug purpose only. Prints candidates before NMS */
|
||||
DebugCandidates ctx;
|
||||
ctx.start = 0;
|
||||
ctx.fields = 5;
|
||||
ctx.offset = offset;
|
||||
ctx.self = self;
|
||||
g_ptr_array_foreach (sel_candidates,
|
||||
gst_fastsam_tensor_decoder_debug_print_candidate, &ctx);
|
||||
}
|
||||
|
||||
GstBuffer *mask_buf;
|
||||
guint region_ids[2] = { 0, 0 };
|
||||
|
||||
/* Algorithm in part inspired by OpenCV NMSBoxes */
|
||||
candidates = (gfloat **) sel_candidates->pdata;
|
||||
for (gsize c = 0; c < sel_candidates->len; c++) {
|
||||
keep = TRUE;
|
||||
|
||||
/* We only want to a NMS using IoU between candidates we've decided to
|
||||
* keep and the new one we considering to keep. selected array contain
|
||||
* the candidates we decided to keep and candidates[c] is the candidate
|
||||
* we're considering to keep or reject */
|
||||
for (gsize s = 0; s < selected->len && keep; s++) {
|
||||
iou = gst_fastsam_tensor_decoder_iou (candidates[c], selected->pdata[s],
|
||||
offsets, &bb1, &bb2);
|
||||
keep = iou <= self->iou_thresh;
|
||||
}
|
||||
|
||||
if (keep) {
|
||||
candidate = sel_candidates->pdata[c];
|
||||
if (selected->len == 0) {
|
||||
/* The first bounding-box always get in as there's no others bbox
|
||||
* to filter on based on IoU */
|
||||
gst_fastsam_tensor_decoder_convert_bbox (candidate, offsets, &bb1);
|
||||
}
|
||||
|
||||
g_ptr_array_add (selected, candidate);
|
||||
region_ids[1] = selected->len;
|
||||
|
||||
/* We add the analytics-objectdetection-meta to the buffer. Since
|
||||
* there's only one class the class confidence level is set to -1.0
|
||||
* as it's deemed not important. */
|
||||
gst_analytics_relation_meta_add_od_mtd (rmeta, OOI_CLASS_ID,
|
||||
bb1.x, bb1.y, bb1.w, bb1.h, -1.0, &od_mtd);
|
||||
|
||||
bb_mask.x = self->bb2mask_gain * bb1.x + self->mask_roi.x;
|
||||
bb_mask.y = self->bb2mask_gain * bb1.y + self->mask_roi.y;
|
||||
bb_mask.w = self->bb2mask_gain * bb1.w;
|
||||
bb_mask.h = self->bb2mask_gain * bb1.h;
|
||||
|
||||
mask_buf = NULL;
|
||||
g_assert (gst_buffer_pool_acquire_buffer (self->mask_pool,
|
||||
&mask_buf, NULL) == GST_FLOW_OK);
|
||||
g_assert (GST_IS_BUFFER (mask_buf));
|
||||
GstVideoMeta *vmeta = gst_buffer_get_video_meta (mask_buf);
|
||||
g_assert (vmeta != NULL);
|
||||
vmeta->width = bb_mask.w;
|
||||
vmeta->height = bb_mask.h;
|
||||
|
||||
gst_buffer_map (mask_buf, &out_mask_info, GST_MAP_READWRITE);
|
||||
mask_data = (guint8 *) out_mask_info.data;
|
||||
|
||||
#define MX_MAX (bb_mask.x + bb_mask.w)
|
||||
#define MY_MAX (bb_mask.y + bb_mask.h)
|
||||
|
||||
for (gint my = bb_mask.y, i = 0, j; my < MY_MAX; my++) {
|
||||
for (gint mx = bb_mask.x; mx < MX_MAX; mx++, i++) {
|
||||
float sum = 0.0f;
|
||||
j = my * self->mask_w + mx;
|
||||
for (gint k = 0; k < 32; ++k) {
|
||||
GST_TRACE_OBJECT (self, "protos data at (%d, %d) is %f", j, k,
|
||||
data_logits[k * self->mask_length + j]);
|
||||
sum +=
|
||||
MASK_X (candidate, k) * data_logits[k * self->mask_length + j];
|
||||
}
|
||||
mask_data[i] = sigmoid (sum) > 0.5 ? selected->len : 0;
|
||||
}
|
||||
}
|
||||
|
||||
gst_analytics_relation_meta_add_segmentation_mtd (rmeta, mask_buf,
|
||||
GST_SEGMENTATION_TYPE_INSTANCE, 1, region_ids, bb1.x, bb1.y, bb1.w,
|
||||
bb1.h, &seg_mtd);
|
||||
|
||||
gst_analytics_relation_meta_set_relation (rmeta,
|
||||
GST_ANALYTICS_REL_TYPE_RELATE_TO, seg_mtd.id, od_mtd.id);
|
||||
|
||||
gst_analytics_relation_meta_set_relation (rmeta,
|
||||
GST_ANALYTICS_REL_TYPE_RELATE_TO, od_mtd.id, seg_mtd.id);
|
||||
|
||||
gst_buffer_unmap (mask_buf, &out_mask_info);
|
||||
|
||||
/* If the maximum number of candidate selected is reached exit the
|
||||
* selection process. */
|
||||
if (selected->len >= self->max_detection) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GST_LOG_OBJECT (self, "Selected count: %u", selected->len);
|
||||
|
||||
if (gst_debug_category_get_threshold (GST_CAT_DEFAULT) >= GST_LEVEL_TRACE) {
|
||||
DebugCandidates ctx;
|
||||
/* For debug purpose only. Prints candidates after NMS */
|
||||
ctx.start = 0;
|
||||
ctx.fields = 5;
|
||||
ctx.offset = offset;
|
||||
ctx.self = self;
|
||||
g_ptr_array_foreach (selected,
|
||||
gst_fastsam_tensor_decoder_debug_print_candidate, &ctx);
|
||||
}
|
||||
|
||||
/* We unmap the memory */
|
||||
gst_buffer_unmap (masks_tensor->data, &map_info_masks);
|
||||
gst_buffer_unmap (logits_tensor->data, &map_info_logits);
|
||||
}
|
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* GStreamer gstreamer-fastsamtensordecoder
|
||||
* Copyright (C) 2024 Collabora Ltd
|
||||
* Authors: Daniel Morin <daniel.morin@collabora.com>
|
||||
* Vineet Suryan <vineet.suryan@collabora.com>
|
||||
*
|
||||
* gstfastsamtensordecoder.h
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Library General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Library General Public
|
||||
* License along with this library; if not, write to the
|
||||
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
||||
* Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __GST_FASTSAM_TENSOR_DECODER_H__
|
||||
#define __GST_FASTSAM_TENSOR_DECODER_H__
|
||||
|
||||
#include <gst/gst.h>
|
||||
#include <gst/video/video.h>
|
||||
#include <gst/base/base.h>
|
||||
|
||||
G_BEGIN_DECLS
|
||||
|
||||
#define GST_TYPE_FASTSAM_TENSOR_DECODER (gst_fastsam_tensor_decoder_get_type ())
|
||||
G_DECLARE_FINAL_TYPE (GstFastSAMTensorDecoder, gst_fastsam_tensor_decoder,
|
||||
GST, FASTSAM_TENSOR_DECODER, GstBaseTransform)
|
||||
|
||||
/* Axis-aligned bounding box. x/y are the top-left corner and are signed
 * because a decoded box may extend past the left/top frame edge (see
 * gst_fastsam_tensor_decoder_decode_valid_bb); w/h are unsigned sizes. */
typedef struct _BBox
{
  gint x;
  gint y;
  guint w;
  guint h;
} BBox;
|
||||
|
||||
/* Instance structure of the FastSAM tensor decoder element. */
struct _GstFastSAMTensorDecoder
{
  GstBaseTransform basetransform;
  /* Box confidence threshold */
  gfloat box_confi_thresh;
  /* Class confidence threshold */
  gfloat cls_confi_thresh;
  /* Intersection-of-Union threshold */
  gfloat iou_thresh;
  /* Maximum detection/mask */
  gsize max_detection;
  /* Video Info */
  GstVideoInfo video_info;

  /* Candidates with a class confidence level above threshold. */
  GPtrArray *sel_candidates;

  /* Final candidates selected that respect class confidence level,
   * NMS and maximum detection. */
  GPtrArray *selected;

  /* Tensor-id identifying mask tensors out of FastSAM inference process. */
  GQuark mask_tensor_id;

  /* Tensor-id identifying logits tensors out of FastSAM inference process. */
  GQuark logits_tensor_id;

  /* Region of the mask that contain valid segmentation information */
  BBox mask_roi;

  /* Scaling factor to convert bounding-box coordinates to mask coordinates */
  gfloat bb2mask_gain;

  /* Mask width */
  guint mask_w;

  /* Mask height */
  guint mask_h;

  /* Mask length: used as the stride between consecutive prototype planes
   * in the logits tensor (presumably mask_w * mask_h -- TODO confirm) */
  gsize mask_length;

  /* BufferPool for mask */
  GstBufferPool *mask_pool;
};
|
||||
|
||||
/* NOTE(review): G_DECLARE_FINAL_TYPE already emits the class structure
 * for final types, so a hand-written class struct is normally not needed
 * alongside it -- confirm this definition is intentional and actually
 * referenced. */
struct _GstFastSAMTensorDecoderClass
{
  GstBaseTransformClass parent_class;
};
|
||||
|
||||
GST_ELEMENT_REGISTER_DECLARE (fastsam_tensor_decoder)
|
||||
|
||||
G_END_DECLS
|
||||
#endif /* __GST_FASTSAM_TENSOR_DECODER_H__ */
|
@ -25,6 +25,7 @@
|
||||
#endif
|
||||
|
||||
#include "gstssdobjectdetector.h"
|
||||
#include "gstfastsamtensordecoder.h"
|
||||
|
||||
/**
|
||||
* SECTION:plugin-tensordecoders
|
||||
@ -38,6 +39,7 @@ plugin_init (GstPlugin * plugin)
|
||||
{
|
||||
gboolean ret = FALSE;
|
||||
ret |= GST_ELEMENT_REGISTER (ssd_object_detector, plugin);
|
||||
ret |= GST_ELEMENT_REGISTER (fastsam_tensor_decoder, plugin);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
tensordecoders_sources = [
|
||||
'gsttensordecoders.c',
|
||||
'gstssdobjectdetector.c'
|
||||
'gstssdobjectdetector.c',
|
||||
'gstfastsamtensordecoder.c'
|
||||
]
|
||||
|
||||
tensordecoders_headers = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user