From fe19bc0a2ee0dc770e999be0cd6057d21af4282e Mon Sep 17 00:00:00 2001
From: He Junyan
Date: Tue, 22 Sep 2020 14:54:19 +0800
Subject: [PATCH] videoparsers: av1: Add the AV1 parser.

This AV1 parser implements conversion between the obu, tu and frame
alignments, and between the obu-stream and annexb stream-formats.

TODO:
1. May need an operating_point property to filter the OBUs.
2. May add a property to disable deep parsing.

Part-of:
---
 gst/videoparsers/gstav1parse.c | 1699 ++++++++++++++++++++++++++++++++
 gst/videoparsers/gstav1parse.h |   34 +
 gst/videoparsers/meson.build   |    1 +
 gst/videoparsers/plugin.c      |    9 +
 4 files changed, 1743 insertions(+)
 create mode 100644 gst/videoparsers/gstav1parse.c
 create mode 100644 gst/videoparsers/gstav1parse.h

diff --git a/gst/videoparsers/gstav1parse.c b/gst/videoparsers/gstav1parse.c
new file mode 100644
index 0000000000..115b85a0a0
--- /dev/null
+++ b/gst/videoparsers/gstav1parse.c
@@ -0,0 +1,1699 @@
+/* GStreamer
+ * Copyright (C) 2020 He Junyan
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * SECTION:element-av1parse
+ * @title: av1parse
+ * @short_description: An AV1 stream parser.
+ *
+ * The minimal processing unit is the byte.
+ * There are four types of AV1 alignment in an AV1 stream:
+ *
+ * alignment: byte, obu, frame, tu
+ *
+ * 1. Aligned to byte. The basic and default alignment for input.
+ * 2. Aligned to obu (Open Bitstream Unit). The default alignment for output.
+ * 3. Aligned to frame. Each buffer contains exactly one frame, or one
+ * frame header with the show_existing_frame flag set, of the base or a
+ * sub layer. This is useful for decoders.
+ * 4. Aligned to tu (Temporal Unit). A temporal unit consists of all the
+ * OBUs that are associated with a specific, distinct time instant.
+ * When scalability is disabled, it contains exactly one shown frame
+ * (and possibly several frames that are not shown). When scalability is
+ * enabled, the number of frames it contains depends on the number of
+ * layers. It begins with a temporal delimiter OBU. This is useful for
+ * muxers/demuxers that need to index data by timestamp.
+ *
+ * Annex B defines a special format for temporal units. The size of each
+ * temporal unit is extracted out in front of its data, and there is no
+ * size field inside each OBU. There are two stream formats:
+ *
+ * stream-format: obu-stream, annexb
+ *
+ * 1. obu-stream. The basic and default stream format.
+ * 2. annexb. A special format carrying sized temporal units. It implies
+ * that the alignment must be tu.
+ *
+ * This AV1 parser implements conversion between the alignments and the
+ * stream-formats. If the input and output have the same alignment and the
+ * same stream-format, it only checks the stream and passes the data through.
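+ *
+ * ## Example launch line to produce frame aligned output for a decoder
+ * (a sketch only; sample.ivf is a placeholder input file and fakesink
+ * stands in for a real downstream AV1 decoder):
+ * ```
+ * gst-launch-1.0 filesrc location=sample.ivf ! ivfparse ! av1parse ! \
+ * video/x-av1,alignment=\(string\)frame,stream-format=\(string\)obu-stream ! \
+ * fakesink
+ * ```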
+ *
+ * ## Example launch line to generate an annex B format AV1 stream:
+ * ```
+ * gst-launch-1.0 filesrc location=sample.av1 ! ivfparse ! av1parse ! \
+ * video/x-av1,alignment=\(string\)tu,stream-format=\(string\)annexb ! \
+ * matroskamux ! filesink location=trans.mkv
+ * ```
+ *
+ * Since: 1.20
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <gst/base/gstbitreader.h>
+#include <gst/base/gstbitwriter.h>
+#include <gst/codecparsers/gstav1parser.h>
+#include <gst/video/video.h>
+#include "gstav1parse.h"
+
+#include <string.h>
+
+#define GST_AV1_MAX_LEB_128_SIZE 8
+
+GST_DEBUG_CATEGORY (av1_parse_debug);
+#define GST_CAT_DEFAULT av1_parse_debug
+
+/* We combine the stream-format and the alignment
+   together. When the stream-format is annexb, the
+   alignment must be tu. */
+typedef enum
+{
+  GST_AV1_PARSE_ALIGN_ERROR = -1,
+  GST_AV1_PARSE_ALIGN_NONE = 0,
+  GST_AV1_PARSE_ALIGN_BYTE,
+  GST_AV1_PARSE_ALIGN_OBU,
+  GST_AV1_PARSE_ALIGN_FRAME,
+  GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT,
+  GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B,
+} GstAV1ParseAligment;
+
+struct _GstAV1Parse
+{
+  GstBaseParse parent;
+
+  gint width;
+  gint height;
+  gint subsampling_x;
+  gint subsampling_y;
+  guint8 bit_depth;
+  gchar *colorimetry;
+  GstAV1Profile profile;
+
+  GstAV1ParseAligment in_align;
+  GstAV1ParseAligment align;
+
+  GstAV1Parser *parser;
+  GstAdapter *cache_out;
+  guint last_parsed_offset;
+  GstAdapter *frame_cache;
+  guint highest_spatial_id;
+  gboolean update_caps;
+  gboolean discont;
+  gboolean header;
+  gboolean keyframe;
+};
+
+static GstStaticPadTemplate sinktemplate = GST_STATIC_PAD_TEMPLATE ("sink",
+    GST_PAD_SINK,
+    GST_PAD_ALWAYS,
+    GST_STATIC_CAPS ("video/x-av1"));
+
+static GstStaticPadTemplate srctemplate = GST_STATIC_PAD_TEMPLATE ("src",
+    GST_PAD_SRC,
+    GST_PAD_ALWAYS,
+    GST_STATIC_CAPS ("video/x-av1, parsed = (boolean) true, "
+        "stream-format=(string) { obu-stream, annexb }, "
+        "alignment=(string) { obu, tu, frame }"));
+
+#define parent_class gst_av1_parse_parent_class
+G_DEFINE_TYPE (GstAV1Parse, gst_av1_parse, GST_TYPE_BASE_PARSE);
+
+static void
+remove_fields (GstCaps * caps, gboolean all)
+{
+  guint i, n;
+
+  n = gst_caps_get_size (caps);
+  for (i = 0; i < n; i++) {
+    GstStructure *s = gst_caps_get_structure (caps, i);
+
+    if (all) {
+      gst_structure_remove_field (s, "alignment");
+      gst_structure_remove_field (s, "stream-format");
+    }
+    gst_structure_remove_field (s, "parsed");
+  }
+}
+
+static const gchar *
+_obu_name (GstAV1OBUType type)
+{
+  switch (type) {
+    case GST_AV1_OBU_SEQUENCE_HEADER:
+      return "sequence header";
+    case GST_AV1_OBU_TEMPORAL_DELIMITER:
+      return "temporal delimiter";
+    case GST_AV1_OBU_FRAME_HEADER:
+      return "frame header";
+    case GST_AV1_OBU_TILE_GROUP:
+      return "tile group";
+    case GST_AV1_OBU_METADATA:
+      return "metadata";
+    case GST_AV1_OBU_FRAME:
+      return "frame";
+    case GST_AV1_OBU_REDUNDANT_FRAME_HEADER:
+      return "redundant frame header";
+    case GST_AV1_OBU_TILE_LIST:
+      return "tile list";
+    case GST_AV1_OBU_PADDING:
+      return "padding";
+    default:
+      return "unknown";
+  }
+
+  return NULL;
+}
+
+static guint32
+_read_leb128 (guint8 * data, GstAV1ParserResult * retval, guint32 * comsumed)
+{
+  guint8 leb128_byte = 0;
+  guint64 value = 0;
+  gint i;
+  gboolean result;
+  GstBitReader br;
+  guint32 cur_pos;
+
+  gst_bit_reader_init (&br, data, 8);
+
+  cur_pos = gst_bit_reader_get_pos (&br);
+  for (i = 0; i < 8; i++) {
+    leb128_byte = 0;
+    result = gst_bit_reader_get_bits_uint8 (&br, &leb128_byte, 8);
+    if (result == FALSE) {
+      *retval = GST_AV1_PARSER_BITSTREAM_ERROR;
+      return 0;
+    }
+
+    value |= (((guint64) leb128_byte & 0x7f) << (i * 7));
+    if
(!(leb128_byte & 0x80)) + break; + } + + *comsumed = (gst_bit_reader_get_pos (&br) - cur_pos) / 8; + /* check for bitstream conformance see chapter4.10.5 */ + if (value < G_MAXUINT32) { + *retval = GST_AV1_PARSER_OK; + return (guint32) value; + } else { + GST_WARNING ("invalid leb128"); + *retval = GST_AV1_PARSER_BITSTREAM_ERROR; + return 0; + } +} + +static gsize +_leb_size_in_bytes (guint64 value) +{ + gsize size = 0; + do { + ++size; + } while ((value >>= 7) != 0); + + return size; +} + +static gboolean +_write_leb128 (guint8 * data, guint * len, guint64 value) +{ + guint leb_size = _leb_size_in_bytes (value); + guint i; + + if (value > G_MAXUINT32 || leb_size > GST_AV1_MAX_LEB_128_SIZE) + return FALSE; + + for (i = 0; i < leb_size; ++i) { + guint8 byte = value & 0x7f; + value >>= 7; + + /* Signal that more bytes follow. */ + if (value != 0) + byte |= 0x80; + + *(data + i) = byte; + } + + *len = leb_size; + return TRUE; +} + +static gboolean gst_av1_parse_start (GstBaseParse * parse); +static gboolean gst_av1_parse_stop (GstBaseParse * parse); +static GstFlowReturn gst_av1_parse_handle_frame (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize); +static gboolean gst_av1_parse_set_sink_caps (GstBaseParse * parse, + GstCaps * caps); +static GstCaps *gst_av1_parse_get_sink_caps (GstBaseParse * parse, + GstCaps * filter); + +static void +gst_av1_parse_reset (GstAV1Parse * self) +{ + self->width = 0; + self->height = 0; + self->subsampling_x = -1; + self->subsampling_y = -1; + self->profile = GST_AV1_PROFILE_UNDEFINED; + self->bit_depth = 0; + self->align = GST_AV1_PARSE_ALIGN_NONE; + self->in_align = GST_AV1_PARSE_ALIGN_NONE; + self->discont = TRUE; + self->header = FALSE; + self->keyframe = FALSE; + self->last_parsed_offset = 0; + self->highest_spatial_id = 0; + g_clear_pointer (&self->colorimetry, g_free); + g_clear_pointer (&self->parser, gst_av1_parser_free); + gst_adapter_clear (self->cache_out); + gst_adapter_clear (self->frame_cache); +} + +static void +gst_av1_parse_init (GstAV1Parse * self) +{ + gst_base_parse_set_pts_interpolation (GST_BASE_PARSE (self), FALSE); + gst_base_parse_set_infer_ts (GST_BASE_PARSE (self), FALSE); + + GST_PAD_SET_ACCEPT_INTERSECT (GST_BASE_PARSE_SINK_PAD (self)); + GST_PAD_SET_ACCEPT_TEMPLATE (GST_BASE_PARSE_SINK_PAD (self)); + + self->cache_out = gst_adapter_new (); + self->frame_cache = gst_adapter_new (); +} + +static void +gst_av1_parse_finalize (GObject * object) +{ + GstAV1Parse *self = GST_AV1_PARSE (object); + + gst_av1_parse_reset (self); + g_object_unref (self->cache_out); + g_object_unref (self->frame_cache); + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static void +gst_av1_parse_class_init (GstAV1ParseClass * klass) +{ + GObjectClass *gobject_class = (GObjectClass *) klass; + GstBaseParseClass *parse_class = GST_BASE_PARSE_CLASS (klass); + GstElementClass *element_class = GST_ELEMENT_CLASS (klass); + + gobject_class->finalize = gst_av1_parse_finalize; + parse_class->start = GST_DEBUG_FUNCPTR (gst_av1_parse_start); + parse_class->stop = GST_DEBUG_FUNCPTR (gst_av1_parse_stop); + parse_class->handle_frame = GST_DEBUG_FUNCPTR (gst_av1_parse_handle_frame); + parse_class->set_sink_caps = GST_DEBUG_FUNCPTR (gst_av1_parse_set_sink_caps); + parse_class->get_sink_caps = GST_DEBUG_FUNCPTR (gst_av1_parse_get_sink_caps); + + gst_element_class_add_static_pad_template (element_class, &srctemplate); + gst_element_class_add_static_pad_template (element_class, &sinktemplate); + + gst_element_class_set_static_metadata 
(element_class, "AV1 parser", + "Codec/Parser/Converter/Video", + "Parses AV1 streams", "He Junyan "); + + GST_DEBUG_CATEGORY_INIT (av1_parse_debug, "av1parse", 0, "av1 parser"); +} + +static gboolean +gst_av1_parse_start (GstBaseParse * parse) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + + GST_DEBUG_OBJECT (self, "start"); + + gst_av1_parse_reset (self); + self->parser = gst_av1_parser_new (); + + /* At least the OBU header. */ + gst_base_parse_set_min_frame_size (parse, 1); + + return TRUE; +} + +static gboolean +gst_av1_parse_stop (GstBaseParse * parse) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + + GST_DEBUG_OBJECT (self, "stop"); + g_clear_pointer (&self->parser, gst_av1_parser_free); + + return TRUE; +} + +static const gchar * +gst_av1_parse_profile_to_string (GstAV1Profile profile) +{ + switch (profile) { + case GST_AV1_PROFILE_0: + return "0"; + case GST_AV1_PROFILE_1: + return "1"; + case GST_AV1_PROFILE_2: + return "2"; + default: + break; + } + + return NULL; +} + +static GstAV1Profile +gst_av1_parse_profile_from_string (const gchar * profile) +{ + if (!profile) + return GST_AV1_PROFILE_UNDEFINED; + + if (g_strcmp0 (profile, "0") == 0) + return GST_AV1_PROFILE_0; + else if (g_strcmp0 (profile, "1") == 0) + return GST_AV1_PROFILE_1; + else if (g_strcmp0 (profile, "2") == 0) + return GST_AV1_PROFILE_2; + + return GST_AV1_PROFILE_UNDEFINED; +} + +static const gchar * +gst_av1_parse_alignment_to_steam_format_string (GstAV1ParseAligment align) +{ + switch (align) { + case GST_AV1_PARSE_ALIGN_BYTE: + return "obu-stream"; + case GST_AV1_PARSE_ALIGN_OBU: + case GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT: + case GST_AV1_PARSE_ALIGN_FRAME: + return "obu-stream"; + case GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B: + return "annexb"; + default: + GST_WARNING ("Unrecognized steam format"); + break; + } + + return NULL; +} + +static const gchar * +gst_av1_parse_alignment_to_string (GstAV1ParseAligment align) +{ + switch (align) { + case GST_AV1_PARSE_ALIGN_BYTE: + return "byte"; + case GST_AV1_PARSE_ALIGN_OBU: + return "obu"; + case GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT: + case GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B: + return "tu"; + case GST_AV1_PARSE_ALIGN_FRAME: + return "frame"; + default: + GST_WARNING ("Unrecognized alignment"); + break; + } + + return NULL; +} + +static GstAV1ParseAligment +gst_av1_parse_alignment_from_string (const gchar * align, + const gchar * stream_format) +{ + if (!align && !stream_format) + return GST_AV1_PARSE_ALIGN_NONE; + + if (stream_format) { + if (g_strcmp0 (stream_format, "annexb") == 0) { + if (align && g_strcmp0 (align, "tu") != 0) { + /* annex b stream must align to TU. 
*/ + return GST_AV1_PARSE_ALIGN_ERROR; + } else { + return GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B; + } + } else if (g_strcmp0 (stream_format, "obu-stream") != 0) { + /* unrecognized */ + return GST_AV1_PARSE_ALIGN_NONE; + } + + /* stream-format is obu-stream, depends on align */ + } + + if (align) { + if (g_strcmp0 (align, "byte") == 0) { + return GST_AV1_PARSE_ALIGN_BYTE; + } else if (g_strcmp0 (align, "obu") == 0) { + return GST_AV1_PARSE_ALIGN_OBU; + } else if (g_strcmp0 (align, "tu") == 0) { + return GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT; + } else if (g_strcmp0 (align, "frame") == 0) { + return GST_AV1_PARSE_ALIGN_FRAME; + } else { + /* unrecognized */ + return GST_AV1_PARSE_ALIGN_NONE; + } + } + + return GST_AV1_PARSE_ALIGN_NONE; +} + +static GstAV1ParseAligment +gst_av1_parse_alignment_from_caps (GstCaps * caps) +{ + GstAV1ParseAligment align; + + align = GST_AV1_PARSE_ALIGN_NONE; + + GST_DEBUG ("parsing caps: %" GST_PTR_FORMAT, caps); + + if (caps && gst_caps_get_size (caps) > 0) { + GstStructure *s = gst_caps_get_structure (caps, 0); + const gchar *str_align = NULL; + const gchar *str_stream = NULL; + + str_align = gst_structure_get_string (s, "alignment"); + str_stream = gst_structure_get_string (s, "stream-format"); + + if (str_align || str_stream) + align = gst_av1_parse_alignment_from_string (str_align, str_stream); + } + + return align; +} + +static void +gst_av1_parse_update_src_caps (GstAV1Parse * self, GstCaps * caps) +{ + GstCaps *sink_caps, *src_caps; + GstCaps *final_caps = NULL; + GstStructure *s = NULL; + gint width, height; + gint par_n = 0, par_d = 0; + gint fps_n = 0, fps_d = 0; + const gchar *profile = NULL; + + if (!self->update_caps) + return; + + /* if this is being called from the first _setcaps call, caps on the sinkpad + * aren't set yet and so they need to be passed as an argument */ + if (caps) + sink_caps = gst_caps_ref (caps); + else + sink_caps = gst_pad_get_current_caps (GST_BASE_PARSE_SINK_PAD (self)); + + /* carry over input caps as much as possible; override with our own stuff */ + if (!sink_caps) + sink_caps = gst_caps_new_empty_simple ("video/x-av1"); + else + s = gst_caps_get_structure (sink_caps, 0); + + final_caps = gst_caps_copy (sink_caps); + + if (s && gst_structure_has_field (s, "width") && + gst_structure_has_field (s, "height")) { + gst_structure_get_int (s, "width", &width); + gst_structure_get_int (s, "height", &height); + } else { + width = self->width; + height = self->height; + } + + if (width > 0 && height > 0) + gst_caps_set_simple (final_caps, "width", G_TYPE_INT, width, + "height", G_TYPE_INT, height, NULL); + + if (s && gst_structure_get_fraction (s, "pixel-aspect-ratio", &par_n, &par_d)) { + if (par_n != 0 && par_d != 0) { + gst_caps_set_simple (final_caps, "pixel-aspect-ratio", + GST_TYPE_FRACTION, par_n, par_d, NULL); + } + } + + if (s && gst_structure_has_field (s, "framerate")) { + gst_structure_get_fraction (s, "framerate", &fps_n, &fps_d); + } + + if (fps_n > 0 && fps_d > 0) { + gst_caps_set_simple (final_caps, "framerate", + GST_TYPE_FRACTION, fps_n, fps_d, NULL); + gst_base_parse_set_frame_rate (GST_BASE_PARSE (self), fps_n, fps_d, 0, 0); + } + + if (self->colorimetry + && g_strcmp0 (self->colorimetry, GST_VIDEO_COLORIMETRY_SRGB)) { + const gchar *chroma_format = NULL; + + if (self->subsampling_x == 1 && self->subsampling_y == 1) + chroma_format = "4:2:0"; + else if (self->subsampling_x == 1 && self->subsampling_y == 0) + chroma_format = "4:2:2"; + else if (self->subsampling_x == 0 && self->subsampling_y == 1) + 
chroma_format = "4:4:0"; + else if (self->subsampling_x == 1 && self->subsampling_y == 1) + chroma_format = "4:4:4"; + + if (chroma_format) + gst_caps_set_simple (final_caps, + "chroma-format", G_TYPE_STRING, chroma_format, NULL); + } + + if (self->bit_depth) + gst_caps_set_simple (final_caps, + "bit-depth-luma", G_TYPE_UINT, self->bit_depth, + "bit-depth-chroma", G_TYPE_UINT, self->bit_depth, NULL); + + if (self->colorimetry && (!s || !gst_structure_has_field (s, "colorimetry"))) + gst_caps_set_simple (final_caps, + "colorimetry", G_TYPE_STRING, self->colorimetry, NULL); + + g_assert (self->align > GST_AV1_PARSE_ALIGN_NONE); + gst_caps_set_simple (final_caps, "parsed", G_TYPE_BOOLEAN, TRUE, + "stream-format", G_TYPE_STRING, + gst_av1_parse_alignment_to_steam_format_string (self->align), + "alignment", G_TYPE_STRING, + gst_av1_parse_alignment_to_string (self->align), NULL); + + profile = gst_av1_parse_profile_to_string (self->profile); + if (profile) + gst_caps_set_simple (final_caps, "profile", G_TYPE_STRING, profile, NULL); + + src_caps = gst_pad_get_current_caps (GST_BASE_PARSE_SRC_PAD (self)); + + if (!(src_caps && gst_caps_is_strictly_equal (src_caps, final_caps))) { + GST_DEBUG_OBJECT (self, "Update src caps %" GST_PTR_FORMAT, final_caps); + gst_pad_set_caps (GST_BASE_PARSE_SRC_PAD (self), final_caps); + } + + gst_clear_caps (&src_caps); + gst_caps_unref (final_caps); + gst_caps_unref (sink_caps); + + self->update_caps = FALSE; +} + +/* check downstream caps to configure format and alignment */ +static void +gst_av1_parse_negotiate (GstAV1Parse * self, GstCaps * in_caps) +{ + GstCaps *caps; + GstAV1ParseAligment align = self->align; + + caps = gst_pad_get_allowed_caps (GST_BASE_PARSE_SRC_PAD (self)); + GST_DEBUG_OBJECT (self, "allowed caps: %" GST_PTR_FORMAT, caps); + + /* concentrate on leading structure, since decodebin parser + * capsfilter always includes parser template caps */ + if (caps) { + caps = gst_caps_truncate (caps); + GST_DEBUG_OBJECT (self, "negotiating with caps: %" GST_PTR_FORMAT, caps); + } + + /* Both upsteam and downstream support, best */ + if (in_caps && caps) { + if (gst_caps_can_intersect (in_caps, caps)) { + GST_DEBUG_OBJECT (self, "downstream accepts upstream caps"); + align = gst_av1_parse_alignment_from_caps (in_caps); + gst_clear_caps (&caps); + } + } + if (align != GST_AV1_PARSE_ALIGN_NONE) + goto done; + + /* Select first one of downstream support */ + if (caps && !gst_caps_is_empty (caps)) { + /* fixate to avoid ambiguity with lists when parsing */ + caps = gst_caps_fixate (caps); + align = gst_av1_parse_alignment_from_caps (caps); + } + if (align != GST_AV1_PARSE_ALIGN_NONE) + goto done; + + /* default */ + if (align == GST_AV1_PARSE_ALIGN_NONE) + align = GST_AV1_PARSE_ALIGN_OBU; + +done: + self->align = align; + GST_INFO_OBJECT (self, "selected alignment %s", + gst_av1_parse_alignment_to_string (align)); + + gst_clear_caps (&caps); +} + +static GstCaps * +gst_av1_parse_get_sink_caps (GstBaseParse * parse, GstCaps * filter) +{ + GstCaps *peercaps, *templ; + GstCaps *res, *tmp, *pcopy; + + templ = gst_pad_get_pad_template_caps (GST_BASE_PARSE_SINK_PAD (parse)); + if (filter) { + GstCaps *fcopy = gst_caps_copy (filter); + /* Remove the fields we convert */ + remove_fields (fcopy, TRUE); + peercaps = gst_pad_peer_query_caps (GST_BASE_PARSE_SRC_PAD (parse), fcopy); + gst_caps_unref (fcopy); + } else { + peercaps = gst_pad_peer_query_caps (GST_BASE_PARSE_SRC_PAD (parse), NULL); + } + + pcopy = gst_caps_copy (peercaps); + remove_fields (pcopy, TRUE); + + 
res = gst_caps_intersect_full (pcopy, templ, GST_CAPS_INTERSECT_FIRST); + gst_caps_unref (pcopy); + gst_caps_unref (templ); + + if (filter) { + GstCaps *tmp = gst_caps_intersect_full (res, filter, + GST_CAPS_INTERSECT_FIRST); + gst_caps_unref (res); + res = tmp; + } + + /* Try if we can put the downstream caps first */ + pcopy = gst_caps_copy (peercaps); + remove_fields (pcopy, FALSE); + tmp = gst_caps_intersect_full (pcopy, res, GST_CAPS_INTERSECT_FIRST); + gst_caps_unref (pcopy); + if (!gst_caps_is_empty (tmp)) + res = gst_caps_merge (tmp, res); + else + gst_caps_unref (tmp); + + gst_caps_unref (peercaps); + + return res; +} + +static gboolean +gst_av1_parse_set_sink_caps (GstBaseParse * parse, GstCaps * caps) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstStructure *str; + GstAV1ParseAligment align; + GstCaps *in_caps = NULL; + const gchar *profile; + + str = gst_caps_get_structure (caps, 0); + + /* accept upstream info if provided */ + gst_structure_get_int (str, "width", &self->width); + gst_structure_get_int (str, "height", &self->height); + profile = gst_structure_get_string (str, "profile"); + if (profile) + self->profile = gst_av1_parse_profile_from_string (profile); + + /* get upstream align from caps */ + align = gst_av1_parse_alignment_from_caps (caps); + if (align == GST_AV1_PARSE_ALIGN_ERROR) { + GST_ERROR_OBJECT (self, "Sink caps %" GST_PTR_FORMAT " set stream-format" + " and alignment conflict.", caps); + return FALSE; + } + + in_caps = gst_caps_copy (caps); + /* default */ + if (align == GST_AV1_PARSE_ALIGN_NONE) + gst_caps_set_simple (in_caps, "alignment", G_TYPE_STRING, + gst_av1_parse_alignment_to_string (GST_AV1_PARSE_ALIGN_OBU), NULL); + + /* negotiate with downstream, set output align */ + gst_av1_parse_negotiate (self, in_caps); + + self->update_caps = TRUE; + + /* if all of decoder's capability related values are provided + * by upstream, update src caps now */ + if (self->width > 0 && self->height > 0 && profile) + gst_av1_parse_update_src_caps (self, in_caps); + + gst_caps_unref (in_caps); + + self->in_align = align; + + if (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) { + gst_av1_parser_reset (self->parser, TRUE); + } else { + gst_av1_parser_reset (self->parser, FALSE); + } + + return TRUE; +} + +static GstFlowReturn +gst_av1_parse_push_data (GstAV1Parse * self, GstBaseParseFrame * frame, + guint32 finish_sz) +{ + gsize sz; + GstBuffer *buf; + GstBuffer *buffer = frame->buffer; + GstFlowReturn ret = GST_FLOW_OK; + + g_assert (self->align != self->in_align); + if (self->align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) { + guint8 size_data[GST_AV1_MAX_LEB_128_SIZE]; + guint size_len = 0; + guint len; + + /* Still some left in the frame cache */ + len = gst_adapter_available (self->frame_cache); + if (len) { + buf = gst_adapter_take_buffer (self->frame_cache, len); + + /* frame_unit_size */ + _write_leb128 (size_data, &size_len, len); + + gst_adapter_push (self->cache_out, + gst_buffer_new_wrapped (g_memdup (size_data, size_len), size_len)); + gst_adapter_push (self->cache_out, buf); + } + + len = gst_adapter_available (self->cache_out); + if (len) { + buf = gst_adapter_take_buffer (self->cache_out, len); + + /* temporal_unit_size */ + _write_leb128 (size_data, &size_len, len); + + gst_adapter_push (self->cache_out, + gst_buffer_new_wrapped (g_memdup (size_data, size_len), size_len)); + gst_adapter_push (self->cache_out, buf); + } + } + + sz = gst_adapter_available (self->cache_out); + if (sz) { + buf = gst_adapter_take_buffer 
(self->cache_out, sz); + gst_buffer_copy_into (buf, buffer, GST_BUFFER_COPY_METADATA, 0, -1); + if (self->discont) { + GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DISCONT); + self->discont = FALSE; + } + if (self->header) { + GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_HEADER); + self->header = FALSE; + } + if (self->keyframe) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DELTA_UNIT); + self->keyframe = FALSE; + } + /* Always be a frame boundary. */ + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_MARKER); + gst_buffer_replace (&frame->out_buffer, buf); + gst_buffer_unref (buf); + + gst_av1_parse_update_src_caps (self, NULL); + GST_LOG_OBJECT (self, "complete one frame with size %" G_GSSIZE_FORMAT, sz); + ret = gst_base_parse_finish_frame (GST_BASE_PARSE (self), frame, finish_sz); + } + + return ret; +} + +static void +gst_av1_parse_convert_to_annexb (GstAV1Parse * self, GstAV1OBU * obu, + gboolean frame_complete) +{ + guint8 size_data[GST_AV1_MAX_LEB_128_SIZE]; + guint size_len = 0; + GstBitWriter bs; + GstBuffer *buf, *buf2; + guint8 *data; + guint len, len2, offset; + + /* obu_length */ + _write_leb128 (size_data, &size_len, + obu->obu_size + 1 + obu->header.obu_extention_flag); + + gst_bit_writer_init_with_size (&bs, 128, FALSE); + /* obu_forbidden_bit */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 1); + /* obu_type */ + gst_bit_writer_put_bits_uint8 (&bs, obu->obu_type, 4); + /* obu_extension_flag */ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_extention_flag, 1); + /* obu_has_size_field */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 1); + /* obu_reserved_1bit */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 1); + if (obu->header.obu_extention_flag) { + /* temporal_id */ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_temporal_id, 3); + /* spatial_id */ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_spatial_id, 2); + /* extension_header_reserved_3bits */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 3); + } + g_assert (GST_BIT_WRITER_BIT_SIZE (&bs) % 8 == 0); + + len = size_len; + len += GST_BIT_WRITER_BIT_SIZE (&bs) / 8; + len += obu->obu_size; + + data = g_malloc (len); + offset = 0; + + memcpy (data + offset, size_data, size_len); + offset += size_len; + + memcpy (data + offset, GST_BIT_WRITER_DATA (&bs), + GST_BIT_WRITER_BIT_SIZE (&bs) / 8); + offset += GST_BIT_WRITER_BIT_SIZE (&bs) / 8; + + memcpy (data + offset, obu->data, obu->obu_size); + + /* The buf of this OBU */ + buf = gst_buffer_new_wrapped (data, len); + + gst_adapter_push (self->frame_cache, buf); + + if (frame_complete) { + len2 = gst_adapter_available (self->frame_cache); + buf2 = gst_adapter_take_buffer (self->frame_cache, len2); + + /* frame_unit_size */ + _write_leb128 (size_data, &size_len, len2); + gst_adapter_push (self->cache_out, + gst_buffer_new_wrapped (g_memdup (size_data, size_len), size_len)); + + gst_adapter_push (self->cache_out, buf2); + } + + gst_bit_writer_reset (&bs); +} + +static void +gst_av1_parse_convert_from_annexb (GstAV1Parse * self, GstAV1OBU * obu) +{ + guint8 size_data[GST_AV1_MAX_LEB_128_SIZE]; + guint size_len = 0; + GstBuffer *buf; + guint len, offset; + guint8 *data; + GstBitWriter bs; + + _write_leb128 (size_data, &size_len, obu->obu_size); + + /* obu_header */ + len = 1; + if (obu->header.obu_extention_flag) + len += 1; + len += size_len; + len += obu->obu_size; + + gst_bit_writer_init_with_size (&bs, 128, FALSE); + /* obu_forbidden_bit */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 1); + /* obu_type */ + gst_bit_writer_put_bits_uint8 (&bs, obu->obu_type, 4); + /* obu_extension_flag 
*/ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_extention_flag, 1); + /* obu_has_size_field */ + gst_bit_writer_put_bits_uint8 (&bs, 1, 1); + /* obu_reserved_1bit */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 1); + if (obu->header.obu_extention_flag) { + /* temporal_id */ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_temporal_id, 3); + /* spatial_id */ + gst_bit_writer_put_bits_uint8 (&bs, obu->header.obu_spatial_id, 2); + /* extension_header_reserved_3bits */ + gst_bit_writer_put_bits_uint8 (&bs, 0, 3); + } + g_assert (GST_BIT_WRITER_BIT_SIZE (&bs) % 8 == 0); + + data = g_malloc (len); + offset = 0; + memcpy (data + offset, GST_BIT_WRITER_DATA (&bs), + GST_BIT_WRITER_BIT_SIZE (&bs) / 8); + offset += GST_BIT_WRITER_BIT_SIZE (&bs) / 8; + + memcpy (data + offset, size_data, size_len); + offset += size_len; + + memcpy (data + offset, obu->data, obu->obu_size); + + buf = gst_buffer_new_wrapped (data, len); + gst_adapter_push (self->cache_out, buf); + + gst_bit_writer_reset (&bs); +} + +static GstAV1ParserResult +gst_av1_parse_handle_sequence_obu (GstAV1Parse * self, GstAV1OBU * obu) +{ + GstAV1SequenceHeaderOBU seq_header; + GstAV1ParserResult res; + guint i; + guint val; + + res = gst_av1_parser_parse_sequence_header_obu (self->parser, + obu, &seq_header); + if (res != GST_AV1_PARSER_OK) + return res; + + if (self->width != seq_header.max_frame_width_minus_1 + 1) { + self->width = seq_header.max_frame_width_minus_1 + 1; + self->update_caps = TRUE; + } + if (self->height != seq_header.max_frame_height_minus_1 + 1) { + self->height = seq_header.max_frame_height_minus_1 + 1; + self->update_caps = TRUE; + } + + if (seq_header.color_config.color_description_present_flag) { + GstVideoColorimetry cinfo; + gboolean have_cinfo = TRUE; + gchar *colorimetry = NULL; + + if (have_cinfo) { + if (seq_header.color_config.color_range) + cinfo.range = GST_VIDEO_COLOR_RANGE_16_235; + else + cinfo.range = GST_VIDEO_COLOR_RANGE_0_255; + + cinfo.matrix = gst_video_color_matrix_from_iso + (seq_header.color_config.matrix_coefficients); + cinfo.transfer = gst_video_transfer_function_from_iso + (seq_header.color_config.transfer_characteristics); + cinfo.primaries = gst_video_color_primaries_from_iso + (seq_header.color_config.color_primaries); + colorimetry = gst_video_colorimetry_to_string (&cinfo); + } + + if (g_strcmp0 (colorimetry, self->colorimetry)) { + g_clear_pointer (&self->colorimetry, g_free); + self->colorimetry = colorimetry; + self->update_caps = TRUE; + } + } + + if (g_strcmp0 (self->colorimetry, GST_VIDEO_COLORIMETRY_SRGB)) { + if (self->subsampling_x != seq_header.color_config.subsampling_x) { + self->subsampling_x = seq_header.color_config.subsampling_x; + self->update_caps = TRUE; + } + + if (self->subsampling_y != seq_header.color_config.subsampling_y) { + self->subsampling_y = seq_header.color_config.subsampling_y; + self->update_caps = TRUE; + } + } + + if (self->bit_depth != seq_header.bit_depth) { + self->bit_depth = seq_header.bit_depth; + self->update_caps = TRUE; + } + + if (self->profile != seq_header.seq_profile) { + self->profile = seq_header.seq_profile; + self->update_caps = TRUE; + } + + val = (self->parser->state.operating_point_idc >> 8) & 0x0f; + for (i = 0; i < (1 << GST_AV1_MAX_SPATIAL_LAYERS); i++) { + if (val & (1 << i)) + self->highest_spatial_id = i; + } + + return GST_AV1_PARSER_OK; +} + +static GstAV1ParserResult +gst_av1_parse_handle_one_obu (GstAV1Parse * self, GstAV1OBU * obu, + gboolean * frame_complete) +{ + GstAV1ParserResult res = GST_AV1_PARSER_OK; + 
GstAV1MetadataOBU metadata; + GstAV1FrameHeaderOBU frame_header; + GstAV1TileListOBU tile_list; + GstAV1TileGroupOBU tile_group; + GstAV1FrameOBU frame; + + *frame_complete = FALSE; + + switch (obu->obu_type) { + case GST_AV1_OBU_TEMPORAL_DELIMITER: + res = gst_av1_parser_parse_temporal_delimiter_obu (self->parser, obu); + break; + case GST_AV1_OBU_SEQUENCE_HEADER: + res = gst_av1_parse_handle_sequence_obu (self, obu); + break; + case GST_AV1_OBU_REDUNDANT_FRAME_HEADER: + res = gst_av1_parser_parse_frame_header_obu (self->parser, obu, + &frame_header); + break; + case GST_AV1_OBU_FRAME_HEADER: + res = gst_av1_parser_parse_frame_header_obu (self->parser, obu, + &frame_header); + break; + case GST_AV1_OBU_FRAME: + res = gst_av1_parser_parse_frame_obu (self->parser, obu, &frame); + break; + case GST_AV1_OBU_METADATA: + res = gst_av1_parser_parse_metadata_obu (self->parser, obu, &metadata); + break; + case GST_AV1_OBU_TILE_GROUP: + res = + gst_av1_parser_parse_tile_group_obu (self->parser, obu, &tile_group); + break; + case GST_AV1_OBU_TILE_LIST: + res = gst_av1_parser_parse_tile_list_obu (self->parser, obu, &tile_list); + break; + case GST_AV1_OBU_PADDING: + break; + default: + GST_WARNING_OBJECT (self, "an unrecognized obu type %d", obu->obu_type); + res = GST_AV1_PARSER_BITSTREAM_ERROR; + break; + } + + GST_LOG_OBJECT (self, "parsing the obu %s, result is %d", + _obu_name (obu->obu_type), res); + if (res != GST_AV1_PARSER_OK) + goto out; + + /* 7.5: + All OBU extension headers that are contained in the same temporal + unit and have the same spatial_id value must have the same temporal_id + value. + And + OBUs with spatial level IDs (spatial_id) greater than 0 must + appear within a temporal unit in increasing order of the spatial + level ID values. */ + if (obu->header.obu_spatial_id > self->highest_spatial_id) { + GST_WARNING_OBJECT (self, + "spatial_id %d is bigger than highest_spatial_id %d", + obu->header.obu_spatial_id, self->highest_spatial_id); + res = GST_AV1_PARSER_BITSTREAM_ERROR; + goto out; + } + + if (obu->obu_type == GST_AV1_OBU_SEQUENCE_HEADER) + self->header = TRUE; + + if (obu->obu_type == GST_AV1_OBU_FRAME_HEADER + || obu->obu_type == GST_AV1_OBU_FRAME + || obu->obu_type == GST_AV1_OBU_REDUNDANT_FRAME_HEADER) { + GstAV1FrameHeaderOBU *fh = &frame_header; + + if (obu->obu_type == GST_AV1_OBU_FRAME) + fh = &frame.frame_header; + + /* if a show_existing_frame case, only update key frame. + otherwise, update all type of frame. 
*/ + if (!fh->show_existing_frame || fh->frame_type == GST_AV1_KEY_FRAME) + res = gst_av1_parser_reference_frame_update (self->parser, fh); + + if (res != GST_AV1_PARSER_OK) + GST_WARNING_OBJECT (self, "update frame get result %d", res); + + if (fh->show_existing_frame) + *frame_complete = TRUE; + + if (fh->frame_type == GST_AV1_KEY_FRAME) + self->keyframe = TRUE; + } + + if (obu->obu_type == GST_AV1_OBU_TILE_GROUP + || obu->obu_type == GST_AV1_OBU_FRAME) { + GstAV1TileGroupOBU *tg = &tile_group; + + if (obu->obu_type == GST_AV1_OBU_FRAME) + tg = &frame.tile_group; + + if (tg->tg_end == tg->num_tiles - 1) + *frame_complete = TRUE; + } + +out: + if (res != GST_AV1_PARSER_OK) { + /* Some verbose OBU can be skip */ + if (obu->obu_type == GST_AV1_OBU_REDUNDANT_FRAME_HEADER) { + GST_WARNING_OBJECT (self, "Ignore a verbose %s OBU parsing error", + _obu_name (obu->obu_type)); + res = GST_AV1_PARSER_OK; + } + } + + return res; +} + +static GstFlowReturn +gst_av1_parse_handle_obu_to_obu (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstMapInfo map_info; + GstAV1OBU obu; + GstFlowReturn ret = GST_FLOW_OK; + GstAV1ParserResult res; + GstBuffer *buffer = gst_buffer_ref (frame->buffer); + guint32 consumed, total_consumed; + gboolean frame_complete; + + if (!gst_buffer_map (buffer, &map_info, GST_MAP_READ)) { + *skipsize = 0; + GST_ERROR_OBJECT (parse, "Couldn't map incoming buffer"); + return GST_FLOW_ERROR; + } + + total_consumed = 0; +again: + while (TRUE) { + frame_complete = FALSE; + res = gst_av1_parser_identify_one_obu (self->parser, + map_info.data + total_consumed, map_info.size - total_consumed, + &obu, &consumed); + if (res == GST_AV1_PARSER_OK) + res = gst_av1_parse_handle_one_obu (self, &obu, &frame_complete); + if (res != GST_AV1_PARSER_OK) + break; + + total_consumed += consumed; + + g_assert (total_consumed <= map_info.size); + + if (total_consumed >= map_info.size) + break; + } + + if (total_consumed) { + /* If we get something, always output it even already met some error. + Next handle_frame loop will handle that error. */ + gst_av1_parse_update_src_caps (self, NULL); + if (self->discont) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DISCONT); + self->discont = FALSE; + } + if (self->header) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_HEADER); + self->header = FALSE; + } + /* happen to be a frame boundary */ + if (frame_complete) + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_MARKER); + + ret = gst_base_parse_finish_frame (parse, frame, total_consumed); + *skipsize = 0; + goto out; + } + + if (res == GST_AV1_PARSER_BITSTREAM_ERROR) { + if (consumed) { + *skipsize = consumed; + } else { + *skipsize = map_info.size; + } + GST_WARNING_OBJECT (parse, "Parse obu error, discard %d.", *skipsize); + ret = GST_FLOW_OK; + } else if (res == GST_AV1_PARSER_NO_MORE_DATA) { + *skipsize = 0; + + if (self->in_align == GST_AV1_PARSE_ALIGN_OBU) { + /* The buffer is already aligned to OBU, should not happen. 
*/ + if (consumed) { + *skipsize = consumed; + } else { + *skipsize = map_info.size; + } + GST_WARNING_OBJECT (parse, "Parse obu need more data, discard %d.", + *skipsize); + } + ret = GST_FLOW_OK; + } else if (res == GST_AV1_PARSER_DROP) { + GST_DEBUG_OBJECT (parse, "Drop %d data", consumed); + total_consumed += consumed; + goto again; + } else if (res != GST_AV1_PARSER_OK) { + GST_ERROR_OBJECT (parse, "Parse obu get unexpect error %d", res); + *skipsize = 0; + ret = GST_FLOW_ERROR; + } + +out: + gst_buffer_unmap (buffer, &map_info); + gst_buffer_unref (buffer); + return ret; +} + +static GstFlowReturn +gst_av1_parse_handle_to_small_and_equal_align (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstMapInfo map_info; + GstAV1OBU obu; + GstFlowReturn ret = GST_FLOW_OK; + GstAV1ParserResult res; + GstBuffer *buffer = gst_buffer_ref (frame->buffer); + guint32 total_consumed, consumed; + gboolean need_convert = FALSE; + gboolean frame_complete; + + if (!gst_buffer_map (buffer, &map_info, GST_MAP_READ)) { + GST_ERROR_OBJECT (parse, "Couldn't map incoming buffer"); + return GST_FLOW_ERROR; + } + + if (self->in_align != self->align + && (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B + || self->align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B)) + need_convert = TRUE; + + total_consumed = 0; + frame_complete = FALSE; +again: + while (TRUE) { + res = gst_av1_parser_identify_one_obu (self->parser, + map_info.data + total_consumed, map_info.size - total_consumed, + &obu, &consumed); + if (res == GST_AV1_PARSER_OK) + res = gst_av1_parse_handle_one_obu (self, &obu, &frame_complete); + if (res != GST_AV1_PARSER_OK) + break; + + if (obu.obu_type == GST_AV1_OBU_TEMPORAL_DELIMITER && total_consumed) { + GST_DEBUG_OBJECT (self, "Encounter TD inside one %s aligned" + " buffer, should not happen normally.", + gst_av1_parse_alignment_to_string (self->in_align)); + frame_complete = TRUE; + break; + } + + total_consumed += consumed; + + if (need_convert) { + if (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) { + gst_av1_parse_convert_from_annexb (self, &obu); + } else { + gst_av1_parse_convert_to_annexb (self, &obu, frame_complete); + } + } + + if (self->align == GST_AV1_PARSE_ALIGN_FRAME && frame_complete) + break; + + g_assert (total_consumed <= map_info.size); + if (total_consumed >= map_info.size) + break; + } + + if (res == GST_AV1_PARSER_BITSTREAM_ERROR) { + /* Discard the whole frame */ + *skipsize = map_info.size; + GST_WARNING_OBJECT (parse, "Parse obu error, discard %d", *skipsize); + if (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) + gst_av1_parser_reset_annex_b (self->parser); + ret = GST_FLOW_OK; + goto out; + } else if (res == GST_AV1_PARSER_NO_MORE_DATA) { + /* Discard the whole buffer */ + *skipsize = map_info.size; + GST_WARNING_OBJECT (parse, "Parse obu need more data, discard %d.", + *skipsize); + if (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) + gst_av1_parser_reset_annex_b (self->parser); + ret = GST_FLOW_OK; + goto out; + } else if (res == GST_AV1_PARSER_DROP) { + GST_DEBUG_OBJECT (parse, "Drop %d data", consumed); + total_consumed += consumed; + goto again; + } else if (res != GST_AV1_PARSER_OK) { + GST_ERROR_OBJECT (parse, "Parse obu get unexpect error %d", res); + *skipsize = 0; + ret = GST_FLOW_ERROR; + goto out; + } + + g_assert (total_consumed >= map_info.size || frame_complete); + if (total_consumed >= map_info.size && !frame_complete + && self->align 
== GST_AV1_PARSE_ALIGN_FRAME) { + /* Warning and still consider this frame as complete */ + GST_WARNING_OBJECT (self, "Exhaust the buffer but still incomplete frame," + " should not happend in %s alignment", + gst_av1_parse_alignment_to_string (self->in_align)); + } + + if (!need_convert) { + gst_av1_parse_update_src_caps (self, NULL); + if (self->discont) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DISCONT); + self->discont = FALSE; + } + if (self->header) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_HEADER); + self->header = FALSE; + } + if (self->keyframe) { + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DELTA_UNIT); + self->keyframe = FALSE; + } + /* Always be a frame boundary. */ + GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_MARKER); + ret = gst_base_parse_finish_frame (parse, frame, total_consumed); + } else { + ret = gst_av1_parse_push_data (self, frame, total_consumed); + } + +out: + gst_buffer_unmap (buffer, &map_info); + gst_buffer_unref (buffer); + return ret; +} + +static GstFlowReturn +gst_av1_parse_handle_to_big_align (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstMapInfo map_info; + GstAV1OBU obu; + GstFlowReturn ret = GST_FLOW_OK; + GstAV1ParserResult res = GST_AV1_PARSER_OK; + GstBuffer *buffer = gst_buffer_ref (frame->buffer); + guint32 consumed; + gboolean frame_complete; + gboolean complete; + + g_assert (self->in_align <= GST_AV1_PARSE_ALIGN_FRAME); + + if (!gst_buffer_map (buffer, &map_info, GST_MAP_READ)) { + *skipsize = 0; + GST_ERROR_OBJECT (parse, "Couldn't map incoming buffer"); + return GST_FLOW_ERROR; + } + + complete = FALSE; +again: + while (self->last_parsed_offset < map_info.size) { + res = gst_av1_parser_identify_one_obu (self->parser, + map_info.data + self->last_parsed_offset, + map_info.size - self->last_parsed_offset, &obu, &consumed); + if (res == GST_AV1_PARSER_OK) + res = gst_av1_parse_handle_one_obu (self, &obu, &frame_complete); + if (res != GST_AV1_PARSER_OK) + break; + + /* New TD come, always begin a new temporal unit or frame */ + if (obu.obu_type == GST_AV1_OBU_TEMPORAL_DELIMITER + && (gst_adapter_available (self->cache_out) || + gst_adapter_available (self->frame_cache))) { + complete = TRUE; + break; + } + + if (self->align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT || + self->align == GST_AV1_PARSE_ALIGN_FRAME) { + GstBuffer *buf = gst_buffer_copy_region (buffer, GST_BUFFER_COPY_ALL, + self->last_parsed_offset, consumed); + gst_adapter_push (self->cache_out, buf); + } else if (self->align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B) { + gst_av1_parse_convert_to_annexb (self, &obu, frame_complete); + } else { + g_assert_not_reached (); + } + self->last_parsed_offset += consumed; + + if (self->align == GST_AV1_PARSE_ALIGN_FRAME && frame_complete) + complete = TRUE; + + if (complete) + break; + } + + /* Finish a complete frame anyway */ + if (complete || GST_BASE_PARSE_DRAINING (parse)) { + *skipsize = 0; + + /* push the left anyway if no error */ + if (res == GST_AV1_PARSER_OK) + ret = gst_av1_parse_push_data (self, frame, self->last_parsed_offset); + + self->last_parsed_offset = 0; + + goto out; + } + + if (res == GST_AV1_PARSER_BITSTREAM_ERROR) { + *skipsize = map_info.size; + GST_WARNING_OBJECT (parse, "Parse obu error, discard whole buffer %d.", + *skipsize); + /* The adapter will be cleared in next loop because of + GST_BASE_PARSE_FRAME_FLAG_NEW_FRAME flag */ + ret = GST_FLOW_OK; + } else if (res == GST_AV1_PARSER_NO_MORE_DATA) { + *skipsize = 0; + + if 
(self->in_align >= GST_AV1_PARSE_ALIGN_OBU) { + /* The buffer is already aligned to OBU, should not happen. + The adapter will be cleared in next loop because of + GST_BASE_PARSE_FRAME_FLAG_NEW_FRAME flag */ + *skipsize = map_info.size; + GST_WARNING_OBJECT (parse, + "Parse obu need more data, discard whole buffer %d.", *skipsize); + } + ret = GST_FLOW_OK; + } else if (res == GST_AV1_PARSER_DROP) { + GST_DEBUG_OBJECT (parse, "Drop %d data", consumed); + self->last_parsed_offset += consumed; + goto again; + } else if (res == GST_AV1_PARSER_OK) { + /* Everything is correct but still not get a frame or tu, + need more data */ + GST_DEBUG_OBJECT (parse, "Need more data"); + *skipsize = 0; + ret = GST_FLOW_OK; + } else { + GST_ERROR_OBJECT (parse, "Parse obu get unexpect error %d", res); + *skipsize = 0; + ret = GST_FLOW_ERROR; + } + +out: + gst_buffer_unmap (buffer, &map_info); + gst_buffer_unref (buffer); + return ret; +} + +/* Try to recognize whether the input is annex-b format. */ +static GstFlowReturn +gst_av1_parse_detect_alignment (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstMapInfo map_info; + GstAV1OBU obu; + GstAV1ParserResult res; + GstBuffer *buffer = gst_buffer_ref (frame->buffer); + gboolean got_seq, got_frame; + gboolean frame_complete; + guint32 consumed, total_consumed; + guint32 frame_sz; + GstFlowReturn ret = GST_FLOW_OK; + + if (!gst_buffer_map (buffer, &map_info, GST_MAP_READ)) { + *skipsize = 0; + GST_ERROR_OBJECT (parse, "Couldn't map incoming buffer"); + return GST_FLOW_ERROR; + } + + gst_av1_parser_reset (self->parser, FALSE); + + /* Detect the alignment obu first */ + got_seq = FALSE; + got_frame = FALSE; + total_consumed = 0; +again: + while (TRUE) { + res = gst_av1_parser_identify_one_obu (self->parser, + map_info.data + total_consumed, map_info.size - total_consumed, + &obu, &consumed); + if (res == GST_AV1_PARSER_OK) + res = gst_av1_parse_handle_one_obu (self, &obu, &frame_complete); + if (res != GST_AV1_PARSER_OK) + break; + + total_consumed += consumed; + + if (obu.obu_type == GST_AV1_OBU_SEQUENCE_HEADER) + got_seq = TRUE; + if (obu.obu_type == GST_AV1_OBU_REDUNDANT_FRAME_HEADER || + obu.obu_type == GST_AV1_OBU_FRAME || + obu.obu_type == GST_AV1_OBU_FRAME_HEADER) + got_frame = TRUE; + + if (got_seq || got_frame) + break; + + if (total_consumed >= map_info.size) + break; + } + + gst_av1_parser_reset (self->parser, FALSE); + + if (res == GST_AV1_PARSER_OK || res == GST_AV1_PARSER_NO_MORE_DATA) { + *skipsize = 0; + + /* If succeed recognize seq or frame, we can decide, + otherwise, just skipsize to 0 and get more data. */ + if (got_seq || got_frame) + self->in_align = GST_AV1_PARSE_ALIGN_BYTE; + + ret = GST_FLOW_OK; + goto out; + } else if (res == GST_AV1_PARSER_DROP) { + total_consumed += consumed; + goto again; + } + + /* Try the annexb. The buffer should hold the whole frame, and + the buffer start with the frame size in leb128() format. */ + if (map_info.size < 8) { + /* Get more data. 
*/ + *skipsize = 0; + ret = GST_FLOW_OK; + goto out; + } + + frame_sz = _read_leb128 (map_info.data, &res, &consumed); + if (frame_sz == 0 || res != GST_AV1_PARSER_OK) { + /* Both modes does not match, we can decide a error */ + ret = GST_FLOW_ERROR; + goto out; + } + + if (frame_sz + consumed != map_info.size) { + GST_DEBUG_OBJECT (self, "Buffer size %" G_GSSIZE_FORMAT ", frame size %d," + " consumed %d, does not match annex b format.", + map_info.size, frame_sz, consumed); + /* Both modes does not match, we can decide a error */ + ret = GST_FLOW_ERROR; + goto out; + } + + self->in_align = GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B; + gst_av1_parser_reset (self->parser, TRUE); + ret = GST_FLOW_OK; + +out: + gst_buffer_unmap (buffer, &map_info); + gst_buffer_unref (buffer); + return ret; +} + +static GstFlowReturn +gst_av1_parse_handle_frame (GstBaseParse * parse, + GstBaseParseFrame * frame, gint * skipsize) +{ + GstAV1Parse *self = GST_AV1_PARSE (parse); + GstFlowReturn ret = GST_FLOW_OK; + guint in_level, out_level; + + if (GST_BUFFER_FLAG_IS_SET (frame->buffer, GST_BUFFER_FLAG_DISCONT)) + self->discont = TRUE; + else + self->discont = FALSE; + + GST_LOG_OBJECT (self, "Input frame size %" G_GSSIZE_FORMAT, + gst_buffer_get_size (frame->buffer)); + + /* avoid stale cached parsing state */ + if (frame->flags & GST_BASE_PARSE_FRAME_FLAG_NEW_FRAME) { + GST_LOG_OBJECT (self, "parsing new frame"); + gst_adapter_clear (self->cache_out); + gst_adapter_clear (self->frame_cache); + self->last_parsed_offset = 0; + self->header = FALSE; + self->keyframe = FALSE; + } else { + GST_LOG_OBJECT (self, "resuming frame parsing"); + } + + /* When in pull mode, the sink pad has no caps, we may get the + caps by query the upstream element */ + if (self->in_align == GST_AV1_PARSE_ALIGN_NONE) { + GstCaps *upstream_caps; + + upstream_caps = + gst_pad_peer_query_caps (GST_BASE_PARSE_SINK_PAD (self), NULL); + if (upstream_caps) { + if (!gst_caps_is_empty (upstream_caps) + && !gst_caps_is_any (upstream_caps)) { + GST_LOG_OBJECT (self, "upstream caps: %" GST_PTR_FORMAT, upstream_caps); + /* fixate to avoid ambiguity with lists when parsing */ + upstream_caps = gst_caps_fixate (upstream_caps); + self->in_align = gst_av1_parse_alignment_from_caps (upstream_caps); + } + + gst_caps_unref (upstream_caps); + + gst_av1_parser_reset (self->parser, + self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B); + } + + if (self->in_align != GST_AV1_PARSE_ALIGN_NONE) + GST_LOG_OBJECT (self, "Query the upstream get the alignment %d", + self->in_align); + } + + if (self->in_align == GST_AV1_PARSE_ALIGN_NONE) { + /* Only happend at the first time of handle_frame, and the + alignment in the sink caps is unset. Try the default and + if error, try the annex B. 
*/
+    ret = gst_av1_parse_detect_alignment (parse, frame, skipsize);
+    if (ret == GST_FLOW_OK && self->in_align != GST_AV1_PARSE_ALIGN_NONE)
+      GST_INFO_OBJECT (self, "Detected the input alignment %d",
+          self->in_align);
+  }
+
+  if (self->in_align == GST_AV1_PARSE_ALIGN_NONE) {
+    GST_ERROR_OBJECT (self, "Input alignment is unknown");
+    return GST_FLOW_ERROR;
+  }
+
+  /* We may be in pull mode and no caps are set */
+  if (self->align == GST_AV1_PARSE_ALIGN_NONE)
+    gst_av1_parse_negotiate (self, NULL);
+
+  in_level = self->in_align;
+  if (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B)
+    in_level = GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT;
+  out_level = self->align;
+  if (self->align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B)
+    out_level = GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT;
+
+  if (self->in_align <= GST_AV1_PARSE_ALIGN_OBU
+      && self->align == GST_AV1_PARSE_ALIGN_OBU) {
+    ret = gst_av1_parse_handle_obu_to_obu (parse, frame, skipsize);
+  } else if (in_level < out_level) {
+    ret = gst_av1_parse_handle_to_big_align (parse, frame, skipsize);
+  } else {
+    ret = gst_av1_parse_handle_to_small_and_equal_align (parse,
+        frame, skipsize);
+  }
+
+  return ret;
+}
diff --git a/gst/videoparsers/gstav1parse.h b/gst/videoparsers/gstav1parse.h
new file mode 100644
index 0000000000..464658e1be
--- /dev/null
+++ b/gst/videoparsers/gstav1parse.h
@@ -0,0 +1,34 @@
+/* GStreamer
+ * Copyright (C) 2020 He Junyan
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_AV1_PARSE_H__
+#define __GST_AV1_PARSE_H__
+
+#include <gst/gst.h>
+#include <gst/base/gstbaseparse.h>
+
+G_BEGIN_DECLS
+
+#define GST_TYPE_AV1_PARSE (gst_av1_parse_get_type())
+G_DECLARE_FINAL_TYPE (GstAV1Parse,
+    gst_av1_parse, GST, AV1_PARSE, GstBaseParse);
+
+G_END_DECLS
+
+#endif /* __GST_AV1_PARSE_H__ */
diff --git a/gst/videoparsers/meson.build b/gst/videoparsers/meson.build
index ad6c7e24f2..61f37576e8 100644
--- a/gst/videoparsers/meson.build
+++ b/gst/videoparsers/meson.build
@@ -13,6 +13,7 @@ vparse_sources = [
   'gstvideoparseutils.c',
   'gstjpeg2000parse.c',
   'gstvp9parse.c',
+  'gstav1parse.c',
 ]
 
 gstvideoparsersbad = library('gstvideoparsersbad',
diff --git a/gst/videoparsers/plugin.c b/gst/videoparsers/plugin.c
index 60e6125cbd..9cf2962ead 100644
--- a/gst/videoparsers/plugin.c
+++ b/gst/videoparsers/plugin.c
@@ -32,6 +32,7 @@
 #include "gstvc1parse.h"
 #include "gsth265parse.h"
 #include "gstvp9parse.h"
+#include "gstav1parse.h"
 
 GST_DEBUG_CATEGORY (videoparseutils_debug);
 
@@ -70,6 +71,14 @@ plugin_init (GstPlugin * plugin)
   ret |= gst_element_register (plugin, "vp9parse", GST_RANK_SECONDARY,
       GST_TYPE_VP9_PARSE);
 
+  /**
+   * element-av1parse:
+   *
+   * Since: 1.20
+   */
+  ret |= gst_element_register (plugin, "av1parse",
+      GST_RANK_SECONDARY, GST_TYPE_AV1_PARSE);
+
   return ret;
 }
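
As a complement to the gst-launch example in the element documentation, the same annex B conversion can be driven from application code. The following is a minimal sketch only: the file names sample.av1 and trans.annexb are placeholders, and error handling is reduced to the bare minimum.

```
#include <gst/gst.h>

int
main (int argc, char **argv)
{
  GstElement *pipeline;
  GstBus *bus;
  GstMessage *msg;

  gst_init (&argc, &argv);

  /* The capsfilter after av1parse requests tu alignment in annex B
   * stream-format, so av1parse repacks the OBUs into sized temporal units. */
  pipeline = gst_parse_launch ("filesrc location=sample.av1 ! ivfparse ! "
      "av1parse ! video/x-av1,alignment=tu,stream-format=annexb ! "
      "filesink location=trans.annexb", NULL);
  if (!pipeline)
    return -1;

  gst_element_set_state (pipeline, GST_STATE_PLAYING);

  /* Block until the stream is fully converted or an error occurs. */
  bus = gst_element_get_bus (pipeline);
  msg = gst_bus_timed_pop_filtered (bus, GST_CLOCK_TIME_NONE,
      GST_MESSAGE_ERROR | GST_MESSAGE_EOS);
  if (msg)
    gst_message_unref (msg);
  gst_object_unref (bus);

  gst_element_set_state (pipeline, GST_STATE_NULL);
  gst_object_unref (pipeline);

  return 0;
}
```

Such a snippet can be built against the core library alone, for example with `gcc demo.c $(pkg-config --cflags --libs gstreamer-1.0)`, where demo.c is whatever the file is named.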