From 74e0c05ff7d2270494d616ea2d86811bae5a3d53 Mon Sep 17 00:00:00 2001 From: Mark Nauwelaerts Date: Wed, 23 Jun 2010 11:12:00 +0200 Subject: [PATCH] matroskademux: UTF-8 subtitles may have markup Fixes #616936. --- gst/matroska/matroska-demux.c | 64 ++++++++++++++++++++++++++++++++--- gst/matroska/matroska-ids.h | 1 + 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/gst/matroska/matroska-demux.c b/gst/matroska/matroska-demux.c index 12f719389b..3b20acc702 100644 --- a/gst/matroska/matroska-demux.c +++ b/gst/matroska/matroska-demux.c @@ -112,8 +112,8 @@ static GstStaticPadTemplate subtitle_src_templ = GST_STATIC_PAD_TEMPLATE ("subtitle_%02d", GST_PAD_SRC, GST_PAD_SOMETIMES, - GST_STATIC_CAPS ("text/plain; application/x-ssa; application/x-ass; " - "application/x-usf; video/x-dvd-subpicture; " + GST_STATIC_CAPS ("text/x-pango-markup; application/x-ssa; " + "application/x-ass;application/x-usf; video/x-dvd-subpicture; " "subpicture/x-pgs; subtitle/x-kate; " "application/x-subtitle-unknown") ); @@ -3797,6 +3797,38 @@ gst_matroska_demux_add_wvpk_header (GstElement * element, return GST_FLOW_OK; } +/* @text must be null-terminated */ +static gboolean +gst_matroska_demux_subtitle_chunk_has_tag (GstElement * element, + const gchar * text) +{ + gchar *tag; + + /* yes, this might all lead to false positives ... */ + tag = (gchar *) text; + while ((tag = strchr (tag, '<'))) { + tag++; + if (*tag != '\0' && *(tag + 1) == '>') { + /* some common convenience ones */ + /* maybe any character will do here ? */ + switch (*tag) { + case 'b': + case 'i': + case 'u': + case 's': + return TRUE; + default: + return FALSE; + } + } + } + + if (strstr (text, "invalid_utf8) { if (g_utf8_validate (data, size, NULL)) { - return GST_FLOW_OK; + goto next; } GST_WARNING_OBJECT (element, "subtitle stream %d is not valid UTF-8, this " "is broken according to the matroska specification", stream->num); @@ -3862,6 +3894,29 @@ gst_matroska_demux_check_subtitle_buffer (GstElement * element, gst_buffer_unref (*buf); *buf = newbuf; + data = (const gchar *) GST_BUFFER_DATA (*buf); + size = GST_BUFFER_SIZE (*buf); + +next: + /* caps claim markup text, so we need to escape text, + * except if text is already markup and then needs no further escaping */ + sub_stream->seen_markup_tag = sub_stream->seen_markup_tag || + gst_matroska_demux_subtitle_chunk_has_tag (element, data); + + if (!sub_stream->seen_markup_tag) { + utf8 = g_markup_escape_text (data, size); + + newbuf = gst_buffer_new (); + GST_BUFFER_MALLOCDATA (newbuf) = (guint8 *) utf8; + GST_BUFFER_DATA (newbuf) = (guint8 *) utf8; + GST_BUFFER_SIZE (newbuf) = strlen (utf8); + gst_buffer_copy_metadata (newbuf, *buf, + GST_BUFFER_COPY_TIMESTAMPS | GST_BUFFER_COPY_FLAGS); + gst_buffer_unref (*buf); + + *buf = newbuf; + } + return GST_FLOW_OK; } @@ -6249,7 +6304,8 @@ gst_matroska_demux_subtitle_caps (GstMatroskaTrackSubtitleContext * /* TODO: Add GST_MATROSKA_CODEC_ID_SUBTITLE_BMP support * Check if we have to do something with codec_private */ if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_UTF8)) { - caps = gst_caps_new_simple ("text/plain", NULL); + /* well, plain text simply does not have a lot of markup ... */ + caps = gst_caps_new_simple ("text/x-pango-markup", NULL); context->postprocess_frame = gst_matroska_demux_check_subtitle_buffer; } else if (!strcmp (codec_id, GST_MATROSKA_CODEC_ID_SUBTITLE_SSA)) { caps = gst_caps_new_simple ("application/x-ssa", NULL); diff --git a/gst/matroska/matroska-ids.h b/gst/matroska/matroska-ids.h index 9027add1b3..dcb8bf6dd2 100644 --- a/gst/matroska/matroska-ids.h +++ b/gst/matroska/matroska-ids.h @@ -567,6 +567,7 @@ typedef struct _GstMatroskaTrackSubtitleContext { gboolean check_utf8; /* buffers should be valid UTF-8 */ gboolean invalid_utf8; /* work around broken files */ + gboolean seen_markup_tag; /* markup found in text */ } GstMatroskaTrackSubtitleContext; typedef struct _GstMatroskaIndex {