From ca6e20ae1aa33ff1e51145c969e066c8cda6a796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim-Philipp=20M=C3=BCller?= Date: Wed, 8 Mar 2006 17:11:29 +0000 Subject: [PATCH] gst/typefind/gsttypefindfunctions.c: Make plain/text typefinder more conservative: firstly, check for embedded zeroes... Original commit message from CVS: * gst/typefind/gsttypefindfunctions.c: (utf8_type_find_count_embedded_zeroes), (utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find): Make plain/text typefinder more conservative: firstly, check for embedded zeroes, which are perfectly valid UTF-8 characters, but also a fairly good sign that something is not a plain text file; secondly, probe into the middle of the file if possible. If we can't probe into the middle, limit the probability value to be returned to TYPE_FIND_POSSIBLE (see #333900). --- ChangeLog | 12 +++++ gst/typefind/gsttypefindfunctions.c | 81 +++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4b6d244d0c..2e97630bca 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2006-03-08 Tim-Philipp Müller + + * gst/typefind/gsttypefindfunctions.c: + (utf8_type_find_count_embedded_zeroes), + (utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find): + Make plain/text typefinder more conservative: firstly, check + for embedded zeroes, which are perfectly valid UTF-8 characters, + but also a fairly good sign that something is not a plain text + file; secondly, probe into the middle of the file if possible. + If we can't probe into the middle, limit the probability value + to be returned to TYPE_FIND_POSSIBLE (see #333900). 
+ 2006-03-08 Michael Smith * gst/typefind/gsttypefindfunctions.c: (plugin_init): diff --git a/gst/typefind/gsttypefindfunctions.c b/gst/typefind/gsttypefindfunctions.c index 73a8332bd1..923625660c 100644 --- a/gst/typefind/gsttypefindfunctions.c +++ b/gst/typefind/gsttypefindfunctions.c @@ -45,35 +45,94 @@ static gboolean xml_check_first_element (GstTypeFind * tf, static GstStaticCaps utf8_caps = GST_STATIC_CAPS ("text/plain"); #define UTF8_CAPS gst_static_caps_get(&utf8_caps) -static void -utf8_type_find (GstTypeFind * tf, gpointer unused) + /* Returns how many of the first size bytes of data are equal to zero. */ +static guint +utf8_type_find_count_embedded_zeroes (const gchar * data, guint size) +{ + guint num = 0; + /* scan from the last byte down to the first */ + while (size > 0) { + if (data[size - 1] == 0) + ++num; + --size; + } + + return num; +} + /* Peeks size bytes at offset, halving size and lowering probability by step after each failed peek. On the first successful peek, returns TRUE with *prob set to the current probability if the data passes the UTF-8 check below and holds at most 2 zero bytes; otherwise returns FALSE with *prob set to 0. Also returns FALSE with *prob set to 0 when no peek succeeds before the loop limits are reached. */ +static gboolean +utf8_type_find_have_valid_utf8_at_offset (GstTypeFind * tf, guint64 offset, + GstTypeFindProbability * prob) { guint8 *data; /* randomly decided values */ - guint size = 1024; /* starting size */ + guint min_size = 16; /* minimum size */ + guint size = 32 * 1024; /* starting size */ guint probability = 95; /* starting probability */ guint step = 10; /* how much we reduce probability in each * iteration */ - /* leave xml to the xml typefinders */ - if (xml_check_first_element (tf, "", 0)) - return; - - while (probability > step) { - data = gst_type_find_peek (tf, 0, size); + while (probability > step && size > min_size) { + data = gst_type_find_peek (tf, offset, size); if (data) { gchar *end; gchar *start = (gchar *) data; if (g_utf8_validate (start, size, (const gchar **) &end) || (end - start + 4 > size)) { /* allow last char to be cut off */ - gst_type_find_suggest (tf, probability, UTF8_CAPS); + /* embedded zeroes suggest that this is not a plain text file; up to 2 of them are tolerated */ + if (utf8_type_find_count_embedded_zeroes (start, size) <= 2) { + *prob = probability; + return TRUE; + } } - return; + *prob = 0; + return FALSE; } size /= 2; probability -= step; } + *prob = 0; + return FALSE; +} + /* Suggests UTF8_CAPS for tf. Returns without a suggestion if xml_check_first_element succeeds or the probe at offset 0 fails. Caps the start probability at GST_TYPE_FIND_POSSIBLE when the reported length is 0 or (guint64) -1, uses the start probability alone when the length is below 64 * 1024, and otherwise also probes at length / 2 and suggests the average of both probabilities. */ +static void +utf8_type_find 
(GstTypeFind * tf, gpointer unused) +{ + GstTypeFindProbability start_prob, mid_prob; + guint64 length; + + /* leave xml to the xml typefinders */ + if (xml_check_first_element (tf, "", 0)) + return; + + /* check beginning of stream */ + if (!utf8_type_find_have_valid_utf8_at_offset (tf, 0, &start_prob)) + return; + + GST_LOG ("start is plain text with probability of %u", start_prob); + + /* POSSIBLE is the highest probability we ever return if we can't + * probe into the middle of the file and don't know its length */ + + length = gst_type_find_get_length (tf); + if (length == 0 || length == (guint64) - 1) { + gst_type_find_suggest (tf, MIN (start_prob, GST_TYPE_FIND_POSSIBLE), + UTF8_CAPS); + return; + } + /* below 64 * 1024 bytes the start probe alone decides the suggestion */ + if (length < 64 * 1024) { + gst_type_find_suggest (tf, start_prob, UTF8_CAPS); + return; + } + + /* check middle of stream */ + if (!utf8_type_find_have_valid_utf8_at_offset (tf, length / 2, &mid_prob)) + return; + /* both probes passed: suggest the mean of the two probabilities */ + GST_LOG ("middle is plain text with probability of %u", mid_prob); + gst_type_find_suggest (tf, (start_prob + mid_prob) / 2, UTF8_CAPS); } /*** text/uri-list ***/