gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?'...

Original commit message from CVS:
* gst/subparse/gstsubparse.c: (convert_encoding),
(gst_sub_parse_change_state):
* gst/subparse/gstsubparse.h:
Text subtitle files may or may not be UTF-8. If they are not, we
don't really want to see '?' characters in place of non-ASCII
characters like accented characters. So let's assume the input
is UTF-8 until we come across text that is clearly not. If it's
not UTF-8, we don't really know what it is, so try the following:
(a) use the encoding named by the GST_SUBTITLE_ENCODING environment
variable if it is set; if not, (b) use the current locale encoding
if that is not UTF-8, or (c) assume ISO-8859-15 if the current
locale encoding is UTF-8 and the environment variable was not set
to any particular encoding. Not perfect, but better
than nothing (and better than before, I think) (fixes #172848).
This commit is contained in:
Tim-Philipp Müller 2006-03-24 17:57:39 +00:00
parent e7acd7aac6
commit 2ecb455728
3 changed files with 54 additions and 28 deletions

View File

@ -1,3 +1,20 @@
2006-03-24 Tim-Philipp Müller <tim at centricular dot net>
* gst/subparse/gstsubparse.c: (convert_encoding),
(gst_sub_parse_change_state):
* gst/subparse/gstsubparse.h:
Text subtitle files may or may not be UTF-8. If it's not, we
don't really want to see '?' characters in place of non-ASCII
characters like accented characters. So let's assume the input
is UTF-8 until we come across text that is clearly not. If it's
not UTF-8, we don't really know what it is, so try the following:
(a) see whether the GST_SUBTITLE_ENCODING environment variable
is set; if not, check (b) if the current locale encoding is
non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if
the current locale encoding is UTF-8 and the environment variable
was not set to any particular encoding. Not perfect, but better
than nothing (and better than before, I think) (fixes #172848).
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org> 2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
* configure.ac: * configure.ac:

View File

@ -230,38 +230,45 @@ beach:
static gchar * static gchar *
convert_encoding (GstSubParse * self, const gchar * str, gsize len) convert_encoding (GstSubParse * self, const gchar * str, gsize len)
{ {
gsize bytes_read, bytes_written; const gchar *encoding;
gchar *rv; GError *err = NULL;
GString *converted; gchar *ret;
converted = g_string_new (NULL); if (self->valid_utf8) {
while (len) { if (g_utf8_validate (str, len, NULL)) {
#ifndef GST_DISABLE_GST_DEBUG GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
gchar *dbg = g_strndup (str, len); return g_strndup (str, len);
GST_DEBUG ("Trying to convert '%s'", dbg);
g_free (dbg);
#endif
rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
if (rv) {
g_string_append_len (converted, rv, bytes_written);
g_free (rv);
len -= bytes_read;
str += bytes_read;
} }
if (len) { GST_INFO_OBJECT (self, "invalid UTF-8!");
/* conversion error ocurred => skip one char */ self->valid_utf8 = FALSE;
len--; }
str++;
g_string_append_c (converted, '?'); encoding = g_getenv ("GST_SUBTITLE_ENCODING");
if (encoding == NULL || *encoding == '\0') {
/* if local encoding is UTF-8 and no encoding specified
* via the environment variable, assume ISO-8859-15 */
if (g_get_charset (&encoding)) {
encoding = "ISO-8859-15";
} }
} }
rv = converted->str;
g_string_free (converted, FALSE); ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
GST_DEBUG ("Converted to '%s'", rv); NULL, &err);
return rv;
if (err) {
GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
encoding, err->message);
g_error_free (err);
/* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
NULL, NULL, NULL);
}
GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8"
"%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : "");
return ret;
} }
static gchar * static gchar *
@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
/* format detection will init the parser state */ /* format detection will init the parser state */
self->offset = self->next_offset = 0; self->offset = self->next_offset = 0;
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN; self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
self->valid_utf8 = TRUE;
break; break;
default: default:
break; break;

View File

@ -81,6 +81,7 @@ struct _GstSubParse {
gboolean need_segment; gboolean need_segment;
gboolean flushing; gboolean flushing;
gboolean valid_utf8;
}; };
struct _GstSubParseClass { struct _GstSubParseClass {