gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?'...
Original commit message from CVS: * gst/subparse/gstsubparse.c: (convert_encoding), (gst_sub_parse_change_state): * gst/subparse/gstsubparse.h: Text subtitle files may or may not be UTF-8. If they are not, we don't really want to see '?' characters in place of non-ASCII characters such as accented characters. So let's assume the input is UTF-8 until we come across text that clearly is not. If it's not UTF-8, we don't really know what it is, so try the following: (a) see whether the GST_SUBTITLE_ENCODING environment variable is set; if not, (b) check whether the current locale encoding is non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if the current locale encoding is UTF-8 and the environment variable was not set to any particular encoding. Not perfect, but better than nothing (and better than before, I think) (fixes #172848).
This commit is contained in:
parent
e7acd7aac6
commit
2ecb455728
17
ChangeLog
17
ChangeLog
@ -1,3 +1,20 @@
|
|||||||
|
2006-03-24 Tim-Philipp Müller <tim at centricular dot net>
|
||||||
|
|
||||||
|
* gst/subparse/gstsubparse.c: (convert_encoding),
|
||||||
|
(gst_sub_parse_change_state):
|
||||||
|
* gst/subparse/gstsubparse.h:
|
||||||
|
Text subtitle files may or may not be UTF-8. If it's not, we
|
||||||
|
don't really want to see '?' characters in place of non-ASCII
|
||||||
|
characters like accented characters. So let's assume the input
|
||||||
|
is UTF-8 until we come across text that is clearly not. If it's
|
||||||
|
not UTF-8, we don't really know what it is, so try the following:
|
||||||
|
(a) see whether the GST_SUBTITLE_ENCODING environment variable
|
||||||
|
is set; if not, check (b) if the current locale encoding is
|
||||||
|
non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if
|
||||||
|
the current locale encoding is UTF-8 and the environment variable
|
||||||
|
was not set to any particular encoding. Not perfect, but better
|
||||||
|
than nothing (and better than before, I think) (fixes #172848).
|
||||||
|
|
||||||
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
|
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
|
||||||
|
|
||||||
* configure.ac:
|
* configure.ac:
|
||||||
|
@ -230,38 +230,45 @@ beach:
|
|||||||
static gchar *
|
static gchar *
|
||||||
convert_encoding (GstSubParse * self, const gchar * str, gsize len)
|
convert_encoding (GstSubParse * self, const gchar * str, gsize len)
|
||||||
{
|
{
|
||||||
gsize bytes_read, bytes_written;
|
const gchar *encoding;
|
||||||
gchar *rv;
|
GError *err = NULL;
|
||||||
GString *converted;
|
gchar *ret;
|
||||||
|
|
||||||
converted = g_string_new (NULL);
|
if (self->valid_utf8) {
|
||||||
while (len) {
|
if (g_utf8_validate (str, len, NULL)) {
|
||||||
#ifndef GST_DISABLE_GST_DEBUG
|
GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
|
||||||
gchar *dbg = g_strndup (str, len);
|
return g_strndup (str, len);
|
||||||
|
|
||||||
GST_DEBUG ("Trying to convert '%s'", dbg);
|
|
||||||
g_free (dbg);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
|
|
||||||
if (rv) {
|
|
||||||
g_string_append_len (converted, rv, bytes_written);
|
|
||||||
g_free (rv);
|
|
||||||
|
|
||||||
len -= bytes_read;
|
|
||||||
str += bytes_read;
|
|
||||||
}
|
}
|
||||||
if (len) {
|
GST_INFO_OBJECT (self, "invalid UTF-8!");
|
||||||
/* conversion error ocurred => skip one char */
|
self->valid_utf8 = FALSE;
|
||||||
len--;
|
}
|
||||||
str++;
|
|
||||||
g_string_append_c (converted, '?');
|
encoding = g_getenv ("GST_SUBTITLE_ENCODING");
|
||||||
|
if (encoding == NULL || *encoding == '\0') {
|
||||||
|
/* if local encoding is UTF-8 and no encoding specified
|
||||||
|
* via the environment variable, assume ISO-8859-15 */
|
||||||
|
if (g_get_charset (&encoding)) {
|
||||||
|
encoding = "ISO-8859-15";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
rv = converted->str;
|
|
||||||
g_string_free (converted, FALSE);
|
ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
|
||||||
GST_DEBUG ("Converted to '%s'", rv);
|
NULL, &err);
|
||||||
return rv;
|
|
||||||
|
if (err) {
|
||||||
|
GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
|
||||||
|
encoding, err->message);
|
||||||
|
g_error_free (err);
|
||||||
|
|
||||||
|
/* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
|
||||||
|
ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
|
||||||
|
NULL, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8"
|
||||||
|
"%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : "");
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static gchar *
|
static gchar *
|
||||||
@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
|
|||||||
/* format detection will init the parser state */
|
/* format detection will init the parser state */
|
||||||
self->offset = self->next_offset = 0;
|
self->offset = self->next_offset = 0;
|
||||||
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
|
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
|
||||||
|
self->valid_utf8 = TRUE;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
@ -81,6 +81,7 @@ struct _GstSubParse {
|
|||||||
gboolean need_segment;
|
gboolean need_segment;
|
||||||
|
|
||||||
gboolean flushing;
|
gboolean flushing;
|
||||||
|
gboolean valid_utf8;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct _GstSubParseClass {
|
struct _GstSubParseClass {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user