gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If they are not, we don't really want to see '?' characters in place of non-ASCII characters.

Original commit message from CVS: * gst/subparse/gstsubparse.c: (convert_encoding), (gst_sub_parse_change_state): * gst/subparse/gstsubparse.h: Text subtitle files may or may not be UTF-8. If they are not, we don't really want to see '?' characters in place of non-ASCII characters such as accented letters. So let's assume the input is UTF-8 until we come across text that is clearly not. If the text is not UTF-8, we don't really know what encoding it is in, so try the following: (a) use the encoding named by the GST_SUBTITLE_ENCODING environment variable if it is set; otherwise (b) use the current locale encoding if it is not UTF-8; or (c) assume ISO-8859-15 if the current locale encoding is UTF-8 and the environment variable was not set to any particular encoding. Not perfect, but better than nothing (and better than before, I think) (fixes #172848).
2006-03-24 17:57:39 +00:00 · 2006-03-24 17:57:39 +00:00 · 2ecb455728
commit 2ecb455728
parent e7acd7aac6
3 changed files with 54 additions and 28 deletions
--- a/17
+++ b/17
@ -1,3 +1,20 @@
 2006-03-24  Tim-Philipp Müller  <tim at centricular dot net>
 	* gst/subparse/gstsubparse.c: (convert_encoding),
 	(gst_sub_parse_change_state):
 	* gst/subparse/gstsubparse.h:
 	  Text subtitle files may or may not be UTF-8. If it's not, we
 	  don't really want to see '?' characters in place of non-ASCII
 	  characters like accented characters. So let's assume the input
 	  is UTF-8 until we come across text that is clearly not. If it's
 	  not UTF-8, we don't really know what it is, so try the following:
 	  (a) see whether the GST_SUBTITLE_ENCODING environment variable
 	  is set; if not, check (b) if the current locale encoding is
 	  non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if
 	  the current locale encoding is UTF-8 and the environment variable
 	  was not set to any particular encoding. Not perfect, but better
 	  than nothing (and better than before, I think) (fixes #172848).
 2006-03-24  Thomas Vander Stichele <thomas at apestaart dot org>
 	* configure.ac:
--- a/gst/subparse/gstsubparse.c
+++ b/gst/subparse/gstsubparse.c
@ -230,38 +230,45 @@ beach:
 static gchar *
 convert_encoding (GstSubParse * self, const gchar * str, gsize len)
 {
-  gsize bytes_read, bytes_written;
+  const gchar *encoding;
-  gchar *rv;
+  GError *err = NULL;
-  GString *converted;
+  gchar *ret;
-  converted = g_string_new (NULL);
+  if (self->valid_utf8) {
-  while (len) {
+    if (g_utf8_validate (str, len, NULL)) {
-#ifndef GST_DISABLE_GST_DEBUG
+      GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
-    gchar *dbg = g_strndup (str, len);
+      return g_strndup (str, len);
    GST_DEBUG ("Trying to convert '%s'", dbg);
    g_free (dbg);
 #endif
    rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
    if (rv) {
      g_string_append_len (converted, rv, bytes_written);
      g_free (rv);
      len -= bytes_read;
      str += bytes_read;
    }
-    if (len) {
+    GST_INFO_OBJECT (self, "invalid UTF-8!");
-      /* conversion error ocurred => skip one char */
+    self->valid_utf8 = FALSE;
-      len--;
+  }
-      str++;
+
-      g_string_append_c (converted, '?');
+  encoding = g_getenv ("GST_SUBTITLE_ENCODING");
  if (encoding == NULL || *encoding == '\0') {
    /* if local encoding is UTF-8 and no encoding specified
     * via the environment variable, assume ISO-8859-15 */
    if (g_get_charset (&encoding)) {
      encoding = "ISO-8859-15";
    }
  }
-  rv = converted->str;
+
-  g_string_free (converted, FALSE);
+  ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
-  GST_DEBUG ("Converted to '%s'", rv);
+      NULL, &err);
-  return rv;
+
  if (err) {
    GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
        encoding, err->message);
    g_error_free (err);
    /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
    ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
        NULL, NULL, NULL);
  }
  GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8"
      "%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : "");
  return ret;
 }
 static gchar *
@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
      /* format detection will init the parser state */
      self->offset = self->next_offset = 0;
      self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
      self->valid_utf8 = TRUE;
      break;
    default:
      break;
--- a/gst/subparse/gstsubparse.h
+++ b/gst/subparse/gstsubparse.h
@ -81,6 +81,7 @@ struct _GstSubParse {
  gboolean need_segment;
  gboolean flushing;
  gboolean valid_utf8;
 };
 struct _GstSubParseClass {