From 913dd3f78ea0d7aa5cc4d61b4a72e1fd97effc70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim-Philipp=20M=C3=BCller?= Date: Wed, 26 Jan 2005 12:38:02 +0000 Subject: [PATCH] Check environment variables GST_ID3V2_TAG_ENCODING, Original commit message from CVS: Check environment variables GST_ID3V2_TAG_ENCODING, GST_ID3_TAG_ENCODING and GST_TAG_ENCODING for a colon-separated list of character encodings to force interpretation of non-unicode strings stored in an ID3v2 tag to a particular encoding. If none is specified, try to use current locale's encoding, then fall back to ISO-8859-1 (which will always succeed). (Resolves #149274) Check environment variables GST_ID3V1_TAG_ENCODING, GST_ID3_TAG_ENCODING and GST_TAG_ENCODING for a colon-separated list of character encodings to use in case a string encountered in an ID3v1 tag is not valid UTF-8 already. If no encoding is specified, try to use the current locale's encoding, then fall back to ISO-8859-1 (which will always succeed). --- ChangeLog | 19 ++++++ ext/mad/gstid3tag.c | 156 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 147 insertions(+), 28 deletions(-) diff --git a/ChangeLog b/ChangeLog index eab0676d0d..950f2b7748 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +2005-01-26 Tim-Philipp Müller + + * ext/mad/gstid3tag.c: (mad_id3_parse_latin1_string), + (mad_id3_parse_comment_frame), (gst_mad_id3_to_tag_list): + Check environment variables GST_ID3V2_TAG_ENCODING, + GST_ID3_TAG_ENCODING and GST_TAG_ENCODING for a colon-separated + list of character encodings to force interpretation of non-unicode + strings stored in an ID3v2 tag to a particular encoding. If none + is specified, try to use current locale's encoding, then fall back + to ISO-8859-1 (which will always succeed). 
(Resolves #149274) + * gst/tags/gstid3tag.c: (gst_tag_from_id3_tag), + (gst_tag_extract_id3v1_string), (gst_tag_list_new_from_id3v1): + Check environment variables GST_ID3V1_TAG_ENCODING, + GST_ID3_TAG_ENCODING and GST_TAG_ENCODING for a colon-separated + list of character encodings to use in case a string encountered + in an ID3v1 tag is not valid UTF-8 already. If no encoding is + specified, try to use the current locale's encoding, then fall + back to ISO-8859-1 (which will always succeed). + 2005-01-25 Benjamin Otte * ext/mad/gstmad.c: (gst_mad_check_caps_reset), (gst_mad_chain): diff --git a/ext/mad/gstid3tag.c b/ext/mad/gstid3tag.c index 64fd5df7a9..830142b528 100644 --- a/ext/mad/gstid3tag.c +++ b/ext/mad/gstid3tag.c @@ -522,6 +522,113 @@ gst_id3_tag_src_event (GstPad * pad, GstEvent * event) return FALSE; } +static id3_utf8_t * +mad_id3_parse_latin1_string (const id3_ucs4_t * ucs4) +{ + gsize bytes_read, size; + const gchar *env; + char *latin1, *ret = NULL; + + latin1 = id3_ucs4_latin1duplicate (ucs4); + if (latin1 == NULL) + return NULL; + + size = strlen (latin1); + + env = g_getenv ("GST_ID3V2_TAG_ENCODING"); + if (!env || *env == '\0') + env = g_getenv ("GST_ID3_TAG_ENCODING"); + if (!env || *env == '\0') + env = g_getenv ("GST_TAG_ENCODING"); + + if (env && *env != '\0') { + gchar **c, **csets; + + csets = g_strsplit (env, G_SEARCHPATH_SEPARATOR_S, -1); + + for (c = csets; !ret && c && *c; ++c) { + gchar *utf8; + + if ((utf8 = + g_convert (latin1, size, "UTF-8", *c, &bytes_read, NULL, NULL))) { + if (bytes_read == size) { + ret = strdup (utf8); + } + g_free (utf8); + } + } + g_strfreev (csets); + } + + /* Try current locale (if not UTF-8). Should we really do this? + * What if the tag is really correct and in ISO-8859-1 and the + * current locale is some other charset where the full byte range + * is valid? In those cases ISO-8859-1 would have to be put into + * one of the above environment variables. 
Do the most common + * non-Western and non-UTF8 character sets modify only the range + * from 0x80-0xff, so that ASCII is still covered at least?) */ + if (!ret && !g_get_charset (&env)) { + gchar *utf8; + + if ((utf8 = g_locale_to_utf8 (latin1, size, &bytes_read, NULL, NULL))) { + if (bytes_read == size) { + ret = strdup (utf8); + } + g_free (utf8); + } + } + + /* Try ISO-8859-1 (this conversion should always succeed) */ + if (!ret) { + gchar *utf8; + + utf8 = + g_convert (latin1, size, "UTF-8", "ISO-8859-1", &bytes_read, NULL, + NULL); + if (utf8 != NULL && bytes_read == size) { + ret = strdup (utf8); + } + g_free (utf8); + } + + free (latin1); + return ret; +} + +static void +mad_id3_parse_comment_frame (GstTagList * tlist, const struct id3_frame *frame) +{ + const id3_ucs4_t *ucs4; + id3_utf8_t *utf8; + + g_assert (frame->nfields >= 4); + + ucs4 = id3_field_getfullstring (&frame->fields[3]); + g_assert (ucs4); + + if (frame->fields[0].type == ID3_FIELD_TYPE_TEXTENCODING + && frame->fields[0].number.value == ID3_FIELD_TEXTENCODING_ISO_8859_1) { + utf8 = mad_id3_parse_latin1_string (ucs4); + } else { + utf8 = id3_ucs4_utf8duplicate (ucs4); + } + + if (utf8 == NULL) + return; + + if (!g_utf8_validate (utf8, -1, NULL)) { + g_warning ("converted string is not valid utf-8"); + g_free (utf8); + return; + } + + g_strchomp (utf8); + + gst_tag_list_add (tlist, GST_TAG_MERGE_APPEND, GST_TAG_COMMENT, utf8, NULL); + + g_free (utf8); +} + GstTagList * gst_mad_id3_to_tag_list (const struct id3_tag * tag) { @@ -534,52 +641,45 @@ gst_mad_id3_to_tag_list (const struct id3_tag * tag) tag_list = gst_tag_list_new (); while ((frame = id3_tag_findframe (tag, NULL, i++)) != NULL) { - const union id3_field *field; + const union id3_field *field, *encfield; unsigned int nstrings, j; const gchar *tag_name; - /* find me the function to query the frame id */ - gchar *id = g_strndup (frame->id, 5); + tag_name = gst_tag_from_id3_tag (frame->id); + if (tag_name == NULL) + continue; - tag_name 
= gst_tag_from_id3_tag (id); - if (tag_name == NULL) { - g_free (id); + if (strncmp (frame->id, "COMM", 5) == 0) { + mad_id3_parse_comment_frame (tag_list, frame); continue; } - if (strcmp (id, "COMM") == 0) { - ucs4 = id3_field_getfullstring (&frame->fields[3]); - g_assert (ucs4); - - utf8 = id3_ucs4_utf8duplicate (ucs4); - if (utf8 == 0) - continue; - - if (!g_utf8_validate (utf8, -1, NULL)) { - g_warning ("converted string is not valid utf-8"); - g_free (utf8); - continue; - } - - gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, - GST_TAG_COMMENT, utf8, NULL); - - g_free (utf8); + if (frame->id[0] != 'T') { + g_warning ("don't know how to parse ID3v2 frame with ID '%s'", frame->id); continue; } + g_assert (frame->nfields >= 2); + field = &frame->fields[1]; nstrings = id3_field_getnstrings (field); + encfield = &frame->fields[0]; for (j = 0; j < nstrings; ++j) { ucs4 = id3_field_getstrings (field, j); g_assert (ucs4); - if (strcmp (id, ID3_FRAME_GENRE) == 0) + if (strncmp (frame->id, ID3_FRAME_GENRE, 5) == 0) ucs4 = id3_genre_name (ucs4); - utf8 = id3_ucs4_utf8duplicate (ucs4); - if (utf8 == 0) + if (encfield->type == ID3_FIELD_TYPE_TEXTENCODING + && encfield->number.value == ID3_FIELD_TEXTENCODING_ISO_8859_1) { + utf8 = mad_id3_parse_latin1_string (ucs4); + } else { + utf8 = id3_ucs4_utf8duplicate (ucs4); + } + + if (utf8 == NULL) continue; if (!g_utf8_validate (utf8, -1, NULL)) { @@ -654,13 +754,13 @@ gst_mad_id3_to_tag_list (const struct id3_tag * tag) } default: g_assert (gst_tag_get_type (tag_name) == G_TYPE_STRING); + g_strchomp (utf8); gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, tag_name, utf8, NULL); break; } free (utf8); } - g_free (id); } return tag_list;