Audacious  $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
chardet.c
Go to the documentation of this file.
1 /*
2  * chardet.c
3  * Copyright 2006-2010 Yoshiki Yazawa, Matti Hämäläinen, and John Lindgren
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions, and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions, and the following disclaimer in the documentation
13  * provided with the distribution.
14  *
15  * This software is provided "as is" and without any warranty, express or
16  * implied. In no event shall the authors be liable for any damages arising from
17  * the use of this software.
18  */
19 
20 #include <glib.h>
21 #include <string.h>
22 #include <libaudcore/audstrings.h>
23 
24 #include "debug.h"
25 #include "i18n.h"
26 #include "main.h"
27 #include "misc.h"
28 
29 #ifdef USE_CHARDET
30 # include <libguess.h>
31 #endif
32 
/*
 * Forward declaration; cd_str_to_utf8() calls this before it is defined.
 *
 * Converts <len> bytes of <str> to UTF-8.  <arg_bytes_read> and
 * <arg_bytes_write> may each be NULL; when non-NULL they receive the byte
 * counts reported by the conversion.  Returns a newly allocated string, or
 * NULL if no conversion produced valid UTF-8.
 */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write);
35 
/*
 * Last-resort conversion: returns a newly allocated copy of <str> with the
 * translated marker " (invalid UTF-8)" appended and every byte that has its
 * high bit set replaced by '?'.  The caller owns the returned string.
 *
 * NOTE(review): the replacement pass runs over the appended marker as well,
 * so a translation containing non-ASCII bytes would also be masked -- confirm
 * whether that is intended.
 */
static char * str_to_utf8_fallback (const char * str)
{
    char * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL);
    char * p = out;

    while (* p)
    {
        if (* p & 0x80)
            * p = '?';

        p ++;
    }

    return out;
}
48 
/*
 * Returns a newly allocated UTF-8 version of the NUL-terminated string <str>,
 * or NULL if <str> is NULL.
 *
 * Strategy, in order:
 *  1. If the input already validates as UTF-8, return a plain copy.  This
 *     guard comes first because callers may hand in strings that were already
 *     converted (the playlist is noted to do so repeatedly); running the
 *     detector on them could reinterpret valid UTF-8 through one of the
 *     configured fallback encodings.  With USE_CHARDET the check is done by
 *     libguess_validate_utf8(), otherwise by g_utf8_validate().
 *  2. Otherwise try cd_chardet_to_utf8().
 *  3. If that fails too, fall back to str_to_utf8_fallback(), which replaces
 *     every byte >= 128 with '?'.
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (! str)
        return NULL;

    const size_t length = strlen (str);

    /* Already UTF-8? */
#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, length))
#else
    if (g_utf8_validate (str, length, NULL))
#endif
    {
        return g_strdup (str);
    }

    /* chardet encoding detector */
    char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);

    /* all else fails, we mask off character codes >= 128, replace with '?' */
    return converted ? converted : str_to_utf8_fallback (str);
}
94 
95 static char * cd_chardet_to_utf8 (const char * str, int len,
96  int * arg_bytes_read, int * arg_bytes_write)
97 {
98  char *ret = NULL;
99  int * bytes_read, * bytes_write;
100  int my_bytes_read, my_bytes_write;
101 
102  bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
103  bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
104 
105  g_return_val_if_fail(str != NULL, NULL);
106 
107 #ifdef USE_CHARDET
108  if (libguess_validate_utf8(str, len))
109 #else
110  if (g_utf8_validate(str, len, NULL))
111 #endif
112  {
113  if (len < 0)
114  len = strlen (str);
115 
116  ret = g_malloc (len + 1);
117  memcpy (ret, str, len);
118  ret[len] = 0;
119 
120  if (arg_bytes_read != NULL)
121  * arg_bytes_read = len;
122  if (arg_bytes_write != NULL)
123  * arg_bytes_write = len;
124 
125  return ret;
126  }
127 
128 #ifdef USE_CHARDET
129  char * det = get_string (NULL, "chardet_detector");
130 
131  if (det[0])
132  {
133  AUDDBG("guess encoding (%s) %s\n", det, str);
134  const char * encoding = libguess_determine_encoding (str, len, det);
135  AUDDBG("encoding = %s\n", encoding);
136  if (encoding)
137  {
138  gsize read_gsize = 0, written_gsize = 0;
139  ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL);
140  * bytes_read = read_gsize;
141  * bytes_write = written_gsize;
142  }
143  }
144 
145  g_free (det);
146 #endif
147 
148  /* If detection failed or was not enabled, try fallbacks (if there are any) */
149  if (! ret)
150  {
151  char * fallbacks = get_string (NULL, "chardet_fallback");
152  char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);
153 
154  for (char * * enc = split; * enc; enc ++)
155  {
156  gsize read_gsize = 0, written_gsize = 0;
157  ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL);
158  * bytes_read = read_gsize;
159  * bytes_write = written_gsize;
160 
161  if (len == *bytes_read)
162  break;
163  else {
164  g_free(ret);
165  ret = NULL;
166  }
167  }
168 
169  g_strfreev (split);
170  g_free (fallbacks);
171  }
172 
173  /* First fallback: locale (duh!) */
174  if (ret == NULL)
175  {
176  gsize read_gsize = 0, written_gsize = 0;
177  ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
178  * bytes_read = read_gsize;
179  * bytes_write = written_gsize;
180  }
181 
182  /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
183  if (ret == NULL)
184  {
185  gsize read_gsize = 0, written_gsize = 0;
186  ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL);
187  * bytes_read = read_gsize;
188  * bytes_write = written_gsize;
189  }
190 
191  if (ret != NULL)
192  {
193  if (g_utf8_validate(ret, -1, NULL))
194  return ret;
195  else
196  {
197  g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
198  g_free(ret);
199  return NULL;
200  }
201  }
202 
203  return NULL; /* If we have no idea, return NULL. */
204 }
205 
206 void chardet_init (void)
207 {
208 #ifdef USE_CHARDET
209  libguess_determine_encoding(NULL, -1, "");
210 #endif
212 }
static char * cd_str_to_utf8(const char *str)
Definition: chardet.c:49
#define _(String)
Definition: i18n.h:25
static char * cd_chardet_to_utf8(const char *str, int len, int *arg_bytes_read, int *arg_bytes_written)
Definition: chardet.c:95
char * get_string(const char *section, const char *name)
Definition: config.c:270
#define AUDDBG(...)
Definition: debug.h:30
void chardet_init(void)
Definition: chardet.c:206
EXPORT void str_set_utf8_impl(char *(*stu_impl)(const char *), char *(*stuf_impl)(const char *, int, int *, int *))
Definition: audstrings.c:58
#define NULL
Definition: core.h:27
static char * str_to_utf8_fallback(const char *str)
Definition: chardet.c:36