|
1 /* |
|
2 * CDDL HEADER START |
|
3 * |
|
4 * The contents of this file are subject to the terms of the |
|
5 * Common Development and Distribution License (the "License"). |
|
6 * You may not use this file except in compliance with the License. |
|
7 * |
|
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 * or http://www.opensolaris.org/os/licensing. |
|
10 * See the License for the specific language governing permissions |
|
11 * and limitations under the License. |
|
12 * |
|
13 * When distributing Covered Code, include this CDDL HEADER in each |
|
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 * If applicable, add the following below this CDDL HEADER, with the |
|
16 * fields enclosed by brackets "[]" replaced with your own identifying |
|
17 * information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 * |
|
19 * CDDL HEADER END |
|
20 */ |
|
21 |
|
22 /* |
|
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
|
24 * Use is subject to license terms. |
|
25 */ |
|
26 |
|
27 |
|
28 #ifdef HAVE_CONFIG_H |
|
29 #include <config.h> |
|
30 #endif |
|
31 |
|
32 #include <glib.h> |
|
33 #include <glib/gi18n.h> |
|
34 #include <stdlib.h> |
|
35 #include <strings.h> |
|
36 |
|
37 #ifdef HAVE_AUTO_EF_H |
|
38 #include <auto_ef.h> |
|
39 #endif |
|
40 |
|
41 #include "encoding.h" |
|
42 #include "auto-detect.h" |
|
43 |
|
44 short encoding2id (const gchar *encoding_name); |
|
45 |
|
46 #ifndef HAVE_AUTO_EF_H |
|
47 gboolean |
|
48 file_isutf8 (const gchar *filename, gint flags) |
|
49 { |
|
50 gchar *contents = NULL; |
|
51 |
|
52 if (filename == NULL) |
|
53 return FALSE; |
|
54 |
|
55 if (g_file_get_contents (filename, &contents, NULL, NULL)) { |
|
56 gboolean ret = FALSE; |
|
57 |
|
58 if (g_utf8_validate (contents, -1, NULL)) |
|
59 ret = TRUE; |
|
60 |
|
61 g_free (contents); |
|
62 |
|
63 return ret; |
|
64 }else{ |
|
65 //fsexam_errno = ERR_CANNOT_READ; delay error handling |
|
66 } |
|
67 |
|
68 return FALSE; |
|
69 } |
|
70 |
|
71 GList * |
|
72 file_encoding_detect (const gchar *filename, gint flags) |
|
73 { |
|
74 return NULL; |
|
75 } |
|
76 |
|
77 GList * |
|
78 str_encoding_detect (const gchar *string, gint flags) |
|
79 { |
|
80 return NULL; |
|
81 } |
|
82 |
|
83 void |
|
84 auto_encoding_free (GList *list) |
|
85 { |
|
86 return; |
|
87 } |
|
88 |
|
89 #else |
|
90 |
|
91 static const gchar *autoen_map[][2] = { |
|
92 /* auto detected canonical */ |
|
93 {"8859-1", "ISO-8859-1"}, |
|
94 {"8859-2", "ISO-8859-2"}, |
|
95 {"8859-5", "ISO-8859-5"}, |
|
96 {"8859-6", "ISO-8859-6"}, |
|
97 {"8859-7", "ISO-8859-7"}, |
|
98 {"8859-8", "ISO-8859-8"}, |
|
99 {"ascii", "UTF-8"}, |
|
100 {"iso-8859-1", "ISO-8859-1"}, |
|
101 {"iso-8859-2", "ISO-8859-2"}, |
|
102 {"iso-8859-5", "ISO-8859-5"}, |
|
103 {"iso-8859-6", "ISO-8859-6"}, |
|
104 {"iso-8859-7", "ISO-8859-7"}, |
|
105 {"iso-8859-8", "ISO-8859-8"}, |
|
106 {"ko_KR.cp949", "cp949"}, |
|
107 {"ko_KR.euc", "EUC-KR"}, |
|
108 {"koi8-r", "8859-5"}, |
|
109 {"zh_CN.euc", "GB18030"}, |
|
110 {"zh_CN.GB18030", "GB18030"}, |
|
111 {"zh_CN.iso2022-CN", "GB2312"}, |
|
112 {"zh_HK.hkscs", "BIG5-HKSCS"}, |
|
113 {"zh_TW-big5", "BIG5"}, |
|
114 {"zh_TW-euc", "EUC-TW"}, |
|
115 }; |
|
116 |
|
117 /* convert encoding name detected to canonical encoding name */ |
|
118 static const gchar * |
|
119 auto_encoding_to_canonical_name (const gchar *auto_encoding_name) |
|
120 { |
|
121 short low = 0; |
|
122 short high = sizeof (autoen_map) / sizeof (autoen_map[0]) - 1; |
|
123 short mid; |
|
124 |
|
125 if (auto_encoding_name == NULL) |
|
126 return NULL; |
|
127 |
|
128 while (low <= high) { |
|
129 gint result; |
|
130 |
|
131 mid = (low + high) / 2; |
|
132 result = g_ascii_strcasecmp (auto_encoding_name, autoen_map[mid][0]); |
|
133 |
|
134 if (result == 0) |
|
135 return autoen_map[mid][1]; |
|
136 else if (result > 0) |
|
137 low = mid + 1; |
|
138 else |
|
139 high = mid - 1; |
|
140 } |
|
141 |
|
142 /* may be in canonical name already */ |
|
143 if (encoding2id (auto_encoding_name) != -1) |
|
144 return auto_encoding_name; |
|
145 |
|
146 return NULL; |
|
147 } |
|
148 |
|
149 gboolean |
|
150 file_isutf8 (const gchar *filename, gint flags) |
|
151 { |
|
152 auto_ef_t *array_info = NULL; |
|
153 size_t number = 0; |
|
154 gchar *encoding = NULL; |
|
155 double score = 0; |
|
156 gint ret = FALSE; |
|
157 |
|
158 if (filename == NULL) |
|
159 return ret; |
|
160 |
|
161 number = auto_ef_file (&array_info, filename, flags); |
|
162 if (number != 1){ |
|
163 ret = FALSE; |
|
164 goto free; |
|
165 } |
|
166 |
|
167 score = auto_ef_get_score (array_info[0]); |
|
168 if (abs(score - 100.0) > INACCURACY) { |
|
169 ret = FALSE; |
|
170 goto free; |
|
171 } |
|
172 encoding = auto_ef_get_encoding (array_info[0]); |
|
173 if ((strcmp (encoding, "UTF-8") == 0) || (strcmp (encoding, "ASCII") == 0)){ |
|
174 ret = TRUE; |
|
175 } |
|
176 |
|
177 free: |
|
178 auto_ef_free (array_info); |
|
179 |
|
180 return ret; |
|
181 } |
|
182 |
|
183 GList * |
|
184 file_encoding_detect (const gchar *filename, gint flags) |
|
185 { |
|
186 EncodingPair *pair = NULL; |
|
187 GList *result = NULL; |
|
188 auto_ef_t *array_info = NULL; |
|
189 size_t number = 0; |
|
190 gint i; |
|
191 |
|
192 if (filename == NULL) |
|
193 return NULL; |
|
194 |
|
195 number = auto_ef_file (&array_info, filename, flags); |
|
196 if (-1 == number) |
|
197 return NULL; |
|
198 |
|
199 for (i = number - 1; i >= 0; i--){ |
|
200 const gchar *canonical_name = NULL; |
|
201 |
|
202 canonical_name = auto_encoding_to_canonical_name ( |
|
203 auto_ef_get_encoding (array_info[i])); |
|
204 |
|
205 if (canonical_name == NULL) { |
|
206 g_print (_("Warning: can not convert encoding %s to canonical encoding name, will ignore it.\n"), |
|
207 auto_ef_get_encoding (array_info[i])); |
|
208 continue; |
|
209 } |
|
210 |
|
211 pair = g_new (EncodingPair, 1); |
|
212 |
|
213 pair->encoding_name = g_strdup (canonical_name); |
|
214 pair->score = auto_ef_get_score (array_info[i]); |
|
215 |
|
216 result = g_list_append (result, pair); |
|
217 } |
|
218 |
|
219 free: |
|
220 auto_ef_free (array_info); |
|
221 return result; |
|
222 } |
|
223 |
|
224 GList * |
|
225 str_encoding_detect (const gchar *string, gint flags) |
|
226 { |
|
227 EncodingPair *pair = NULL; |
|
228 GList *result = NULL; |
|
229 auto_ef_t *array_info = NULL; |
|
230 size_t number = 0; |
|
231 gint i = 0; |
|
232 |
|
233 if (string == NULL) |
|
234 return NULL; |
|
235 |
|
236 number = auto_ef_str (&array_info, string, strlen(string), flags); |
|
237 if (-1 == number) |
|
238 return NULL; |
|
239 |
|
240 for (i = number - 1; i >= 0; i--){ |
|
241 const gchar *canonical_name = NULL; |
|
242 |
|
243 canonical_name = auto_encoding_to_canonical_name ( |
|
244 auto_ef_get_encoding (array_info[i])); |
|
245 |
|
246 if (canonical_name == NULL) { |
|
247 g_print (_("Warning: can not convert encoding %s to canonical encoding name, will ignore it.\n"), |
|
248 auto_ef_get_encoding (array_info[i])); |
|
249 continue; |
|
250 } |
|
251 |
|
252 pair = g_new (EncodingPair, 1); |
|
253 |
|
254 pair->encoding_name = g_strdup (canonical_name); |
|
255 pair->score = auto_ef_get_score (array_info[i]); |
|
256 |
|
257 result = g_list_append (result, pair); |
|
258 } |
|
259 |
|
260 free: |
|
261 auto_ef_free (array_info); |
|
262 |
|
263 return result; |
|
264 } |
|
265 |
|
266 void |
|
267 auto_encoding_free (GList *list) |
|
268 { |
|
269 EncodingPair *pair = NULL; |
|
270 |
|
271 if (NULL == list) |
|
272 return; |
|
273 |
|
274 while (list){ |
|
275 pair = list->data; |
|
276 if (pair != NULL) |
|
277 g_free (pair->encoding_name); |
|
278 |
|
279 list = list->next; |
|
280 } |
|
281 |
|
282 g_list_free (list); |
|
283 |
|
284 return; |
|
285 } |
|
286 #endif |
|
287 |
|
288 gboolean |
|
289 str_isutf8 (const gchar *string, gint flags) |
|
290 { |
|
291 if (g_utf8_validate (string, -1, NULL)) |
|
292 return TRUE; |
|
293 else |
|
294 return FALSE; |
|
295 } |