src/cmd/fsexam/src/auto-detect.c
changeset 147 8c4ef02c14b8
equal deleted inserted replaced
146:841e634f8d60 147:8c4ef02c14b8
       
     1 /*
       
     2  * CDDL HEADER START
       
     3  *
       
     4  * The contents of this file are subject to the terms of the
       
     5  * Common Development and Distribution License (the "License").
       
     6  * You may not use this file except in compliance with the License.
       
     7  *
       
     8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
       
     9  * or http://www.opensolaris.org/os/licensing.
       
    10  * See the License for the specific language governing permissions
       
    11  * and limitations under the License.
       
    12  *
       
    13  * When distributing Covered Code, include this CDDL HEADER in each
       
    14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
       
    15  * If applicable, add the following below this CDDL HEADER, with the
       
    16  * fields enclosed by brackets "[]" replaced with your own identifying
       
    17  * information: Portions Copyright [yyyy] [name of copyright owner]
       
    18  *
       
    19  * CDDL HEADER END
       
    20  */
       
    21 
       
    22 /*
       
    23  * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
       
    24  * Use is subject to license terms.
       
    25  */
       
    26 
       
    27 
       
    28 #ifdef HAVE_CONFIG_H
       
    29 #include <config.h>
       
    30 #endif
       
    31 
       
    32 #include <glib.h>
       
    33 #include <glib/gi18n.h>
       
    34 #include <stdlib.h>
       
    35 #include <strings.h>
       
    36 
       
    37 #ifdef   HAVE_AUTO_EF_H
       
    38 #include <auto_ef.h>
       
    39 #endif
       
    40 
       
    41 #include "encoding.h"
       
    42 #include "auto-detect.h"
       
    43 
       
    44 short encoding2id (const gchar *encoding_name);
       
    45 
       
    46 #ifndef  HAVE_AUTO_EF_H
       
    47 gboolean 
       
    48 file_isutf8 (const gchar *filename, gint flags)
       
    49 {
       
    50     gchar *contents = NULL;
       
    51 
       
    52     if (filename == NULL)
       
    53         return FALSE;
       
    54 
       
    55     if (g_file_get_contents (filename, &contents, NULL, NULL)) {
       
    56         gboolean ret = FALSE;
       
    57 
       
    58         if (g_utf8_validate (contents, -1, NULL))
       
    59             ret = TRUE;
       
    60 
       
    61         g_free (contents);
       
    62 
       
    63         return ret;
       
    64     }else{
       
    65         //fsexam_errno = ERR_CANNOT_READ;   delay error handling 
       
    66     }
       
    67 
       
    68     return FALSE;
       
    69 }
       
    70 
       
    71 GList *
       
    72 file_encoding_detect (const gchar *filename, gint flags)
       
    73 {
       
    74     return NULL;
       
    75 }
       
    76 
       
    77 GList *
       
    78 str_encoding_detect (const gchar *string, gint flags)
       
    79 {
       
    80     return NULL;
       
    81 }
       
    82 
       
    83 void
       
    84 auto_encoding_free (GList *list)
       
    85 {
       
    86     return;
       
    87 }
       
    88 
       
    89 #else
       
    90 
       
/*
 * Lookup table from the encoding names handed to
 * auto_encoding_to_canonical_name() (first column) to the canonical names
 * it returns (second column).
 *
 * The rows must stay sorted by the first column in g_ascii_strcasecmp()
 * order: auto_encoding_to_canonical_name() binary-searches this table.
 *
 * NOTE(review): "koi8-r" maps to "8859-5", which lacks the "ISO-" prefix
 * every other ISO-8859 target uses -- confirm this is the intended
 * canonical name.
 */
static const gchar *autoen_map[][2] = {
  /* auto detected          canonical */
    {"8859-1",              "ISO-8859-1"},
    {"8859-2",              "ISO-8859-2"},
    {"8859-5",              "ISO-8859-5"},
    {"8859-6",              "ISO-8859-6"},
    {"8859-7",              "ISO-8859-7"},
    {"8859-8",              "ISO-8859-8"},
    {"ascii",               "UTF-8"},
    {"iso-8859-1",          "ISO-8859-1"},
    {"iso-8859-2",          "ISO-8859-2"},
    {"iso-8859-5",          "ISO-8859-5"},
    {"iso-8859-6",          "ISO-8859-6"},
    {"iso-8859-7",          "ISO-8859-7"},
    {"iso-8859-8",          "ISO-8859-8"},
    {"ko_KR.cp949",         "cp949"}, 
    {"ko_KR.euc",           "EUC-KR"},
    {"koi8-r",              "8859-5"},
    {"zh_CN.euc",           "GB18030"},
    {"zh_CN.GB18030",       "GB18030"},
    {"zh_CN.iso2022-CN",    "GB2312"},
    {"zh_HK.hkscs",         "BIG5-HKSCS"},
    {"zh_TW-big5",          "BIG5"},
    {"zh_TW-euc",           "EUC-TW"},
};
       
   116 
       
   117 /* convert encoding name detected to canonical encoding name */
       
   118 static const gchar *
       
   119 auto_encoding_to_canonical_name (const gchar *auto_encoding_name)
       
   120 {
       
   121     short low = 0;  
       
   122     short high = sizeof (autoen_map) / sizeof (autoen_map[0]) - 1;
       
   123     short mid;
       
   124 
       
   125     if (auto_encoding_name == NULL)
       
   126         return NULL;
       
   127 
       
   128     while (low <= high) {
       
   129         gint result;
       
   130 
       
   131         mid = (low + high) / 2;
       
   132         result = g_ascii_strcasecmp (auto_encoding_name, autoen_map[mid][0]);
       
   133 
       
   134         if (result == 0)
       
   135             return autoen_map[mid][1];
       
   136         else if (result > 0)
       
   137             low = mid + 1;
       
   138         else
       
   139             high = mid - 1;
       
   140     }
       
   141 
       
   142     /* may be in canonical name already */
       
   143     if (encoding2id (auto_encoding_name) != -1)
       
   144         return auto_encoding_name;
       
   145 
       
   146     return NULL;
       
   147 }
       
   148 
       
   149 gboolean
       
   150 file_isutf8 (const gchar *filename, gint flags)
       
   151 {
       
   152     auto_ef_t   *array_info = NULL;
       
   153     size_t      number = 0;
       
   154     gchar       *encoding = NULL;
       
   155     double      score = 0;
       
   156     gint        ret = FALSE;
       
   157 
       
   158     if (filename == NULL)
       
   159         return ret;
       
   160 
       
   161     number = auto_ef_file (&array_info, filename, flags);
       
   162     if (number != 1){
       
   163         ret = FALSE;
       
   164         goto free;
       
   165     }
       
   166 
       
   167     score = auto_ef_get_score (array_info[0]);
       
   168     if (abs(score - 100.0) > INACCURACY) {
       
   169         ret = FALSE;
       
   170         goto free;
       
   171     }
       
   172     encoding = auto_ef_get_encoding (array_info[0]);
       
   173     if ((strcmp (encoding, "UTF-8") == 0) || (strcmp (encoding, "ASCII") == 0)){
       
   174         ret = TRUE;
       
   175     }
       
   176 
       
   177 free:
       
   178     auto_ef_free (array_info);
       
   179 
       
   180     return ret;
       
   181 }
       
   182 
       
   183 GList *
       
   184 file_encoding_detect (const gchar *filename, gint flags)
       
   185 {
       
   186     EncodingPair    *pair = NULL;
       
   187     GList           *result = NULL;
       
   188     auto_ef_t       *array_info = NULL;
       
   189     size_t          number = 0;
       
   190     gint            i;
       
   191 
       
   192     if (filename == NULL)
       
   193         return NULL;
       
   194 
       
   195     number = auto_ef_file (&array_info, filename, flags);
       
   196     if (-1 == number)
       
   197         return NULL;
       
   198 
       
   199     for (i = number - 1; i >= 0; i--){
       
   200         const gchar *canonical_name = NULL;
       
   201 
       
   202         canonical_name =  auto_encoding_to_canonical_name (
       
   203                         auto_ef_get_encoding (array_info[i]));
       
   204 
       
   205         if (canonical_name == NULL) {
       
   206             g_print (_("Warning: can not convert encoding %s to canonical encoding name, will ignore it.\n"), 
       
   207                       auto_ef_get_encoding (array_info[i]));
       
   208             continue;
       
   209         }
       
   210 
       
   211         pair = g_new (EncodingPair, 1);
       
   212 
       
   213         pair->encoding_name = g_strdup (canonical_name);
       
   214         pair->score = auto_ef_get_score (array_info[i]);
       
   215 
       
   216         result = g_list_append (result, pair);
       
   217     }
       
   218 
       
   219 free:
       
   220     auto_ef_free (array_info);
       
   221     return result;
       
   222 }
       
   223 
       
   224 GList *
       
   225 str_encoding_detect (const gchar *string, gint flags)
       
   226 {
       
   227     EncodingPair    *pair = NULL;
       
   228     GList           *result = NULL;
       
   229     auto_ef_t       *array_info = NULL;
       
   230     size_t          number = 0;
       
   231     gint            i = 0;
       
   232 
       
   233     if (string == NULL)
       
   234         return NULL;
       
   235 
       
   236     number = auto_ef_str (&array_info, string, strlen(string), flags);
       
   237     if (-1 == number)
       
   238         return NULL;
       
   239 
       
   240     for (i = number - 1; i >= 0; i--){
       
   241         const gchar *canonical_name = NULL;
       
   242 
       
   243         canonical_name =  auto_encoding_to_canonical_name (
       
   244                         auto_ef_get_encoding (array_info[i]));
       
   245 
       
   246         if (canonical_name == NULL) {
       
   247             g_print (_("Warning: can not convert encoding %s to canonical encoding name, will ignore it.\n"), 
       
   248                       auto_ef_get_encoding (array_info[i]));
       
   249             continue;
       
   250         }
       
   251 
       
   252         pair = g_new (EncodingPair, 1);
       
   253 
       
   254         pair->encoding_name = g_strdup (canonical_name);
       
   255         pair->score = auto_ef_get_score (array_info[i]);
       
   256 
       
   257         result = g_list_append (result, pair);
       
   258     }
       
   259 
       
   260 free:
       
   261     auto_ef_free (array_info);
       
   262 
       
   263     return result;
       
   264 }
       
   265 
       
   266 void
       
   267 auto_encoding_free (GList *list)
       
   268 {
       
   269     EncodingPair    *pair = NULL;
       
   270 
       
   271     if (NULL == list)
       
   272         return;
       
   273 
       
   274     while (list){
       
   275         pair = list->data;
       
   276         if (pair != NULL)
       
   277             g_free (pair->encoding_name);
       
   278 
       
   279         list = list->next;
       
   280     }
       
   281 
       
   282     g_list_free (list);
       
   283 
       
   284     return;
       
   285 }
       
   286 #endif
       
   287 
       
   288 gboolean
       
   289 str_isutf8 (const gchar *string, gint flags)
       
   290 {
       
   291     if (g_utf8_validate (string, -1, NULL))
       
   292         return TRUE;
       
   293     else
       
   294         return FALSE;
       
   295 }