Imported Upstream version 0.10.0

author: Alessandro Ghedini <alessandro@ghedini.me> 2015-09-03 21:22:56 +0200
committer: Alessandro Ghedini <alessandro@ghedini.me> 2015-09-03 21:22:56 +0200
commit: 97616326aa35c74f085c8b7b5bc4a497e049febc (patch)
tree: 789f334c888d3f0cdb42d202efe76622b0d2afcc /misc
parent: 69a0ce1df673d26271df9c1f58f14b6314538210 (diff)
2 files changed, 61 insertions, 13 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 31f53cc..bceb52a 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
 #include <libguess.h>
 #endif
 
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
 #if HAVE_ICONV
 #include <iconv.h>
 #endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "uchardet") == 0 ||
            bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -102,6 +107,11 @@ static const char *ms_bom_guess(bstr buf)
 #if HAVE_ENCA
 static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
 {
+    // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
+    // (suggested by divVerent). Explicitly allow cut-off UTF-8.
+    if (bstr_validate_utf8(buf) > -8)
+        return "UTF-8";
+
     if (!language || !language[0])
         language = "__"; // neutral language
 
@@ -145,32 +155,58 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
 }
 #endif
 
+#if HAVE_UCHARDET
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{   // Guess buf's charset with uchardet; NULL if nothing usable was detected.
+    uchardet_t det = uchardet_new();
+    if (!det)
+        return NULL;
+    if (uchardet_handle_data(det, buf.start, buf.len) != 0) { // whole buffer in one call
+        uchardet_delete(det);
+        return NULL;
+    }
+    uchardet_data_end(det); // signal end of input before querying the result
+    char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det)); // copy under talloc_ctx
+    if (res && !res[0]) // an empty name is treated as "nothing detected"
+        res = NULL;
+    if (res) {
+        iconv_t icdsc = iconv_open("UTF-8", res); // probe only: can iconv do res -> UTF-8?
+        if (icdsc == (iconv_t)(-1)) {
+            mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+                    res);
+            res = NULL; // rejected; the copy is not freed here and stays under talloc_ctx
+        } else {
+            iconv_close(icdsc); // the descriptor was needed for the probe only
+        }
+    }
+    uchardet_delete(det);
+    return res; // NULL, or a string allocated under talloc_ctx
+}
+#endif
+
 // Runs charset auto-detection on the input buffer, and returns the result.
 // If auto-detection fails, NULL is returned.
 // If user_cp doesn't refer to any known auto-detection (for example because
 // it's a real iconv codepage), user_cp is returned without even looking at
 // the buf data.
-const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
-                             int flags)
+// The return value may (but doesn't have to) be allocated under talloc_ctx.
+const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
+                             const char *user_cp, int flags)
 {
     if (!mp_charset_requires_guess(user_cp))
         return user_cp;
 
     bool use_auto = strcasecmp(user_cp, "auto") == 0;
     if (use_auto) {
-#if HAVE_ENCA
+#if HAVE_UCHARDET
+        user_cp = "uchardet";
+#elif HAVE_ENCA
         user_cp = "enca";
 #else
         user_cp = "UTF-8:UTF-8-BROKEN";
 #endif
     }
 
-    // Do our own UTF-8 detection, because at least ENCA seems to get it
-    // wrong sometimes (suggested by divVerent).
-    int r = bstr_validate_utf8(buf);
-    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
-        return "UTF-8";
-
     bstr params[3] = {{0}};
     split_colon(user_cp, 3, params);
 
@@ -195,9 +231,17 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     if (bstrcasecmp0(type, "guess") == 0)
         res = libguess_guess(log, buf, lang);
 #endif
+#if HAVE_UCHARDET
+    if (bstrcasecmp0(type, "uchardet") == 0)
+        res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
     if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
         if (!fallback)
             fallback = params[1].start; // must be already 0-terminated
+        int r = bstr_validate_utf8(buf);
+        if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
+            res = "utf-8";
     }
 
     if (res) {
@@ -211,6 +255,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     if (!res && !(flags & MP_STRICT_UTF8))
         res = "UTF-8-BROKEN";
 
+    mp_verbose(log, "Using charset '%s'.\n", res);
     return res;
 }
 
@@ -225,8 +270,11 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
 bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf,
                                        const char *user_cp, int flags)
 {
-    return mp_iconv_to_utf8(log, buf, mp_charset_guess(log, buf, user_cp, flags),
-                            flags);
+    void *tmp = talloc_new(NULL); // scratch context: the guessed name may be allocated under it
+    const char *cp = mp_charset_guess(tmp, log, buf, user_cp, flags);
+    bstr res = mp_iconv_to_utf8(log, buf, cp, flags);
+    talloc_free(tmp); // cp may dangle from here on; only res is returned
+    return res;
 }
 
 // Use iconv to convert buf to UTF-8.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 93bd91c..bd76ae0 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -14,8 +14,8 @@ enum {
 
 bool mp_charset_is_utf8(const char *user_cp);
 bool mp_charset_requires_guess(const char *user_cp);
-const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
-                             int flags);
+const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
+                             const char *user_cp, int flags);
 bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf,
                                        const char *user_cp, int flags);
 bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags);
author	Alessandro Ghedini <alessandro@ghedini.me>	2015-09-03 21:22:56 +0200
committer	Alessandro Ghedini <alessandro@ghedini.me>	2015-09-03 21:22:56 +0200
commit	97616326aa35c74f085c8b7b5bc4a497e049febc (patch)
tree	789f334c888d3f0cdb42d202efe76622b0d2afcc /misc
parent	69a0ce1df673d26271df9c1f58f14b6314538210 (diff)