summary refs log tree commit diff
path: root/misc
diff options
context:
space:
mode:
author Alessandro Ghedini <alessandro@ghedini.me> 2015-09-03 21:22:56 +0200
committer Alessandro Ghedini <alessandro@ghedini.me> 2015-09-03 21:22:56 +0200
commit 97616326aa35c74f085c8b7b5bc4a497e049febc (patch)
tree 789f334c888d3f0cdb42d202efe76622b0d2afcc /misc
parent 69a0ce1df673d26271df9c1f58f14b6314538210 (diff)
Imported Upstream version 0.10.0
Diffstat (limited to 'misc')
-rw-r--r-- misc/charset_conv.c 70
-rw-r--r-- misc/charset_conv.h 4
2 files changed, 61 insertions, 13 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 31f53cc..bceb52a 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
#include <libguess.h>
#endif
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
#if HAVE_ICONV
#include <iconv.h>
#endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
// by default, plus a codepage that is used if the input is not UTF-8.
return bstrcasecmp0(res[0], "enca") == 0 ||
+ bstrcasecmp0(res[0], "uchardet") == 0 ||
bstrcasecmp0(res[0], "auto") == 0 ||
bstrcasecmp0(res[0], "guess") == 0 ||
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -102,6 +107,11 @@ static const char *ms_bom_guess(bstr buf)
#if HAVE_ENCA
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
{
+ // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
+ // (suggested by divVerent). Explicitly allow cut-off UTF-8.
+ if (bstr_validate_utf8(buf) > -8)
+ return "UTF-8";
+
if (!language || !language[0])
language = "__"; // neutral language
@@ -145,32 +155,58 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
}
#endif
+#if HAVE_UCHARDET
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{ // Runs uchardet over buf; returns a charset name copied under talloc_ctx, or NULL (see below).
+ uchardet_t det = uchardet_new();
+ if (!det) // no detector could be created: report "no result"
+ return NULL;
+ if (uchardet_handle_data(det, buf.start, buf.len) != 0) { // non-zero is treated as failure
+ uchardet_delete(det); // release the detector on the early-exit path too
+ return NULL;
+ }
+ uchardet_data_end(det); // the whole buffer was fed in one call; no more input follows
+ char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det)); // copy made before det is deleted below; presumably the original string belongs to det -- TODO confirm
+ if (res && !res[0]) // an empty charset name counts as "nothing detected"
+ res = NULL;
+ if (res) {
+ iconv_t icdsc = iconv_open("UTF-8", res); // probe only: checks iconv can convert res -> UTF-8. NOTE(review): iconv is used here while this function is guarded by HAVE_UCHARDET alone -- confirm builds without HAVE_ICONV
+ if (icdsc == (iconv_t)(-1)) {
+ mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+ res);
+ res = NULL; // result discarded; the copy itself is not freed here and stays attached to talloc_ctx
+ } else {
+ iconv_close(icdsc); // the descriptor was only needed for the support check
+ }
+ }
+ uchardet_delete(det);
+ return res; // NULL, or a string allocated under talloc_ctx
+}
+#endif
+
// Runs charset auto-detection on the input buffer, and returns the result.
// If auto-detection fails, NULL is returned.
// If user_cp doesn't refer to any known auto-detection (for example because
// it's a real iconv codepage), user_cp is returned without even looking at
// the buf data.
-const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
- int flags)
+// The return value may (but doesn't have to) be allocated under talloc_ctx.
+const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
+ const char *user_cp, int flags)
{
if (!mp_charset_requires_guess(user_cp))
return user_cp;
bool use_auto = strcasecmp(user_cp, "auto") == 0;
if (use_auto) {
-#if HAVE_ENCA
+#if HAVE_UCHARDET
+ user_cp = "uchardet";
+#elif HAVE_ENCA
user_cp = "enca";
#else
user_cp = "UTF-8:UTF-8-BROKEN";
#endif
}
- // Do our own UTF-8 detection, because at least ENCA seems to get it
- // wrong sometimes (suggested by divVerent).
- int r = bstr_validate_utf8(buf);
- if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
- return "UTF-8";
-
bstr params[3] = {{0}};
split_colon(user_cp, 3, params);
@@ -195,9 +231,17 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
if (bstrcasecmp0(type, "guess") == 0)
res = libguess_guess(log, buf, lang);
#endif
+#if HAVE_UCHARDET
+ if (bstrcasecmp0(type, "uchardet") == 0)
+ res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
if (!fallback)
fallback = params[1].start; // must be already 0-terminated
+ int r = bstr_validate_utf8(buf);
+ if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
+ res = "utf-8";
}
if (res) {
@@ -211,6 +255,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
if (!res && !(flags & MP_STRICT_UTF8))
res = "UTF-8-BROKEN";
+ mp_verbose(log, "Using charset '%s'.\n", res);
return res;
}
@@ -225,8 +270,11 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf,
const char *user_cp, int flags)
{
- return mp_iconv_to_utf8(log, buf, mp_charset_guess(log, buf, user_cp, flags),
- flags);
+ void *tmp = talloc_new(NULL); // temporary parent for anything mp_charset_guess() allocates
+ const char *cp = mp_charset_guess(tmp, log, buf, user_cp, flags); // cp may (but need not) be allocated under tmp
+ bstr res = mp_iconv_to_utf8(log, buf, cp, flags); // convert while tmp, and therefore cp, is still alive
+ talloc_free(tmp); // cp is not used past this point
+ return res;
}
// Use iconv to convert buf to UTF-8.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 93bd91c..bd76ae0 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -14,8 +14,8 @@ enum {
bool mp_charset_is_utf8(const char *user_cp);
bool mp_charset_requires_guess(const char *user_cp);
-const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
- int flags);
+const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
+ const char *user_cp, int flags);
bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf,
const char *user_cp, int flags);
bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags);