diff options
author | Alessandro Ghedini <alessandro@ghedini.me> | 2015-09-03 21:22:56 +0200 |
---|---|---|
committer | Alessandro Ghedini <alessandro@ghedini.me> | 2015-09-03 21:22:56 +0200 |
commit | 97616326aa35c74f085c8b7b5bc4a497e049febc (patch) | |
tree | 789f334c888d3f0cdb42d202efe76622b0d2afcc /misc | |
parent | 69a0ce1df673d26271df9c1f58f14b6314538210 (diff) |
Imported Upstream version 0.10.0
Diffstat (limited to 'misc')
-rw-r--r-- | misc/charset_conv.c | 70 | ||||
-rw-r--r-- | misc/charset_conv.h | 4 |
2 files changed, 61 insertions, 13 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c index 31f53cc..bceb52a 100644 --- a/misc/charset_conv.c +++ b/misc/charset_conv.c @@ -36,6 +36,10 @@ #include <libguess.h> #endif +#if HAVE_UCHARDET +#include <uchardet.h> +#endif + #if HAVE_ICONV #include <iconv.h> #endif @@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp) // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 // by default, plus a codepage that is used if the input is not UTF-8. return bstrcasecmp0(res[0], "enca") == 0 || + bstrcasecmp0(res[0], "uchardet") == 0 || bstrcasecmp0(res[0], "auto") == 0 || bstrcasecmp0(res[0], "guess") == 0 || (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || @@ -102,6 +107,11 @@ static const char *ms_bom_guess(bstr buf) #if HAVE_ENCA static const char *enca_guess(struct mp_log *log, bstr buf, const char *language) { + // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes + // (suggested by divVerent). Explicitly allow cut-off UTF-8. + if (bstr_validate_utf8(buf) > -8) + return "UTF-8"; + if (!language || !language[0]) language = "__"; // neutral language @@ -145,32 +155,58 @@ static const char *libguess_guess(struct mp_log *log, bstr buf, } #endif +#if HAVE_UCHARDET +static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf) +{ + uchardet_t det = uchardet_new(); + if (!det) + return NULL; + if (uchardet_handle_data(det, buf.start, buf.len) != 0) { + uchardet_delete(det); + return NULL; + } + uchardet_data_end(det); + char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det)); + if (res && !res[0]) + res = NULL; + if (res) { + iconv_t icdsc = iconv_open("UTF-8", res); + if (icdsc == (iconv_t)(-1)) { + mp_warn(log, "Charset detected as %s, but not supported by iconv.\n", + res); + res = NULL; + } else { + iconv_close(icdsc); + } + } + uchardet_delete(det); + return res; +} +#endif + // Runs charset auto-detection on the input buffer, and returns the result. // If auto-detection fails, NULL is returned. // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. -const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, - int flags) +// The return value may (but doesn't have to) be allocated under talloc_ctx. +const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, + const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; bool use_auto = strcasecmp(user_cp, "auto") == 0; if (use_auto) { -#if HAVE_ENCA +#if HAVE_UCHARDET + user_cp = "uchardet"; +#elif HAVE_ENCA user_cp = "enca"; #else user_cp = "UTF-8:UTF-8-BROKEN"; #endif } - // Do our own UTF-8 detection, because at least ENCA seems to get it - // wrong sometimes (suggested by divVerent). - int r = bstr_validate_utf8(buf); - if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) - return "UTF-8"; - bstr params[3] = {{0}}; split_colon(user_cp, 3, params); @@ -195,9 +231,17 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(log, buf, lang); #endif +#if HAVE_UCHARDET + if (bstrcasecmp0(type, "uchardet") == 0) + res = mp_uchardet(talloc_ctx, log, buf); +#endif + if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { if (!fallback) fallback = params[1].start; // must be already 0-terminated + int r = bstr_validate_utf8(buf); + if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) + res = "utf-8"; } if (res) { @@ -211,6 +255,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, if (!res && !(flags & MP_STRICT_UTF8)) res = "UTF-8-BROKEN"; + mp_verbose(log, "Using charset '%s'.\n", res); return res; } @@ -225,8 +270,11 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf, const char *user_cp, int flags) { - return mp_iconv_to_utf8(log, buf, mp_charset_guess(log, buf, user_cp, flags), - flags); + void *tmp = talloc_new(NULL); + const char *cp = mp_charset_guess(tmp, log, buf, user_cp, flags); + bstr res = mp_iconv_to_utf8(log, buf, cp, flags); + talloc_free(tmp); + return res; } // Use iconv to convert buf to UTF-8. diff --git a/misc/charset_conv.h b/misc/charset_conv.h index 93bd91c..bd76ae0 100644 --- a/misc/charset_conv.h +++ b/misc/charset_conv.h @@ -14,8 +14,8 @@ enum { bool mp_charset_is_utf8(const char *user_cp); bool mp_charset_requires_guess(const char *user_cp); -const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, - int flags); +const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, + const char *user_cp, int flags); bstr mp_charset_guess_and_conv_to_utf8(struct mp_log *log, bstr buf, const char *user_cp, int flags); bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags); |