summaryrefslogtreecommitdiff
path: root/misc
diff options
context:
space:
mode:
author Alessandro Ghedini <alessandro@ghedini.me> 2014-08-12 11:49:18 +0200
committer Alessandro Ghedini <alessandro@ghedini.me> 2014-08-12 11:49:18 +0200
commit 82426933e01d05934635d2e50b910031515c2b40 (patch)
tree a0aa4a294a1478761606898aab76ad11053a1025 /misc
parent 5a7efa38ba0f5a971e375c01a7f4c5a572bbfe83 (diff)
Imported Upstream version 0.5.0
Diffstat (limited to 'misc')
-rw-r--r-- misc/charset_conv.c 35
-rw-r--r-- misc/ctype.h 19
-rw-r--r-- misc/rendezvous.c 54
-rw-r--r-- misc/rendezvous.h 8
4 files changed, 112 insertions, 4 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 2146f09..31f53cc 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -21,6 +21,7 @@
#include <stdlib.h>
#include <errno.h>
+#include <strings.h>
#include <assert.h>
#include "config.h"
@@ -80,11 +81,24 @@ bool mp_charset_requires_guess(const char *user_cp)
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
// by default, plus a codepage that is used if the input is not UTF-8.
return bstrcasecmp0(res[0], "enca") == 0 ||
+ bstrcasecmp0(res[0], "auto") == 0 ||
bstrcasecmp0(res[0], "guess") == 0 ||
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
(r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
}
+static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"}; // BOM byte prefixes; index-aligned with utf_enc
+static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"}; // encoding name returned when utf_bom[n] matches
+
+static const char *ms_bom_guess(bstr buf) // returns utf_enc[n] for the first utf_bom[n] that matches buf, else NULL
+{
+ for (int n = 0; n < 3; n++) {
+ if (bstr_startswith0(buf, utf_bom[n])) // presumably a prefix test of buf against the C string -- confirm in bstr
+ return utf_enc[n];
+ }
+ return NULL; // none of the three table entries matched
+}
+
#if HAVE_ENCA
static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
{
@@ -102,8 +116,7 @@ static const char *enca_guess(struct mp_log *log, bstr buf, const char *language
detected_cp = tmp;
enca_analyser_free(analyser);
} else {
- mp_err(log, "ENCA doesn't know language '%s'\n",
- language);
+ mp_err(log, "ENCA doesn't know language '%s'\n", language);
size_t langcnt;
const char **languages = enca_get_languages(&langcnt);
mp_err(log, "ENCA supported languages:");
@@ -143,6 +156,15 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
if (!mp_charset_requires_guess(user_cp))
return user_cp;
+ bool use_auto = strcasecmp(user_cp, "auto") == 0;
+ if (use_auto) {
+#if HAVE_ENCA
+ user_cp = "enca";
+#else
+ user_cp = "UTF-8:UTF-8-BROKEN";
+#endif
+ }
+
// Do our own UTF-8 detection, because at least ENCA seems to get it
// wrong sometimes (suggested by divVerent).
int r = bstr_validate_utf8(buf);
@@ -159,6 +181,12 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
const char *res = NULL;
+ if (use_auto) {
+ res = ms_bom_guess(buf);
+ if (res)
+ type = bstr0("auto");
+ }
+
#if HAVE_ENCA
if (bstrcasecmp0(type, "enca") == 0)
res = enca_guess(log, buf, lang);
@@ -173,8 +201,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
}
if (res) {
- mp_dbg(log, "%.*s detected charset: '%s'\n",
- BSTR_P(type), res);
+ mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res);
} else {
res = fallback;
mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
diff --git a/misc/ctype.h b/misc/ctype.h
new file mode 100644
index 0000000..cbff799
--- /dev/null
+++ b/misc/ctype.h
@@ -0,0 +1,19 @@
+#ifndef MP_CTYPE_H_
+#define MP_CTYPE_H_
+
+// Roughly follows C semantics, but doesn't account for EOF, allows char as
+// parameter, and is locale independent (always uses "C" locale).
+
+static inline int mp_isprint(char c) { return (unsigned char)c >= 32; } // true for every byte >= 32, so 127 and 128..255 count too
+static inline int mp_isspace(char c) { return c == ' ' || c == '\f' || c == '\n' ||
+ c == '\r' || c == '\t' || c =='\v'; } // space, form feed, newline, carriage return, tab, vertical tab
+static inline int mp_isupper(char c) { return c >= 'A' && c <= 'Z'; } // only 'A'..'Z'
+static inline int mp_islower(char c) { return c >= 'a' && c <= 'z'; } // only 'a'..'z'
+static inline int mp_isdigit(char c) { return c >= '0' && c <= '9'; } // only '0'..'9'
+static inline int mp_isalpha(char c) { return mp_isupper(c) || mp_islower(c); } // letter in either case
+static inline int mp_isalnum(char c) { return mp_isalpha(c) || mp_isdigit(c); } // letter or decimal digit
+
+static inline char mp_tolower(char c) { return mp_isupper(c) ? c - 'A' + 'a' : c; } // 'A'..'Z' -> 'a'..'z'; anything else is returned unchanged
+static inline char mp_toupper(char c) { return mp_islower(c) ? c - 'a' + 'A' : c; } // 'a'..'z' -> 'A'..'Z'; anything else is returned unchanged
+
+#endif
diff --git a/misc/rendezvous.c b/misc/rendezvous.c
new file mode 100644
index 0000000..9af798d
--- /dev/null
+++ b/misc/rendezvous.c
@@ -0,0 +1,54 @@
+#include <pthread.h>
+
+#include "rendezvous.h"
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; // held by mp_rendezvous() around every access to 'waiters'
+static pthread_cond_t wakeup = PTHREAD_COND_INITIALIZER; // broadcast when a pair has met; the blocked caller waits on it
+
+static struct waiter *waiters; // head of the singly linked list of currently blocked callers
+
+struct waiter {
+ void *tag; // key compared against the tag passed by a later mp_rendezvous() caller
+ struct waiter *next; // next list entry, or NULL at the tail
+ intptr_t *value; // points at the blocked caller's value; set to NULL once the exchange is done
+};
+
+/* A barrier for 2 threads, which can exchange a value when they meet.
+ * The first thread to call this function will block. As soon as two threads
+ * are calling this function with the same tag value, they will unblock, and
+ * on each thread the call returns the value parameter of the _other_ thread.
+ *
+ * tag is an arbitrary value, but it must be a unique pointer. If there are
+ * more than 2 threads using the same tag, things won't work. Typically, it
+ * will have to point to a memory allocation or to the stack, while pointing
+ * it to static data is always a bug.
+ *
+ * This shouldn't be used for performance critical code (uses a linked list
+ * of _all_ waiters in the process, and temporarily wakes up _all_ waiters on
+ * each second call).
+ *
+ * This is inspired by: http://9atom.org/magic/man2html/2/rendezvous */
+intptr_t mp_rendezvous(void *tag, intptr_t value)
+{
+ struct waiter wait = { .tag = tag, .value = &value }; // list entry on this call's stack; .next starts out NULL
+ pthread_mutex_lock(&lock);
+ struct waiter **prev = &waiters; // address of the link that points at the entry being examined
+ while (*prev) {
+ if ((*prev)->tag == tag) { // a partner is already blocked on this tag
+ intptr_t tmp = *(*prev)->value; // partner's value, which this call will return
+ *(*prev)->value = value; // hand our value to the partner
+ value = tmp;
+ (*prev)->value = NULL; // signals completion
+ *prev = (*prev)->next; // unlink
+ pthread_cond_broadcast(&wakeup);
+ goto done;
+ }
+ prev = &(*prev)->next;
+ }
+ *prev = &wait; // no partner yet: append our entry at the tail
+ while (wait.value) // re-check after every wakeup; only our partner clears wait.value
+ pthread_cond_wait(&wakeup, &lock);
+done:
+ pthread_mutex_unlock(&lock);
+ return value;
+}
diff --git a/misc/rendezvous.h b/misc/rendezvous.h
new file mode 100644
index 0000000..ffcc89a
--- /dev/null
+++ b/misc/rendezvous.h
@@ -0,0 +1,8 @@
+#ifndef MP_RENDEZVOUS_H_
+#define MP_RENDEZVOUS_H_
+
+#include <stdint.h>
+
+intptr_t mp_rendezvous(void *tag, intptr_t value); // blocks until another thread calls with the same tag; returns that thread's value
+
+#endif