summaryrefslogtreecommitdiff
path: root/src/basic/xml.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/basic/xml.c')
-rw-r--r--src/basic/xml.c238
1 files changed, 238 insertions, 0 deletions
diff --git a/src/basic/xml.c b/src/basic/xml.c
new file mode 100644
index 000000000..cb34d870c
--- /dev/null
+++ b/src/basic/xml.c
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "macro.h"
+#include "string-util.h"
+#include "xml.h"
+
+/* Tokenizer states. xml_tokenize() keeps the current one in the caller's
+ * opaque state pointer between calls. */
+enum {
+        STATE_NULL = 0,      /* nothing parsed yet */
+        STATE_TEXT = 1,      /* outside of any tag */
+        STATE_TAG = 2,       /* inside a tag, behind its name or a previous attribute */
+        STATE_ATTRIBUTE = 3, /* directly behind an attribute name */
+};
+
+/* Adds the number of '\n' bytes found within the first n bytes of s to
+ * *line. A NULL line pointer turns the call into a no-op. */
+static void inc_lines(unsigned *line, const char *s, size_t n) {
+        const char *cursor, *nl;
+        size_t remaining;
+
+        if (line == NULL)
+                return;
+
+        cursor = s;
+        remaining = n;
+
+        while ((nl = memchr(cursor, '\n', remaining)) != NULL) {
+                /* Carry on right behind the newline that was just found */
+                remaining -= (size_t) (nl - cursor) + 1;
+                cursor = nl + 1;
+                *line += 1;
+        }
+}
+
+/* We don't actually do real XML here. We only read a simplistic
+ * subset, that is a bit less strict than XML and lacks all the more
+ * complex features, like entities, or namespaces. However, we do
+ * support some HTML5-like simplifications. */
+
+/* Pull-style tokenizer: every call scans forward from *p and returns the
+ * next token it finds.
+ *
+ * p     - in/out read cursor into a NUL-terminated buffer; when a token is
+ *         returned it is moved behind the consumed input.
+ * name  - out; set to a strndup()-allocated copy of the text, tag name,
+ *         attribute name or attribute value, or to NULL for
+ *         XML_TAG_CLOSE_EMPTY. This function never frees it.
+ * state - in/out parser state, one of the STATE_* values packed into a
+ *         pointer with INT_TO_PTR(). STATE_NULL starts a fresh parse.
+ * line  - optional line counter, may be NULL. It is reset to 1 when parsing
+ *         starts from STATE_NULL and bumped for every newline in text,
+ *         skipped comments/processing instructions/DTDs, in-tag whitespace
+ *         and quoted attribute values.
+ *
+ * Returns XML_TEXT, XML_TAG_OPEN, XML_TAG_CLOSE, XML_TAG_CLOSE_EMPTY,
+ * XML_ATTRIBUTE_NAME or XML_ATTRIBUTE_VALUE for a token, XML_END once the
+ * terminating NUL is reached, -EINVAL on malformed input and -ENOMEM if
+ * copying the token fails. On XML_END and on errors *p, *name and *state
+ * are left untouched. */
+int xml_tokenize(const char **p, char **name, void **state, unsigned *line) {
+        const char *c, *e, *b;
+        char *ret;
+        int t;
+
+        assert(p);
+        assert(*p);
+        assert(name);
+        assert(state);
+
+        /* Resume in the state the previous call left behind */
+        t = PTR_TO_INT(*state);
+        c = *p;
+
+        if (t == STATE_NULL) {
+                /* Fresh parse: line counting starts at 1 and we begin in text mode */
+                if (line)
+                        *line = 1;
+                t = STATE_TEXT;
+        }
+
+        for (;;) {
+                /* The end of the input is reported as XML_END in every state */
+                if (*c == 0)
+                        return XML_END;
+
+                switch (t) {
+
+                case STATE_TEXT: {
+                        int x;
+
+                        /* Text extends up to the next '<', or to the end of the input */
+                        e = strchrnul(c, '<');
+                        if (e > c) {
+                                /* More text... */
+                                ret = strndup(c, e - c);
+                                if (!ret)
+                                        return -ENOMEM;
+
+                                inc_lines(line, c, e - c);
+
+                                *name = ret;
+                                *p = e;
+                                *state = INT_TO_PTR(STATE_TEXT);
+
+                                return XML_TEXT;
+                        }
+
+                        /* No text is pending, hence c itself points at '<' and b right behind it */
+                        assert(*e == '<');
+                        b = c + 1;
+
+                        if (startswith(b, "!--")) {
+                                /* A comment: skipped, no token is returned for it */
+                                e = strstr(b + 3, "-->");
+                                if (!e)
+                                        return -EINVAL;
+
+                                inc_lines(line, b, e + 3 - b);
+
+                                c = e + 3;
+                                continue;
+                        }
+
+                        if (*b == '?') {
+                                /* Processing instruction: skipped up to and including "?>" */
+
+                                e = strstr(b + 1, "?>");
+                                if (!e)
+                                        return -EINVAL;
+
+                                inc_lines(line, b, e + 2 - b);
+
+                                c = e + 2;
+                                continue;
+                        }
+
+                        if (*b == '!') {
+                                /* DTD: skipped up to and including the first '>' */
+
+                                e = strchr(b + 1, '>');
+                                if (!e)
+                                        return -EINVAL;
+
+                                inc_lines(line, b, e + 1 - b);
+
+                                c = e + 1;
+                                continue;
+                        }
+
+                        if (*b == '/') {
+                                /* A closing tag */
+                                x = XML_TAG_CLOSE;
+                                b++;
+                        } else
+                                x = XML_TAG_OPEN;
+
+                        /* The tag name ends at the first WHITESPACE character, '/' or '>' */
+                        e = strpbrk(b, WHITESPACE "/>");
+                        if (!e)
+                                return -EINVAL;
+
+                        ret = strndup(b, e - b);
+                        if (!ret)
+                                return -ENOMEM;
+
+                        *name = ret;
+                        *p = e;
+                        *state = INT_TO_PTR(STATE_TAG);
+
+                        return x;
+                }
+
+                case STATE_TAG:
+
+                        /* Inside a tag: skip what separates us from the next item */
+                        b = c + strspn(c, WHITESPACE);
+                        if (*b == 0)
+                                return -EINVAL;
+
+                        inc_lines(line, c, b - c);
+
+                        e = b + strcspn(b, WHITESPACE "=/>");
+                        if (e > b) {
+                                /* An attribute */
+
+                                ret = strndup(b, e - b);
+                                if (!ret)
+                                        return -ENOMEM;
+
+                                *name = ret;
+                                *p = e;
+                                *state = INT_TO_PTR(STATE_ATTRIBUTE);
+
+                                return XML_ATTRIBUTE_NAME;
+                        }
+
+                        if (startswith(b, "/>")) {
+                                /* An empty tag */
+
+                                *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */
+                                *p = b + 2;
+                                *state = INT_TO_PTR(STATE_TEXT);
+
+                                return XML_TAG_CLOSE_EMPTY;
+                        }
+
+                        /* Only '>' may follow now; text parsing resumes behind it */
+                        if (*b != '>')
+                                return -EINVAL;
+
+                        c = b + 1;
+                        t = STATE_TEXT;
+                        continue;
+
+                case STATE_ATTRIBUTE:
+
+                        /* Directly behind an attribute name: '=' introduces its value */
+                        if (*c == '=') {
+                                c++;
+
+                                if (IN_SET(*c, '\'', '\"')) {
+                                        /* Tag with a quoted value, closed by the same quote character that opened it */
+
+                                        e = strchr(c+1, *c);
+                                        if (!e)
+                                                return -EINVAL;
+
+                                        inc_lines(line, c, e - c);
+
+                                        ret = strndup(c+1, e - c - 1);
+                                        if (!ret)
+                                                return -ENOMEM;
+
+                                        *name = ret;
+                                        *p = e + 1;
+                                        *state = INT_TO_PTR(STATE_TAG);
+
+                                        return XML_ATTRIBUTE_VALUE;
+
+                                }
+
+                                /* Tag with a value without quotes */
+
+                                /* NOTE(review): if no terminator is found the value comes back as
+                                 * an empty string and *p is left right behind the '=' */
+                                b = strpbrk(c, WHITESPACE ">");
+                                if (!b)
+                                        b = c;
+
+                                ret = strndup(c, b - c);
+                                if (!ret)
+                                        return -ENOMEM;
+
+                                *name = ret;
+                                *p = b;
+                                *state = INT_TO_PTR(STATE_TAG);
+                                return XML_ATTRIBUTE_VALUE;
+                        }
+
+                        /* No '=': an attribute without a value, carry on with the rest of the tag */
+                        t = STATE_TAG;
+                        continue;
+
+                default:
+                        /* An unknown state value matches none of the cases above. As
+                         * nothing would advance c then, the loop would spin forever
+                         * without this branch. */
+                        assert_not_reached("Bad state");
+                }
+
+        }
+
+        /* The loop above is only ever left via return */
+        assert_not_reached("Bad state");
+}