/* SPDX-License-Identifier: LGPL-2.1+ */ #include #include #include #include "macro.h" #include "string-util.h" #include "xml.h" enum { STATE_NULL, STATE_TEXT, STATE_TAG, STATE_ATTRIBUTE, }; static void inc_lines(unsigned *line, const char *s, size_t n) { const char *p = s; if (!line) return; for (;;) { const char *f; f = memchr(p, '\n', n); if (!f) return; n -= (f - p) + 1; p = f + 1; (*line)++; } } /* We don't actually do real XML here. We only read a simplistic * subset, that is a bit less strict that XML and lacks all the more * complex features, like entities, or namespaces. However, we do * support some HTML5-like simplifications */ int xml_tokenize(const char **p, char **name, void **state, unsigned *line) { const char *c, *e, *b; char *ret; int t; assert(p); assert(*p); assert(name); assert(state); t = PTR_TO_INT(*state); c = *p; if (t == STATE_NULL) { if (line) *line = 1; t = STATE_TEXT; } for (;;) { if (*c == 0) return XML_END; switch (t) { case STATE_TEXT: { int x; e = strchrnul(c, '<'); if (e > c) { /* More text... */ ret = strndup(c, e - c); if (!ret) return -ENOMEM; inc_lines(line, c, e - c); *name = ret; *p = e; *state = INT_TO_PTR(STATE_TEXT); return XML_TEXT; } assert(*e == '<'); b = c + 1; if (startswith(b, "!--")) { /* A comment */ e = strstr(b + 3, "-->"); if (!e) return -EINVAL; inc_lines(line, b, e + 3 - b); c = e + 3; continue; } if (*b == '?') { /* Processing instruction */ e = strstr(b + 1, "?>"); if (!e) return -EINVAL; inc_lines(line, b, e + 2 - b); c = e + 2; continue; } if (*b == '!') { /* DTD */ e = strchr(b + 1, '>'); if (!e) return -EINVAL; inc_lines(line, b, e + 1 - b); c = e + 1; continue; } if (*b == '/') { /* A closing tag */ x = XML_TAG_CLOSE; b++; } else x = XML_TAG_OPEN; e = strpbrk(b, WHITESPACE "/>"); if (!e) return -EINVAL; ret = strndup(b, e - b); if (!ret) return -ENOMEM; *name = ret; *p = e; *state = INT_TO_PTR(STATE_TAG); return x; } case STATE_TAG: b = c + strspn(c, WHITESPACE); if (*b == 0) return -EINVAL; inc_lines(line, c, b - c); e = b + strcspn(b, WHITESPACE "=/>"); if (e > b) { /* An attribute */ ret = strndup(b, e - b); if (!ret) return -ENOMEM; *name = ret; *p = e; *state = INT_TO_PTR(STATE_ATTRIBUTE); return XML_ATTRIBUTE_NAME; } if (startswith(b, "/>")) { /* An empty tag */ *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */ *p = b + 2; *state = INT_TO_PTR(STATE_TEXT); return XML_TAG_CLOSE_EMPTY; } if (*b != '>') return -EINVAL; c = b + 1; t = STATE_TEXT; continue; case STATE_ATTRIBUTE: if (*c == '=') { c++; if (IN_SET(*c, '\'', '\"')) { /* Tag with a quoted value */ e = strchr(c+1, *c); if (!e) return -EINVAL; inc_lines(line, c, e - c); ret = strndup(c+1, e - c - 1); if (!ret) return -ENOMEM; *name = ret; *p = e + 1; *state = INT_TO_PTR(STATE_TAG); return XML_ATTRIBUTE_VALUE; } /* Tag with a value without quotes */ b = strpbrk(c, WHITESPACE ">"); if (!b) b = c; ret = strndup(c, b - c); if (!ret) return -ENOMEM; *name = ret; *p = b; *state = INT_TO_PTR(STATE_TAG); return XML_ATTRIBUTE_VALUE; } t = STATE_TAG; continue; } } assert_not_reached("Bad state"); }