;;; esxml-query.el --- select esxml nodes jQuery-style ;; Copyright (C) 2017 Vasilij Schneidermann ;; Author: Vasilij Schneidermann ;; Maintainer: Vasilij Schneidermann ;; Version: 0.0.1 ;; Keywords: data, lisp ;; Package-Requires: ((cl-lib "0.1")) ;; ;; This program is free software; you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of the ;; License, or (at your option) any later version. ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with this program. If not, see . ;;; Commentary: ;; Traditionally people pick one of the following options when faced ;; with the task of extracting data from XML in Emacs Lisp: ;; ;; - Using regular expressions on the unparsed document ;; - Manual tree traversal with `assoc', `car' and `cdr' ;; ;; Browsers faced a similar problem until jQuery happened, shortly ;; afterwards they started providing the `node.querySelector' and ;; `node.querySelectorAll' API for retrieving one or all nodes ;; matching a given CSS selector. This code implements the same API ;; with the `esxml-query' and `esxml-query-all' functions. The ;; following table summarizes the currently supported modifiers and ;; combinators: ;; ;; | Name | Supported? | Syntax | ;; |------------------------------------+------------+-------------| ;; | Namespaces | No | foo|bar | ;; | Commas | Yes | foo, bar | ;; | Descendant combinator | Yes | foo bar | ;; | Child combinator | Yes | foo>bar | ;; | Adjacent sibling combinator | No | foo+bar | ;; | General sibling combinator | No | foo~bar | ;; | Universal selector | Yes | * | ;; | Type selector | Yes | tag | ;; | ID selector | Yes | #foo | ;; | Class selector | Yes | .foo | ;; | Attribute selector | Yes | [foo] | ;; | Exact match attribute selector | Yes | [foo=bar] | ;; | Prefix match attribute selector | Yes | [foo^=bar] | ;; | Suffix match attribute selector | Yes | [foo$=bar] | ;; | Substring match attribute selector | Yes | [foo*=bar] | ;; | Include match attribute selector | Yes | [foo~=bar] | ;; | Dash match attribute selector | Yes | [foo|=bar] | ;; | Attribute selector modifiers | No | [foo=bar i] | ;; | Pseudo elements | No | ::foo | ;; | Pseudo classes | No | :foo | ;;; Code: (require 'cl-lib) ;;; CSS selector parsing ;; https://www.w3.org/TR/selectors/#w3cselgrammar ;; https://www.w3.org/TR/selectors4/#grammar ;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#detailed-grammar ;; you might be wondering why I'm using both level 3 and 4 standards, ;; well, the level 3 one has a buggy lexer section whereas level 4 ;; omits crucial parser definitions, so both have to be used... ;; TODO: support :not (defvar esxml--css-selector-token-matchers (let* ((h "[0-9a-f]") (nl "\n\\|\r\n\\|\r\\|\f") (nonascii "[\200-\U0010ffff]") (unicode (format "\\%s\\{1,6\\}[ \t\r\n\f]?" h)) (escape (format "\\(?:%s\\)\\|\\[ -~\200-\U0010ffff]" unicode)) (nmstart (format "[a-z]\\|%s\\|\\(?:%s\\)" nonascii escape)) (nmchar (format "[a-z0-9-]\\|%s\\|\\(?:%s\\)" nonascii escape)) (num "[0-9]+\\|[0-9]*\\.[0-9]+") (string1 (format "\"\\(?:[\t !#$%%&(-~]\\|\\(?:%s\\)\\|\'\\|%s\\|\\(?:%s\\)\\)*\"" nl nonascii escape)) (string2 (format "'\\(?:[\t !#$%%&(-~]\\|\\(?:%s\\)\\|\"\\|%s\\|\\(?:%s\\)\\)*'" nl nonascii escape)) (ident (format "[-]?\\(?:%s\\)\\(?:%s\\)*" nmstart nmchar)) (unit (format "[-]?\\(?:%s\\)\\(?:%s\\)+" nmstart nmchar)) (name (format "\\(?:%s\\)+" nmchar))) `((whitespace . "[ \t\r\n\f]+") (string . ,(format "\\(?:%s\\|%s\\)" string1 string2)) (ident . ,ident) (hash . ,(format "#%s" name)) (function . ,(format "%s(" ident)) (number . ,num) (dimension . ,(format "\\(?:%s\\)%s" num unit)) (prefix-match . "\\^=") (suffix-match . "\\$=") (substring-match . "\\*=") (include-match . "~=") (dash-match . "|=") (comma . ",") (gt . ">") (plus . "\\+") (minus . "-") (tilde . "~") (asterisk . "\\*") (period . "\\.") (equals . "=") (colon . ":") (lbracket . "\\[") (rbracket . "\\]") (rparen . ")")))) (defun esxml--tokenize-css-selector (string) (let (result) (with-temp-buffer (insert string) (goto-char (point-min)) (while (not (eobp)) (let ((max-length 0) longest) (dolist (matcher esxml--css-selector-token-matchers) (let ((id (car matcher)) (re (cdr matcher))) (when (looking-at re) (let* ((token (match-string 0)) (length (length token))) (when (> length max-length) (setq max-length length) (setq longest (cons id token))))))) (when (not longest) (error "Invalid token detected: %s" (buffer-substring (point) (point-max)))) (push longest result) (goto-char (+ (point) max-length))))) (nreverse result))) ;; the alternative is creating a mutable object with peek/next methods ;; and passing it around, so I chose the one requiring less typing, a ;; dynamically bound variable :< (defvar esxml--token-stream) ;; TODO: support :not ;; css-selector: ;; css-selector-list; ;; css-selector-list: ;; complex-css-selector [ comma whitespace* complex-css-selector ]*; ;; complex-css-selector: ;; compound-css-selector [ css-combinator compound-css-selector ]* whitespace*; ;; css-combinator: ;; whitespace+ | whitespace* [ '>' | '+' | '~' ] whitespace*; ;; compound-css-selector: ;; css-type-selector css-modifier* | css-modifier+; ;; css-type-selector: ;; IDENT | *; ;; css-modifier: ;; css-id | css-class | css-attrib | css-pseudo; ;; css-id: ;; HASH; ;; css-class: ;; '.' IDENT; ;; css-attrib: ;; '[' whitespace* css-attrib-name ']' ;; | '[' whitespace* css-attrib-name css-attrib-match css-attrib-value whitespace* ']'; ;; css-attrib-name: ;; IDENT whitespace*; ;; css-attrib-match: ;; [ '=' | PREFIX-MATCH | SUFFIX-MATCH | SUBSTRING-MATCH | INCLUDE-MATCH | DASH-MATCH ] whitespace*; ;; css-attrib-value: ;; IDENT | STRING; ;; css-pseudo: ;; ':' ':'? [ IDENT | css-functional-pseudo ]; ;; css-functional-pseudo: ;; FUNCTION whitespace* [ css-expression whitespace* ]+ ')'; ;; css-expression: ;; '+' | '-' | DIMENSION | NUMBER | STRING | IDENT (defmacro esxml--with-parse-shorthands (&rest body) `(cl-macrolet ((peek () '(car esxml--token-stream)) (next () '(pop esxml--token-stream)) (accept (type) `(and (peek) (eq (car (peek)) ,type) (cdr (next)))) (eat-whitespace () '(while (accept 'whitespace)))) ,@body)) (def-edebug-spec esxml--with-parse-shorthands (body)) (defun esxml-parse-css-selector (string) "Parse CSS selector STRING into a list of alists. Each alist represents a complex CSS selector. The result can be passed to `esxml-query' and `esxml-query-all' as the selector argument." (let* ((esxml--token-stream (esxml--tokenize-css-selector string)) (result (esxml--parse-css-selector-list))) (when esxml--token-stream (error "Trailing garbage: %s" (mapconcat 'cdr esxml--token-stream ""))) result)) (defun esxml--parse-css-selector-list () (esxml--with-parse-shorthands (let ((first (esxml--parse-complex-css-selector)) result) (when (not first) (error "Expected at least one selector")) (push first result) (while (accept 'comma) (eat-whitespace) (let ((selector (esxml--parse-complex-css-selector))) (when (not selector) (error "Expected selector after comma")) (push selector result))) (nreverse result)))) (defun esxml--parse-complex-css-selector () (esxml--with-parse-shorthands (let ((first (esxml--parse-compound-css-selector)) result done) (when first (push first result) (while (not done) (let ((combinator (esxml--parse-css-combinator))) (if combinator (let ((compound (esxml--parse-compound-css-selector))) (cond (compound (setq result (append (list compound combinator) result))) ;; allow whitespace before comma ((not (eq (car (peek)) 'comma)) (error "Trailing combinator")))) (setq done t)))) (nreverse result))))) (defun esxml--parse-css-combinator () (esxml--with-parse-shorthands ;; NOTE: whitespace-surrounded combinators are distinguished from ;; whitespace-only ones by checking whether there has been ;; whitespace followed by a non-blank combinator (let ((leading-whitespace-p (eq (car (peek)) 'whitespace)) result) (eat-whitespace) (let ((type (car (peek)))) (cond ((member type '(gt plus tilde)) (next) (cond ((eq type 'gt) (setq result '((combinator . child)))) ((eq type 'plus) (setq result '((combinator . direct-sibling)))) ((eq type 'tilde) (setq result '((combinator . indirect-sibling))))) (eat-whitespace)) (leading-whitespace-p (setq result '((combinator . descendant)))) (t nil))) result))) (defun esxml--parse-compound-css-selector () (esxml--with-parse-shorthands (let ((type-selector (esxml--parse-css-type-selector)) done result) ;; NOTE: css-type-selector css-modifier* | css-modifier+; is ;; equivalent to: [ css-type-selector | css-modifier ] css-modifier*; (if type-selector (push type-selector result) (let ((modifier (esxml--parse-css-modifier))) (if modifier (push modifier result) ;; NOTE: this allows the trailing combinator error to be thrown (setq done t)))) (while (not done) (let ((modifier (esxml--parse-css-modifier))) (if modifier (push modifier result) (setq done t)))) (when (> (cl-count 'id result :key 'car) 1) (error "Only one id selector allowed per compound")) (nreverse result)))) (defun esxml--parse-css-type-selector () (esxml--with-parse-shorthands (let ((token (peek))) (cond ((eq (car token) 'ident) (next) (cons 'tag (intern (cdr token)))) ((eq (car token) 'asterisk) (next) '(wildcard)) (t nil))))) (defun esxml--parse-css-modifier () (or (esxml--parse-css-id) (esxml--parse-css-class) (esxml--parse-css-attrib) (esxml--parse-css-pseudo))) (defun esxml--parse-css-id () (esxml--with-parse-shorthands (let ((value (accept 'hash))) (when value (cons 'id (substring value 1)))))) (defun esxml--parse-css-class () (esxml--with-parse-shorthands (when (accept 'period) (let ((value (accept 'ident))) (if value (cons 'class value) (error "Expected identifier after period")))))) (defun esxml--parse-css-attrib () (esxml--with-parse-shorthands (let (result) (when (accept 'lbracket) (eat-whitespace) (let ((name (esxml--parse-css-attrib-name))) (when (not name) (error "Expected attribute name")) (push (cons 'name name) result) (when (not (accept 'rbracket)) (let ((match (esxml--parse-css-attrib-match))) (when (not match) (error "Expected attribute matcher")) (let ((value (esxml--parse-css-attrib-value))) (when (not value) (error "Expected attribute value")) (eat-whitespace) (when (not (accept 'rbracket)) (error "Unterminated attribute")) (push (cons match value) result))))) (cons 'attribute (nreverse result)))))) (defun esxml--parse-css-attrib-name () (esxml--with-parse-shorthands (let ((name (accept 'ident))) (when name (eat-whitespace) name)))) (defun esxml--parse-css-attrib-match () (esxml--with-parse-shorthands (let (result) (cond ((accept 'equals) (setq result 'exact-match)) ((accept 'prefix-match) (setq result 'prefix-match)) ((accept 'suffix-match) (setq result 'suffix-match)) ((accept 'substring-match) (setq result 'substring-match)) ((accept 'include-match) (setq result 'include-match)) ((accept 'dash-match) (setq result 'dash-match))) (eat-whitespace) result))) (defun esxml--parse-css-attrib-value () (esxml--with-parse-shorthands (let ((token (peek))) (cond ((eq (car token) 'ident) (next) (cdr token)) ((eq (car token) 'string) (next) (substring (cdr token) 1 -1)) (t nil))))) (defun esxml--parse-css-pseudo () (esxml--with-parse-shorthands (let (result type) (when (accept 'colon) (if (accept 'colon) (setq type 'pseudo-element) (setq type 'pseudo-class)) (let ((functional (esxml--parse-css-functional-pseudo))) (if functional (if (eq type 'pseudo-class) (let ((value (car functional)) (args (cdr functional))) (push (cons 'name value) result) (push (cons 'args args) result)) (error "Pseudo-elements may not have arguments")) (let ((value (accept 'ident))) (if value (push (cons 'name value) result) (error "Expected function or identifier"))))) (cons type (nreverse result)))))) (defun esxml--parse-css-functional-pseudo () (esxml--with-parse-shorthands (let ((function (accept 'function)) result) (when function (push (substring function 0 -1) result) (eat-whitespace) (let ((expression (esxml--parse-css-expression)) done) (eat-whitespace) (when (not expression) (error "Expected at least one expression for function")) (push expression result) (while (not done) (setq expression (esxml--parse-css-expression)) (if expression (progn (push expression result) (eat-whitespace)) (setq done t)))) (when (not (accept 'rparen)) (error "Unterminated function argument list")) (nreverse result))))) (defun esxml--parse-css-expression () (esxml--with-parse-shorthands (let ((token (peek))) (cond ((accept 'plus) '(operator . +)) ((accept 'minus) '(operator . -)) ((eq (car token) 'dimension) (next) (cons 'dimension (cdr token))) ((eq (car token) 'number) (next) (cons 'number (string-to-number (cdr token)))) ((eq (car token) 'string) (next) (cons 'string (substring (cdr token) 1 -1))) ((eq (car token) 'ident) (next) (cons 'ident (cdr token))) (t nil))))) ;;; tree traversal ;; TODO: these helpers should be part of esxml.el (defun esxml-branch-p (node) "Non-nil if NODE refers to an esxml branch." (and (listp node) (>= (length node) 2) (symbolp (car node)) (listp (cadr node)))) (defun esxml-node-tag (node) "Returns the tag of NODE if available." (and (esxml-branch-p node) (car node))) (defun esxml-node-attributes (node) "Returns the attributes of NODE if available." (and (esxml-branch-p node) (cadr node))) (defun esxml-node-attribute (attribute node) "Returns the attribute ATTRIBUTE of NODE if available." (and (esxml-branch-p node) (cdr (assq attribute (cadr node))))) (defun esxml-node-children (node) "Returns the children of NODE if available." (and (esxml-branch-p node) (nthcdr 2 node))) (defun esxml-find-node (pred root) "Locates a node satisfying PRED starting from ROOT. Returns the node or nil if none found." (if (funcall pred root) root (cl-some (lambda (node) (esxml-find-node pred node)) (esxml-node-children root)))) (defun esxml-visit-nodes (function root) "Visit nodes by calling FUNCTION on each starting from ROOT." (funcall function root) (mapc (lambda (node) (esxml-visit-nodes function node)) (esxml-node-children root))) (defun esxml-find-nodes (pred root) "Locates all nodes satisfying PRED starting from ROOT. Returns a list of the nodes or nil if none found." (let ((acc '())) (esxml-visit-nodes (lambda (node) (when (funcall pred node) (push node acc))) root) (nreverse acc))) (defun esxml-find-descendant (pred root) "Locates a node satisfying PRED starting from ROOT's children. Returns the node or nil if none found." (cl-some (lambda (node) (esxml-find-node pred node)) (esxml-node-children root))) (defun esxml-find-descendants (pred root) "Locates all nodes satisfying PRED starting from ROOT's children. Returns a list of the nodes or nil if none found." (cl-mapcan (lambda (node) (esxml-find-nodes pred node)) (esxml-node-children root))) (defun esxml-find-child (pred root) "Locates a node satisfying PRED among ROOT's children. Returns the node or nil if none found." (cl-some (lambda (node) (when (funcall pred node) node)) (esxml-node-children root))) (defun esxml-find-children (pred root) "Locates all nodes satisfying PRED among ROOT's children. Returns a list of the nodes or nil if none found." (mapcar (lambda (node) (when (funcall pred node) node)) (esxml-node-children root))) (defun esxml--node-with-children (node children) (let ((tag (esxml-node-tag node)) (attributes (esxml-node-attributes node))) (append (list tag attributes) children))) (defun esxml--node-with-attributes (node attributes) (let ((tag (esxml-node-tag node)) (children (esxml-node-children node))) (append (list tag attributes) children))) (defun esxml-tree-map (function root) "Returns a copy of ROOT with FUNCTION applied to each node." (if (esxml-branch-p root) (esxml--node-with-children (funcall function root) (mapcar (lambda (node) (esxml-tree-map function node)) (esxml-node-children root))) (funcall function root))) (defvar esxml--symbol (make-symbol "id")) (defun esxml--decorate-tree (root) (let ((i 0)) (esxml-tree-map (lambda (node) (let ((attribute (cons esxml--symbol i)) (attributes (esxml-node-attributes node))) (setq attributes (append (list attribute) attributes)) (setq i (1+ i)) (if (esxml-branch-p node) (esxml--node-with-attributes node attributes) node))) root))) (defun esxml--undecorate-node (node) (if (esxml-branch-p node) (let ((attributes (esxml-node-attributes node))) (esxml--node-with-attributes node (assq-delete-all esxml--symbol attributes))) node)) (defun esxml--retrieve-decoration (node) (esxml-node-attribute esxml--symbol node)) ;;; querying ;; NOTE: supporting structural pseudo functions, direct siblings and ;; indirect siblings requires breadth instead of depth traversal, ;; something that could be emulated without zippers if you had the ;; parent of the node (and the position of the child)... (defun esxml--node-matches-attribute-p (node modifier) (let ((attributes (esxml-node-attributes node)) haystack) (cl-every (lambda (item) (let ((type (car item)) (value (cdr item))) (cond ((eq type 'name) (let ((match (assq (intern value) attributes))) (setq haystack (cdr match)) match)) ((eq type 'exact-match) (equal haystack value)) ((eq type 'prefix-match) (string-prefix-p value haystack)) ((eq type 'suffix-match) (string-suffix-p value haystack)) ((eq type 'substring-match) (string-match-p (regexp-quote value) haystack)) ((eq type 'include-match) (member value (split-string haystack " "))) ((eq type 'dash-match) (or (equal value haystack) (string-match-p (format "^%s-" (regexp-quote value)) haystack))) (t (error "Unknown attribute modifier"))))) modifier))) (defun esxml--node-matches-modifier-p (node type value) (cond ((eq type 'wildcard) t) ((eq type 'tag) (equal (esxml-node-tag node) value)) ((eq type 'id) (equal (esxml-node-attribute 'id node) value)) ((eq type 'class) (let ((class (esxml-node-attribute 'class node))) (and class (member value (split-string class " "))))) ((eq type 'attribute) (esxml--node-matches-attribute-p node value)) ;; TODO: support structural pseudo functions ;; TODO: error out on invalid pseudo-class arguments (t (error "Unimplemented attribute type: %s" type)))) (defun esxml--find-node-for (attributes) (lambda (node) (cl-every (lambda (attribute) (let ((type (car attribute)) (value (cdr attribute))) (esxml--node-matches-modifier-p node type value))) attributes))) (defun esxml--find-nodes (root combinator attributes) (let* ((type (cdr (assq 'combinator combinator))) (walker (cond ((not type) 'esxml-find-nodes) ((eq type 'descendant) 'esxml-find-descendants) ((eq type 'child) 'esxml-find-children) ;; TODO: support direct sibling ;; TODO: support indirect sibling (t (error "Unimplemented combinator %s" combinator))))) (funcall walker (esxml--find-node-for attributes) root))) (defun esxml--query (selector root) (let* ((attributes (pop selector)) combinator (result (esxml--find-nodes root nil attributes))) (while (and result selector) (setq combinator (pop selector)) (setq attributes (pop selector)) (setq result (cl-mapcan (lambda (node) (esxml--find-nodes node combinator attributes)) result)) (setq result (delq nil result))) result)) (defun esxml--delete-dups (items test) (let ((seen (make-hash-table :test test)) result) (while items (let ((item (pop items))) (when (not (gethash item seen)) (push item result) (puthash item t seen)))) (nreverse result))) (defun esxml-query-all (selector root) "Locates all nodes satisfying SELECTOR starting from ROOT. SELECTOR must be a string containing a CSS selector or a parsed CSS selector returned by `esxml-parse-css-selector'. Returns a list of the nodes or nil if none found." (when (stringp selector) (setq selector (esxml-parse-css-selector selector))) (if (= (length selector) 1) ;; no commas, we can only get the same nodes repeatedly (esxml--delete-dups (esxml--query (car selector) root) 'eq) ;; commas, nodes might be the same *and* in the wrong order (setq root (esxml--decorate-tree root)) (let (result) (while selector (setq result (append result (esxml--query (pop selector) root)))) (setq result (cl-sort result '< :key 'esxml--retrieve-decoration)) (setq result (cl-delete-duplicates result :test '= :key 'esxml--retrieve-decoration)) (mapcar (lambda (node) (esxml--undecorate-node node)) result)))) (defun esxml-query (selector root) "Locates a node satisfying SELECTOR starting from ROOT. SELECTOR must be a string containing a CSS selector or a parsed CSS selector returned by `esxml-parse-css-selector'. Returns the node or nil if none found." ;; NOTE: you can do a bit less work (the savings decrease the more ;; branches the query discards), but it's simpler and safer to just ;; have the same algorithm for both entry points (car (esxml-query-all selector root))) (provide 'esxml-query) ;;; esxml-query.el ends here