; XDOC Documentation System for ACL2 ; Copyright (C) 2009-2015 Centaur Technology ; ; Contact: ; Centaur Technology Formal Verification Group ; 7600-C N. Capital of Texas Highway, Suite 300, Austin, TX 78731, USA. ; http://www.centtech.com/ ; ; License: (An MIT/X11-style license) ; ; Permission is hereby granted, free of charge, to any person obtaining a ; copy of this software and associated documentation files (the "Software"), ; to deal in the Software without restriction, including without limitation ; the rights to use, copy, modify, merge, publish, distribute, sublicense, ; and/or sell copies of the Software, and to permit persons to whom the ; Software is furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ; DEALINGS IN THE SOFTWARE. ; ; Original author: Jared Davis (in-package "XDOC") (include-book "autolink") (set-state-ok t) (program) ; (PARSE-XML X) --> (MV ERR TOKENS) ; ; This is an extremely primitive XML lexer. It is incomplete, doesn't support ; unicode, and probably is dead wrong as far as the XML standard is concerned. ; But, it's adequate for XDOC stuff. ; ; X should be an XML-formatted string (with no preprocessor stuff). We break ; it into a list of tokens, which may be any of the following: ; ; (:OPEN STR ALIST LOC) represents ; ; - STR is a string that is the tag name, e.g., "p" for

; ; - ALIST is an alist of the attribute. Each entry is either: ; (KEY-STR . VAL-STR) for ordinary key="val" style atts ; (KEY-STR . NIL) for value-free atts like


; ; - LOC is semantically irrelevant. It's the location of this ; open token in the string, which can be useful for better ; error messages about unbalanced strings. ; ; (:CLOSE STR) represents
; ; - STR is a string that is the tag name, e.g., "p" for

; ; (:TEXT STR) represents ordinary text ; ; - STR is the contents of the text. This is ordinary text with no ; embedded tags or entities. ; ; (:ENTITY TYPE) represents entities like & ; ; - TYPE is :AMP, :LT, :GT, :APOS, :NBSP, :MDASH, :RARR, LSQUO, RSQUO, LDQUO, or RDQUO (defun opentok-p (x) (eq (first x) :OPEN)) (defun opentok-name (x) (second x)) (defun opentok-atts (x) (third x)) (defun opentok-loc (x) (fourth x)) (defun closetok-p (x) (eq (first x) :CLOSE)) (defun closetok-name (x) (second x)) (defun entitytok-p (x) (eq (first x) :ENTITY)) (defun entitytok-type (x) (second x)) (defun texttok-p (x) (eq (first x) :TEXT)) (defun texttok-text (x) (second x)) (defconst *nls* (coerce (list #\Newline) 'string)) (defun read-until (x n xl stop-chars acc) ;; Accumulate chars until one of the stop-chars is seen "Returns (MV FOUNDP N ACC)" (b* (((when (>= n xl)) (mv nil n acc)) (char-n (char x n)) ((when (member char-n stop-chars)) (mv t n acc))) (read-until x (+ 1 n) xl stop-chars (cons char-n acc)))) (defun read-name-aux (x n xl acc) ;; For attribute/tag names. We just expect alphanumeric characters. This is ;; probably horribly wrong as far as the XML spec is concerned, but should be ;; reasonable for all the tag and attribute names in xdoc. "Returns (MV N ACC)" (b* (((when (>= n xl)) (mv n acc)) (char-n (char x n)) ((when (or (and (char<= #\a char-n) (char<= char-n #\z)) (and (char<= #\Z char-n) (char<= char-n #\Z)) (and (char<= #\0 char-n) (char<= char-n #\9)) (eql char-n #\_) (eql char-n #\-) )) (read-name-aux x (+ 1 n) xl (cons char-n acc)))) (mv n acc))) (defun read-name (x n xl) ;; Try to read an attribute/tag name at the current spot. "Returns (MV ERR N NAME-STR)" (b* (((mv n rchars) (read-name-aux x n xl nil)) ((unless (consp rchars)) (mv (str::cat "Expected a name, but found unexpected character " (coerce (list (char x n)) 'string) *nls* "Nearby text: {" (error-context x n xl) "}" *nls*) n nil)) (str (str::rchars-to-string rchars))) (mv nil n str))) (defun read-attrval (x n xl) ;; Try to read "value" or 'value' at the current spot. "Returns (MV ERR N VAL-STR)" (b* ((saved-n n) (quote-char (char x n)) ((unless (or (eql quote-char #\") (eql quote-char #\'))) (mv (str::cat "Expected value to start with a quote, but found " (coerce (list (char x n)) 'string) *nls* "Nearby text: {" (error-context x n xl) "}" *nls*) n nil)) (n (+ 1 n)) ;; eat the quote-char ((mv found-endp n chars) (read-until x n xl (list quote-char) nil)) ((unless found-endp) (mv (str::cat "Attribute value never ends." *nls* "Nearby text: {" (error-context x saved-n xl) "}" *nls*) n nil)) (n (+ n 1)) ;; eat the closing quote-char (val-str (str::rchars-to-string chars))) (mv nil n val-str))) (defun read-tag-attributes (x n xl tag-start-n atts) ;; Assumes " or />, setting CLOSEP=T if we read /> ;; as in
. "Returns (MV ERR N ATTS CLOSEP)" (b* ((n (skip-past-ws x n xl)) ((when (>= n xl)) (mv (str::cat "Tag never closes." *nls* "Nearby text: {" (error-context x tag-start-n xl) "}" *nls*) n nil nil)) ((when (eql (char x n) #\>)) ;; End of tag via > (mv nil (+ 1 n) atts nil)) ((when (eql (char x n) #\/)) ;; It had better be />. (b* ((n (+ 1 n)) ((unless (and (< n xl) (eql (char x n) #\>))) (mv (str::cat "Stray / in tag." *nls* "Nearby text: {" (error-context x tag-start-n xl) "}" *nls*) n nil nil))) (mv nil (+ 1 n) atts t))) ;; We should now be at the start of some new attribute. ((mv err n key1) (read-name x n xl)) ((when err) (mv err n nil nil)) (n (skip-past-ws x n xl)) ((when (>= n xl)) (mv (str::cat "Tag never closes." *nls* "Nearby text: {" (error-context x tag-start-n xl) "}" *nls*) n nil nil)) ((unless (eql (char x n) #\=)) ;; I guess this is okay for things like "noshade" that don't have an ;; associated value. (read-tag-attributes x n xl tag-start-n (acons key1 nil atts))) (n (+ n 1)) ;; eat the = (n (skip-past-ws x n xl)) ((mv err n val1) (read-attrval x n xl)) ((when err) (mv err n nil nil))) (read-tag-attributes x n xl tag-start-n (acons key1 val1 atts)))) (defun read-close-tag (x n xl) ;; Assumes . "Returns (MV ERR N TOKEN-LIST)" (b* ((saved-n n) (n (skip-past-ws x n xl)) ((mv err n name) (read-name x n xl)) ((when err) (mv err n nil)) (n (skip-past-ws x n xl)) ((unless (eql (char x n) #\>)) (mv (str::cat "Invalid closing tag." *nls* "Nearby text: {" (error-context x saved-n xl) "}" *nls*) n nil)) (n (+ 1 n)) ;; eat the final > (close (list :CLOSE name))) (mv nil n (list close)))) (defun skip-declaration (x n xl start-n) ;; Read through ?> "Returns (MV ERR N NIL)" (b* (((when (>= (+ 1 n) xl)) (mv (str::cat " declaration never closes." *nls* "Nearby text: {" (error-context x start-n xl) "}" *nls*) n nil)) ((unless (and (eql (char x n) #\?) (eql (char x (+ n 1)) #\>))) (skip-declaration x (+ n 1) xl start-n))) (mv nil (+ n 2) nil))) (defun skip-entity-stuff (x n xl start-n) ;; Read through ]> "Returns (MV ERR N NIL)" (b* (((when (>= (+ 1 n) xl)) (mv (str::cat " declaration never closes." *nls* "Nearby text: {" (error-context x start-n xl) "}" *nls*) n nil)) ((unless (and (eql (char x n) #\]) (eql (char x (+ n 1)) #\>))) (skip-entity-stuff x (+ n 1) xl start-n))) (mv nil (+ n 2) nil))) (defun read-tag (x n xl) ;; Assumes (char x n) is "<" "Returns (MV ERR N TOKEN-LIST)" (b* ((tag-start-n n) (n (+ 1 n)) (n (skip-past-ws x n xl)) ((when (>= n xl)) (mv (str::cat "Tag never closes." *nls* "Nearby text: {" (error-context x tag-start-n xl) "}" *nls*) n nil)) ((when (eql (char x n) #\?)) (skip-declaration x n xl tag-start-n)) ((when (eql (char x n) #\!)) (skip-entity-stuff x n xl tag-start-n)) ((when (eql (char x n) #\/)) (read-close-tag x (+ 1 n) xl)) ;; Else, it's an opening tag. This should be its name. ((mv err n name) (read-name x n xl)) ((when err) (mv err n nil)) ((mv err n atts closep) (read-tag-attributes x n xl tag-start-n nil)) ((when err) (mv err n nil)) (open (list :OPEN name (reverse atts) tag-start-n)) ((when (not closep)) (mv nil n (list open))) (close (list :CLOSE name))) (mv nil n (list open close)))) (defun read-entity (x n xl) ;; Assumes (char x n) is "&" "Returns (MV ERR N TOK)" (b* ((saved-n n) (n (+ 1 n)) ;; eat the & ((mv foundp n rchars) (read-until x n xl (list #\;) nil)) ((unless foundp) (mv (str::cat "Entity does not have a closing semicolon." *nls* "Nearby text: {" (error-context x saved-n xl) "}" *nls*) n nil)) (n (+ 1 n)) ;; eat the ; (str (str::rchars-to-string rchars)) ((when (equal str "amp")) (mv nil n '(:ENTITY :AMP))) ((when (equal str "lt")) (mv nil n '(:ENTITY :LT))) ((when (equal str "gt")) (mv nil n '(:ENTITY :GT))) ((when (equal str "quot")) (mv nil n '(:ENTITY :QUOT))) ((when (equal str "apos")) (mv nil n '(:ENTITY :APOS))) ((when (equal str "nbsp")) (mv nil n '(:ENTITY :NBSP))) ((when (equal str "mdash")) (mv nil n '(:ENTITY :MDASH))) ((when (equal str "rarr")) (mv nil n '(:ENTITY :RARR))) ((when (equal str "lsquo")) (mv nil n '(:ENTITY :LSQUO))) ((when (equal str "rsquo")) (mv nil n '(:ENTITY :RSQUO))) ((when (equal str "ldquo")) (mv nil n '(:ENTITY :LDQUO))) ((when (equal str "rdquo")) (mv nil n '(:ENTITY :RDQUO))) ) (mv (str::cat "Unsupported entity: &" str ";" *nls* "Nearby text: {" (error-context x saved-n xl) "}" *nls*) n nil))) (defun parse-xml-aux (x n xl acc) "Returns (MV ERR TOKENS)" (b* (((when (>= n xl)) (mv nil acc)) (char-n (char x n)) ((when (eql char-n #\<)) ;; Open/closing tag (b* (((mv err n token-list) (read-tag x n xl)) ((when err) (mv err nil)) (acc (revappend token-list acc))) (parse-xml-aux x n xl acc))) ((when (eql char-n #\&)) ;; Entity (b* (((mv err n token) (read-entity x n xl)) ((when err) (mv err nil))) (parse-xml-aux x n xl (cons token acc)))) ((when (eql char-n #\>)) (mv (str::cat "Stray end-of-tag character '>'" *nls* "Nearby text: {" (error-context x n xl) "}" *nls*) nil)) ;; Otherwise, plain text. ((mv ?foundp n rchars) (read-until x n xl '(#\< #\& #\>) nil)) (token (list :TEXT (str::rchars-to-string rchars)))) (parse-xml-aux x n xl (cons token acc)))) ; Basic tag balance checking with nice error reporting (defun entitytok-as-entity (x) (case (entitytok-type x) (:AMP "&") (:LT "<") (:GT ">") (:QUOT """) (:APOS "'") (:NBSP " ") (:MDASH "—") (:RARR "→") (:LSQUO "‘") (:RSQUO "’") (:LDQUO "“") (:RDQUO "”") )) (defun entitytok-as-plaintext (x) (case (entitytok-type x) (:AMP "&") (:LT "<") (:GT ">") (:QUOT "\"") (:APOS "'") (:NBSP " ") (:MDASH "---") (:RARR "-->") (:LSQUO "`") (:RSQUO "'") (:LDQUO "``") (:RDQUO "''"))) (defun flatten-token-for-errormsg (x) (cond ((opentok-p x) (str::cat "<" (opentok-name x) (if (opentok-atts x) " [...]>" ">"))) ((closetok-p x) (str::cat "")) ((entitytok-p x) (entitytok-as-entity x)) ((texttok-p x) (texttok-text x)) (t (er hard? 'flatten-token-for-errormsg "Invalid token ~x0" x)))) (defun flatten-tokens-for-errormsg (x) (if (atom x) "" (str::cat (flatten-token-for-errormsg (car x)) (flatten-tokens-for-errormsg (cdr x))))) (defun nearby-text (index str) (let* ((strlen (length str)) (start index) (stop (min strlen (+ index 60)))) (str::cat (substitute #\Space #\Newline (subseq str start stop)) (if (equal stop strlen) "" "...")))) (defun open-tag-backtrace-entry (orig-str open-tok) (declare (type string orig-str)) (let* ((n (opentok-loc open-tok)) (name (opentok-name open-tok)) (name-len (length name)) (spaces-len (max 0 (- 6 name-len))) (pad (coerce (make-list spaces-len :initial-element #\Space) 'string)) (nearby (nearby-text n orig-str))) (str::cat " <" name "> " pad nearby *nls*))) (defun open-tags-backtrace (orig-str open-tags) (if (atom open-tags) "" (str::cat (open-tag-backtrace-entry orig-str (car open-tags)) (open-tags-backtrace orig-str (cdr open-tags))))) (defun find-tag-imbalance (x open-tags loc) "Returns (MV ERROR/NIL LOC/NIL STILL-OPEN-TAGS/NIL)" (b* (((when (atom x)) (if open-tags (mv (str::cat (opentok-name (car open-tags)) " tag is never closed.") loc open-tags) (mv nil nil nil))) ((when (opentok-p (car x))) (find-tag-imbalance (cdr x) (cons (car x) open-tags) (+ 1 loc))) ((when (closetok-p (car x))) (b* ((close-name (closetok-name (car x))) ((unless (consp open-tags)) (mv (str::cat "Found with no matching opening tag.") loc open-tags)) (open-name (opentok-name (car open-tags))) ((unless (equal close-name open-name)) (mv (str::cat "Found <" open-name "> with mismatched .") loc open-tags))) (find-tag-imbalance (cdr x) (cdr open-tags) (+ 1 loc))))) (find-tag-imbalance (cdr x) open-tags (+ 1 loc)))) (defun parse-xml (x) (declare (type string x)) (b* (((mv err rtokens) (parse-xml-aux x 0 (length x) nil)) ((when err) (mv err nil)) (tokens (reverse rtokens)) ((mv err loc open-tags) (find-tag-imbalance tokens nil 0)) ((when err) (b* (((when (not loc)) (mv err nil)) (back-one (max 0 (- loc 4))) (start-ctx (nthcdr back-one tokens)) (context (take (min 8 (len start-ctx)) start-ctx)) (nearby (flatten-tokens-for-errormsg context))) (mv (str::cat err *nls* "Nearby text: {" nearby "}" *nls* *nls* (if open-tags (str::cat "Open tags stack:" *nls* (open-tags-backtrace x (reverse open-tags))) "")) nil)))) (mv err tokens)))