diff options
Diffstat (limited to 'parsebib.el')
-rw-r--r-- | parsebib.el | 244 |
1 files changed, 228 insertions, 16 deletions
diff --git a/parsebib.el b/parsebib.el index f382cd2..cb6b3f4 100644 --- a/parsebib.el +++ b/parsebib.el @@ -6,7 +6,7 @@ ;; Author: Joost Kremers <joostkremers@fastmail.fm> ;; Maintainer: Joost Kremers <joostkremers@fastmail.fm> ;; Created: 2014 -;; Version: 3.0 +;; Version: 4.0 ;; Keywords: text bibtex ;; URL: https://github.com/joostkremers/parsebib ;; Package-Requires: ((emacs "25.1")) @@ -179,6 +179,198 @@ target field is set to the symbol `none'.") (defconst parsebib--key-regexp "[^\"@\\#%',={} \t\n\f]+" "Regexp describing a licit key.") (defconst parsebib--entry-start "^[ \t]*@" "Regexp describing the start of an entry.") +(defun parsebib--build-TeX-accent-command-regexp (command accent) + "Build a regexp-replacement pair for LaTeX diacritics. + +COMMAND is the name of a TeX or LaTeX command (without +backslash), ACCENT is the character (usually a Unicode combining +character) that COMMAND generates. Both COMMAND and ACCENT must +be strings. + +The return value is a cons cell that can be included in +`parsebib-TeX-markup-replace-alist' directly. + +The car of this cons cell is a regexp matching the TeX or LaTeX +COMMAND, capturing exactly one obligatory argument. The +cdr is a replacement string, the concatenation of \"\\1\" and +ACCENT. + +Specifically, the car regexp matches a string composed of a +backslash, followed by COMMAND and a single letter (i.e. +matching [[:alpha:]]). The regexp matches if the letter is in +curly braces (\"\\d{a}\") or if it is separated from COMMAND by +white space (\"\\d a\"). If COMMAND is a non-letter character, +the regexp also matches if the letter follows COMMAND +immediately, without white space or curly braces (\"\\'a\"). In +all variants, the letter is captured with group number 1." 
+ (cons + (rx-to-string + `(: "\\" ,command + (or (: (* blank) "{" (group-n 1 letter) "}") + (: (,(if (string-match "[a-zA-Z]" command) '+ '*) blank) + (group-n 1 letter)))) + t) + (rx-to-string `(: (backref 1) ,accent) t))) + +(defun parsebib--build-TeX-command-regexp (command replacement) + "Build a regexp-replacement pair for a LaTeX command. + +COMMAND is the name of a TeX or LaTeX command (without +backslash) or a list of such names. REPLACEMENT is a string. + +The return value is a cons cell: its car is a regexp matching +COMMAND, its cdr is REPLACEMENT. This cons cell can be included +in `parsebib-TeX-markup-replace-alist' directly. + +Specifically, the regexp matches a string composed of a backslash +followed by COMMAND and terminated by a pair of curly +braces (`\\COMMAND{}'), a word ending or a space. Such a +trailing space will be included in the overall match." + (cons + (rx-to-string + `(: "\\" ,(if (listp command) `(or ,@command) command) + ;; If a command is terminated by a space, LaTeX includes that + ;; space in the command itself, so it is not printed (like the + ;; behaviour for a following {}). Accordingly, if there is one, + ;; include that space in the replaced string by matching on it + ;; first. + (or (+ blank) word-end "{}")) + t) + replacement)) + +(defun parsebib--convert-tex-italics (str) + "Return first sub-expression match in STR, in italics." + (propertize (match-string 1 str) 'face 'italic)) + +(defun parsebib--convert-tex-bold (str) + "Return first sub-expression match in STR, in bold." + (propertize (match-string 1 str) 'face 'bold)) + +(defun parsebib--convert-tex-small-caps (str) + "Return first sub-expression match in STR, in upper case." + (upcase (match-string 1 str))) + +(defvar parsebib-TeX-markup-replace-alist + `(;; Commands defined to work in both math and text mode. (Dashes are + ;; separate because they are not backslash-escaped, unlike everything else.) + ("---\\|\\\\textemdash\\(?: +\\|{}\\|\\>\\)" . 
"\N{EM DASH}") + ("--\\|\\\\textendash\\(?: +\\|{}\\|\\>\\)" . "\N{EN DASH}") + ,@(mapcar + (apply-partially 'apply 'parsebib--build-TeX-command-regexp) + '((("ddag" "textdaggerdbl") "\N{DOUBLE DAGGER}") + (("dag" "textdagger") "\N{DAGGER}") + ("textpertenthousand" "\N{PER TEN THOUSAND SIGN}") + ("textperthousand" "\N{PER MILLE SIGN}") + ("textquestiondown" "\N{INVERTED QUESTION MARK}") + ("P" "\N{PILCROW SIGN}") + ("textdollar" "$") + ("S" "\N{SECTION SIGN}") + (("ldots" "dots" "textellipsis") "\N{HORIZONTAL ELLIPSIS}"))) + + ;; Text-mode Accents + ,@(mapcar + (apply-partially 'apply 'parsebib--build-TeX-accent-command-regexp) + '(("\"" "\N{COMBINING DIAERESIS}") + ("'" "\N{COMBINING ACUTE ACCENT}") + ("." "\N{COMBINING DOT ABOVE}") + ("=" "\N{COMBINING MACRON}") + ("^" "\N{COMBINING CIRCUMFLEX ACCENT}") + ("`" "\N{COMBINING GRAVE ACCENT}") + ("b" "\N{COMBINING MACRON BELOW}") + ("c" "\N{COMBINING CEDILLA}") + ("d" "\N{COMBINING DOT BELOW}") + ("H" "\N{COMBINING DOUBLE ACUTE ACCENT}") + ("k" "\N{COMBINING OGONEK}") + ("U" "\N{COMBINING DOUBLE VERTICAL LINE ABOVE}") + ("u" "\N{COMBINING BREVE}") + ("v" "\N{COMBINING CARON}") + ("~" "\N{COMBINING TILDE}") + ("|" "\N{COMBINING COMMA ABOVE}") + ("f" "\N{COMBINING INVERTED BREVE}") + ("G" "\N{COMBINING DOUBLE GRAVE ACCENT}") + ("h" "\N{COMBINING HOOK ABOVE}") + ("C" "\N{COMBINING DOUBLE GRAVE ACCENT}") + ("r" "\N{COMBINING RING ABOVE}"))) + + ;; LaTeX2 Escapable "Special" Characters + ("\\\\%" . "%") ("\\\\&" . "&") ("\\\\#" . "#") ("\\\\\\$" . "$") ; control symbols: matched literally, a following space is kept + + ;; Quotes + ("``" . "\N{LEFT DOUBLE QUOTATION MARK}") + ("`" . "\N{LEFT SINGLE QUOTATION MARK}") + ("''" . "\N{RIGHT DOUBLE QUOTATION MARK}") + ("'" . "\N{RIGHT SINGLE QUOTATION MARK}") + + ;; Formatting Commands + ("\\\\textit{\\(.*?\\)}" . parsebib--convert-tex-italics) + ("\\\\emph{\\(.*?\\)}" . parsebib--convert-tex-italics) + ("\\\\textbf{\\(.*?\\)}" . parsebib--convert-tex-bold) + ("\\\\textsc{\\(.*?\\)}" . 
parsebib--convert-tex-small-caps) + + ;; Non-ASCII Letters (Excluding Accented Letters) + ,@(mapcar + (apply-partially 'apply 'parsebib--build-TeX-command-regexp) + '(("AA" "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}") + ("AE" "\N{LATIN CAPITAL LETTER AE}") + ("DH" "\N{LATIN CAPITAL LETTER ETH}") + ("DJ" "\N{LATIN CAPITAL LETTER D WITH STROKE}") + ("L" "\N{LATIN CAPITAL LETTER L WITH STROKE}") + ("SS" "\N{LATIN CAPITAL LETTER SHARP S}") + ("NG" "\N{LATIN CAPITAL LETTER ENG}") + ("OE" "\N{LATIN CAPITAL LIGATURE OE}") + ("O" "\N{LATIN CAPITAL LETTER O WITH STROKE}") + ("TH" "\N{LATIN CAPITAL LETTER THORN}") + + ("aa" "\N{LATIN SMALL LETTER A WITH RING ABOVE}") + ("ae" "\N{LATIN SMALL LETTER AE}") + ("dh" "\N{LATIN SMALL LETTER ETH}") + ("dj" "\N{LATIN SMALL LETTER D WITH STROKE}") + ("l" "\N{LATIN SMALL LETTER L WITH STROKE}") + ("ss" "\N{LATIN SMALL LETTER SHARP S}") + ("ng" "\N{LATIN SMALL LETTER ENG}") + ("oe" "\N{LATIN SMALL LIGATURE OE}") + ("o" "\N{LATIN SMALL LETTER O WITH STROKE}") + ("th" "\N{LATIN SMALL LETTER THORN}") + + ("ij" "ij") + ("i" "\N{LATIN SMALL LETTER DOTLESS I}") + ("j" "\N{LATIN SMALL LETTER DOTLESS J}"))) + + ;; Commands with obligatory non-empty argument + ("\\\\[a-zA-Z*]+\\(?:\\[.*\\]\\)?{\\(.+?\\)}" . "\\1") + + ;; Commands without arguments, optionally terminated by empty braces + ("\\(\\\\[a-zA-Z*]+\\)\\(?:\\[.*\\]\\)?\\(?:{}\\)?" . "\\1") + + ;; Collapse white space + ("[[:blank:]]+" . " ") + + ;; Remove all remaining {braces} + ("{" . "") ("}" . "")) + "Alist of strings and replacements for TeX markup. +This is used in `parsebib-clean-TeX-markup' to make TeX markup more +suitable for display. Each item in the list consists of a regexp +and its replacement. The replacement can be a string (which will +simply replace the match) or a function (the match will be +replaced by the result of calling the function on the match +string). 
Earlier elements are evaluated before later ones, so if +one string is a subpattern of another, the subpattern must appear +later (e.g. \"''\" is before \"'\").") + +(defun parsebib-clean-TeX-markup (string) + "Return STRING without TeX markup. +Any substring matching the car of a cell in +`parsebib-TeX-markup-replace-alist' is replaced with the +corresponding cdr (if the cdr is a string), or with the result of +calling the cdr on the match (if it is a function). Matching is +case-sensitive and uses `replace-regexp-in-string', which see." + (let ((case-fold-search nil)) + (save-match-data + (cl-loop for (pattern . replacement) in parsebib-TeX-markup-replace-alist + do (setq string (replace-regexp-in-string + pattern replacement string)) + finally return string)))) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Matching and parsing stuff ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -228,7 +420,7 @@ if a matching delimiter was found." ;; If forward-sexp does not result in an error, we want to return t. t)) -(defun parsebib--parse-bib-value (limit &optional strings) +(defun parsebib--parse-bib-value (limit &optional strings replace-TeX) "Parse value at point. A value is either a field value or a @String expansion. Return the value as a string. No parsing is done beyond LIMIT, but note @@ -238,7 +430,11 @@ STRINGS, if non-nil, is a hash table of @String definitions. @String abbrevs in the value to be parsed are then replaced with their expansions. Additionally, newlines in field values are removed, white space is reduced to a single space and braces or -double quotes around field values are removed." +double quotes around field values are removed. + +REPLACE-TEX indicates whether TeX markup should be replaced with +ASCII/Unicode characters. See the variable +`parsebib-TeX-markup-replace-alist' for details." (let (res) (while (and (< (point) limit) (not (looking-at-p ","))) @@ -253,9 +449,12 @@ double quotes around field values are removed." 
((looking-at "[[:space:]]*#[[:space:]]*") (goto-char (match-end 0))) (t (forward-char 1)))) ; So as not to get stuck in an infinite loop. - (if strings - (string-join (parsebib--expand-strings (nreverse res) strings)) - (string-join (nreverse res) " # ")))) + (setq res (if strings + (string-join (parsebib--expand-strings (nreverse res) strings)) + (string-join (nreverse res) " # "))) + (if replace-TeX + (parsebib-clean-TeX-markup res) + res))) ;;;;;;;;;;;;;;;;;;;;; ;; Expanding stuff ;; @@ -456,7 +655,7 @@ point." into hashid-fields finally return (mapconcat #'identity hashid-fields ""))) -(defun parsebib-read-entry (type &optional pos strings fields) +(defun parsebib-read-entry (type &optional pos strings fields replace-TeX) "Read a BibTeX entry of type TYPE at the line POS is on. TYPE should be a string and should not contain the @ sign. The return value is the entry as an alist of (<field> . @@ -484,7 +683,11 @@ FIELDS is a list of the field names (as strings) to be read and included in the result. Fields not in the list are ignored, except \"=key=\" and \"=type=\", which are always included. Case is ignored when comparing fields to the list in FIELDS. If -FIELDS is nil, all fields are returned." +FIELDS is nil, all fields are returned. + +REPLACE-TEX indicates whether TeX markup should be replaced with +ASCII/Unicode characters. See the variable +`parsebib-TeX-markup-replace-alist' for details." (unless (member-ignore-case type '("comment" "preamble" "string")) (when pos (goto-char pos)) (beginning-of-line) @@ -501,7 +704,7 @@ FIELDS is nil, all fields are returned." (buffer-substring-no-properties beg (point))))) (or key (setq key "")) ; If no key was found, we pretend it's empty and try to read the entry anyway. (skip-chars-forward "^," limit) ; Move to the comma after the entry key. 
- (let ((fields (cl-loop for field = (parsebib--parse-bibtex-field limit strings fields) + (let ((fields (cl-loop for field = (parsebib--parse-bibtex-field limit strings fields replace-TeX) while field if (consp field) collect field))) (push (cons "=type=" type) fields) @@ -510,7 +713,7 @@ FIELDS is nil, all fields are returned." (push (cons "=hashid=" (secure-hash 'sha256 (parsebib--get-hashid-string fields))) fields)) (nreverse fields)))))) -(defun parsebib--parse-bibtex-field (limit &optional strings fields) +(defun parsebib--parse-bibtex-field (limit &optional strings fields replace-TeX) "Parse the field starting at point. Do not search beyond LIMIT (a buffer position). Return a cons (FIELD . VALUE), or nil if no field was found. @@ -522,7 +725,11 @@ FIELDS is a list of the field names (as strings) to be read and included in the result. Fields not in the list are ignored, except \"=key=\" and \"=type=\", which are always included. Case is ignored when comparing fields to the list in FIELDS. If -FIELDS is nil, all fields are returned." +FIELDS is nil, all fields are returned. + +REPLACE-TEX indicates whether TeX markup should be replaced with +ASCII/Unicode characters. See the variable +`parsebib-TeX-markup-replace-alist' for details." (skip-chars-forward "\"#%'(),={} \n\t\f" limit) ; Move to the first char of the field name. (unless (>= (point) limit) ; If we haven't reached the end of the entry. (let ((beg (point))) @@ -530,7 +737,7 @@ FIELDS is nil, all fields are returned." (let ((field-type (buffer-substring-no-properties beg (point)))) (if (or (not fields) (member-ignore-case field-type fields)) - (cons field-type (parsebib--parse-bib-value limit strings)) + (cons field-type (parsebib--parse-bib-value limit strings replace-TeX)) (parsebib--parse-bib-value limit) ; Skip over the field value. :ignore)))))) ; Ignore this field but keep the `cl-loop' in `parsebib-read-entry' going. @@ -650,7 +857,7 @@ file. Return nil if no dialect is found." 
(string-match (concat "bibtex-dialect: " (regexp-opt (mapcar #'symbol-name bibtex-dialect-list) t)) comment)) (intern (match-string 1 comment)))))))) -(cl-defun parsebib-parse-bib-buffer (&key entries strings expand-strings inheritance fields) +(cl-defun parsebib-parse-bib-buffer (&key entries strings expand-strings inheritance fields replace-TeX) "Parse the current buffer and return all BibTeX data. Return a list of five elements: a hash table with the entries, a hash table with the @String definitions, a list of @Preamble @@ -685,7 +892,11 @@ FIELDS is a list of the field names (as strings) to be read and included in the result. Fields not in the list are ignored, except \"=key=\" and \"=type=\", which are always included. Case is ignored when comparing fields to the list in FIELDS. If -FIELDS is nil, all fields are returned." +FIELDS is nil, all fields are returned. + +REPLACE-TEX indicates whether TeX markup should be replaced with +ASCII/Unicode characters. See the variable +`parsebib-TeX-markup-replace-alist' for details." (save-excursion (goto-char (point-min)) (or (and (hash-table-p entries) @@ -710,7 +921,7 @@ FIELDS is nil, all fields are returned." ((cl-equalp item "comment") (push (parsebib-read-comment) comments)) ((stringp item) - (let ((entry (parsebib-read-entry item nil (if expand-strings strings) fields))) + (let ((entry (parsebib-read-entry item nil (if expand-strings strings) fields replace-TeX))) (when entry (puthash (cdr (assoc-string "=key=" entry)) entry entries)))))) (when inheritance (parsebib-expand-xrefs entries (if (eq inheritance t) dialect inheritance))) @@ -1015,7 +1226,8 @@ details. If FIELDS is nil, all fields are returned." :strings strings :expand-strings display :inheritance display - :fields fields)) + :fields fields + :replace-TeX display)) ((string= (file-name-extension file t) ".json") (parsebib-parse-json-buffer :entries entries :stringify display |