summaryrefslogtreecommitdiff
path: root/parsebib.el
diff options
context:
space:
mode:
Diffstat (limited to 'parsebib.el')
-rw-r--r--parsebib.el244
1 files changed, 228 insertions, 16 deletions
diff --git a/parsebib.el b/parsebib.el
index f382cd2..cb6b3f4 100644
--- a/parsebib.el
+++ b/parsebib.el
@@ -6,7 +6,7 @@
;; Author: Joost Kremers <joostkremers@fastmail.fm>
;; Maintainer: Joost Kremers <joostkremers@fastmail.fm>
;; Created: 2014
-;; Version: 3.0
+;; Version: 4.0
;; Keywords: text bibtex
;; URL: https://github.com/joostkremers/parsebib
;; Package-Requires: ((emacs "25.1"))
@@ -179,6 +179,198 @@ target field is set to the symbol `none'.")
(defconst parsebib--key-regexp "[^\"@\\#%',={} \t\n\f]+" "Regexp describing a licit key.")
(defconst parsebib--entry-start "^[ \t]*@" "Regexp describing the start of an entry.")
+(defun parsebib--build-TeX-accent-command-regexp (command accent)
+  "Build a regexp-replacement pair for LaTeX diacritics.
+
+COMMAND is the name of a TeX or LaTeX command (without
+backslash), ACCENT is the character (usually a Unicode combining
+character) that COMMAND generates.  Both COMMAND and ACCENT must
+be strings.
+
+The return value is a cons cell that can be included in
+`parsebib-TeX-markup-replace-alist' directly.
+
+The car of this cons cell is a regexp matching the TeX or LaTeX
+COMMAND, capturing exactly one obligatory argument.  The
+cdr is a replacement string, the concatenation of \"\\1\" and
+ACCENT.
+
+Specifically, the car regexp matches a string composed of a
+backslash, followed by COMMAND and a single letter (i.e.
+matching [[:alpha:]]).  The regexp matches if the letter is in
+curly braces (\"\\d{a}\") or if it is separated from COMMAND by
+white space (\"\\d a\").  If COMMAND is a non-letter character,
+the regexp also matches if the letter follows COMMAND
+immediately, without white space or curly braces (\"\\'a\").  In
+all variants, the letter is captured with group number 1."
+  (cons
+   (rx-to-string
+    `(: "\\" ,command
+        (or (: (* blank) "{" (group-n 1 letter) "}")
+            ;; A COMMAND containing a letter needs at least one blank before
+            ;; its argument; otherwise the blank is optional.
+            ;; `string-match-p' leaves the caller's match data untouched.
+            (: (,(if (string-match-p "[a-zA-Z]" command) '+ '*) blank)
+               (group-n 1 letter))))
+    t)
+   ;; Plain concatenation: the cdr is a replacement string, not a regexp, so
+   ;; ACCENT must not be regexp-quoted (a quoted \".\" would give the invalid
+   ;; replacement \"\\1\\.\").
+   (concat "\\1" accent)))
+
+(defun parsebib--build-TeX-command-regexp (command replacement)
+  "Return a (REGEXP . REPLACEMENT) pair for the LaTeX command COMMAND.
+
+COMMAND is the name of a TeX or LaTeX command, without the
+leading backslash.  It is either a string or a list of strings;
+in the latter case the regexp matches any one of the names.
+REPLACEMENT is returned unchanged as the cdr of the pair.
+
+The resulting cons cell can be put into
+`parsebib-TeX-markup-replace-alist' as is.
+
+The regexp matches a backslash, then COMMAND, then one of: a run
+of blanks, a word ending, or an empty pair of curly
+braces (`\\COMMAND{}').  Trailing blanks, when present, are part
+of the overall match."
+  (let* ((name (if (listp command) (cons 'or command) command))
+         ;; LaTeX treats a space that terminates a command as part of the
+         ;; command, so that space is not printed (the same happens with a
+         ;; following {}).  The blank alternative is therefore tried first,
+         ;; which makes the space part of the text that gets replaced.
+         (terminator '(or (+ blank) word-end "{}"))
+         (regexp (rx-to-string (list 'seq "\\" name terminator) t)))
+    (cons regexp replacement)))
+
+(defun parsebib--convert-tex-italics (str)
+  "Return the text of match group 1 in STR, with the `italic' face applied."
+  (let ((inner (match-string 1 str)))
+    (propertize inner 'face 'italic)))
+
+(defun parsebib--convert-tex-bold (str)
+  "Return the text of match group 1 in STR, with the `bold' face applied."
+  (let ((inner (match-string 1 str)))
+    (propertize inner 'face 'bold)))
+
+(defun parsebib--convert-tex-small-caps (str)
+  "Return the text of match group 1 in STR, converted to upper case."
+  (let ((inner (match-string 1 str)))
+    (upcase inner)))
+
+(defvar parsebib-TeX-markup-replace-alist
+  `(;; Commands defined to work in both math and text mode.  (Dashes are
+    ;; separate because they are not backslash-escaped, unlike everything else.)
+    ("---\\|\\\\textemdash\\(?: +\\|{}\\|\\>\\)" . "\N{EM DASH}")
+    ("--\\|\\\\textendash\\(?: +\\|{}\\|\\>\\)" . "\N{EN DASH}")
+    ,@(mapcar
+       (apply-partially 'apply 'parsebib--build-TeX-command-regexp)
+       '((("ddag" "textdaggerdbl") "\N{DOUBLE DAGGER}")
+         (("dag" "textdagger") "\N{DAGGER}")
+         ("textpertenthousand" "\N{PER TEN THOUSAND SIGN}")
+         ("textperthousand" "\N{PER MILLE SIGN}")
+         ("textquestiondown" "\N{INVERTED QUESTION MARK}")
+         ("P" "\N{PILCROW SIGN}")
+         ;; The control symbol \$ is handled with the other escaped special
+         ;; characters below: unlike a control word it needs no terminator
+         ;; and does not swallow a following space.
+         ("textdollar" "$")
+         ("S" "\N{SECTION SIGN}")
+         (("ldots" "dots" "textellipsis") "\N{HORIZONTAL ELLIPSIS}")))
+
+    ;; Text-mode Accents
+    ,@(mapcar
+       (apply-partially 'apply 'parsebib--build-TeX-accent-command-regexp)
+       '(("\"" "\N{COMBINING DIAERESIS}")
+         ("'" "\N{COMBINING ACUTE ACCENT}")
+         ("." "\N{COMBINING DOT ABOVE}")
+         ("=" "\N{COMBINING MACRON}")
+         ("^" "\N{COMBINING CIRCUMFLEX ACCENT}")
+         ("`" "\N{COMBINING GRAVE ACCENT}")
+         ("b" "\N{COMBINING MACRON BELOW}")
+         ("c" "\N{COMBINING CEDILLA}")
+         ("d" "\N{COMBINING DOT BELOW}")
+         ("H" "\N{COMBINING DOUBLE ACUTE ACCENT}")
+         ("k" "\N{COMBINING OGONEK}")
+         ("U" "\N{COMBINING DOUBLE VERTICAL LINE ABOVE}")
+         ("u" "\N{COMBINING BREVE}")
+         ("v" "\N{COMBINING CARON}")
+         ("~" "\N{COMBINING TILDE}")
+         ("|" "\N{COMBINING COMMA ABOVE}")
+         ("f" "\N{COMBINING INVERTED BREVE}")
+         ("G" "\N{COMBINING DOUBLE GRAVE ACCENT}")
+         ("h" "\N{COMBINING HOOK ABOVE}")
+         ("C" "\N{COMBINING DOUBLE GRAVE ACCENT}")
+         ("r" "\N{COMBINING RING ABOVE}")))
+
+    ;; LaTeX2 Escapable "Special" Characters
+    ("\\\\%" . "%") ("\\\\&" . "&") ("\\\\#" . "#") ("\\\\\\$" . "$")
+
+    ;; Quotes
+    ("``" . "\N{LEFT DOUBLE QUOTATION MARK}")
+    ("`" . "\N{LEFT SINGLE QUOTATION MARK}")
+    ("''" . "\N{RIGHT DOUBLE QUOTATION MARK}")
+    ("'" . "\N{RIGHT SINGLE QUOTATION MARK}")
+
+    ;; Formatting Commands
+    ("\\\\textit{\\(.*?\\)}" . parsebib--convert-tex-italics)
+    ("\\\\emph{\\(.*?\\)}" . parsebib--convert-tex-italics)
+    ("\\\\textbf{\\(.*?\\)}" . parsebib--convert-tex-bold)
+    ("\\\\textsc{\\(.*?\\)}" . parsebib--convert-tex-small-caps)
+
+    ;; Non-ASCII Letters (Excluding Accented Letters)
+    ,@(mapcar
+       (apply-partially 'apply 'parsebib--build-TeX-command-regexp)
+       '(("AA" "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}")
+         ("AE" "\N{LATIN CAPITAL LETTER AE}")
+         ("DH" "\N{LATIN CAPITAL LETTER ETH}")
+         ;; \DJ and \dj are the D with stroke, a different letter from the
+         ;; eth produced by \DH and \dh.
+         ("DJ" "\N{LATIN CAPITAL LETTER D WITH STROKE}")
+         ("L" "\N{LATIN CAPITAL LETTER L WITH STROKE}")
+         ("SS" "\N{LATIN CAPITAL LETTER SHARP S}")
+         ("NG" "\N{LATIN CAPITAL LETTER ENG}")
+         ("OE" "\N{LATIN CAPITAL LIGATURE OE}")
+         ("O" "\N{LATIN CAPITAL LETTER O WITH STROKE}")
+         ("TH" "\N{LATIN CAPITAL LETTER THORN}")
+
+         ("aa" "\N{LATIN SMALL LETTER A WITH RING ABOVE}")
+         ("ae" "\N{LATIN SMALL LETTER AE}")
+         ("dh" "\N{LATIN SMALL LETTER ETH}")
+         ("dj" "\N{LATIN SMALL LETTER D WITH STROKE}")
+         ("l" "\N{LATIN SMALL LETTER L WITH STROKE}")
+         ("ss" "\N{LATIN SMALL LETTER SHARP S}")
+         ("ng" "\N{LATIN SMALL LETTER ENG}")
+         ("oe" "\N{LATIN SMALL LIGATURE OE}")
+         ("o" "\N{LATIN SMALL LETTER O WITH STROKE}")
+         ("th" "\N{LATIN SMALL LETTER THORN}")
+
+         ("ij" "ij")
+         ("i" "\N{LATIN SMALL LETTER DOTLESS I}")
+         ("j" "\N{LATIN SMALL LETTER DOTLESS J}")))
+
+    ;; Commands with obligatory non-empty argument
+    ("\\\\[a-zA-Z*]+\\(?:\\[.*\\]\\)?{\\(.+?\\)}" . "\\1")
+
+    ;; Commands without arguments, optionally terminated by empty braces
+    ("\\(\\\\[a-zA-Z*]+\\)\\(?:\\[.*\\]\\)?\\(?:{}\\)?" . "\\1")
+
+    ;; Collapse white space
+    ("[[:blank:]]+" . " ")
+
+    ;; Remove all remaining {braces}
+    ("{" . "") ("}" . ""))
+  "Alist of strings and replacements for TeX markup.
+This is used in `parsebib-clean-TeX-markup' to make TeX markup more
+suitable for display.  Each item in the list consists of a regexp
+and its replacement.  The replacement can be a string (which will
+simply replace the match) or a function (the match will be
+replaced by the result of calling the function on the match
+string).  Earlier elements are evaluated before later ones, so if
+one string is a subpattern of another, the second must appear
+later (e.g. \"''\" is before \"'\").")
+
+(defun parsebib-clean-TeX-markup (string)
+  "Return STRING without TeX markup.
+Any substring matching the car of a cell in
+`parsebib-TeX-markup-replace-alist' is replaced with the
+corresponding cdr (if the cdr is a string), or with the result of
+calling the cdr on the match (if it is a function).  The cells
+are applied in list order, each one to the result of the previous
+one.  This is done with `replace-regexp-in-string', which see for
+details.
+
+Matching is case-sensitive, and the replacement is inserted with
+its case unchanged: it is never re-cased to follow the case of
+the text it replaces."
+  ;; Command names are case-sensitive in TeX (\\o vs. \\O), so do not fold case.
+  (let ((case-fold-search nil))
+    (save-match-data
+      (dolist (cell parsebib-TeX-markup-replace-alist)
+        ;; FIXEDCASE is t: without it, `replace-match' upcases the replacement
+        ;; whenever the matched text is an all-caps multi-letter command.
+        (setq string (replace-regexp-in-string
+                      (car cell) (cdr cell) string t)))
+      string)))
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Matching and parsing stuff ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -228,7 +420,7 @@ if a matching delimiter was found."
;; If forward-sexp does not result in an error, we want to return t.
t))
-(defun parsebib--parse-bib-value (limit &optional strings)
+(defun parsebib--parse-bib-value (limit &optional strings replace-TeX)
"Parse value at point.
A value is either a field value or a @String expansion. Return
the value as a string. No parsing is done beyond LIMIT, but note
@@ -238,7 +430,11 @@ STRINGS, if non-nil, is a hash table of @String definitions.
@String abbrevs in the value to be parsed are then replaced with
their expansions. Additionally, newlines in field values are
removed, white space is reduced to a single space and braces or
-double quotes around field values are removed."
+double quotes around field values are removed.
+
+REPLACE-TEX indicates whether TeX markup should be replaced with
+ASCII/Unicode characters. See the variable
+`parsebib-TeX-markup-replace-alist' for details."
(let (res)
(while (and (< (point) limit)
(not (looking-at-p ",")))
@@ -253,9 +449,12 @@ double quotes around field values are removed."
((looking-at "[[:space:]]*#[[:space:]]*")
(goto-char (match-end 0)))
(t (forward-char 1)))) ; So as not to get stuck in an infinite loop.
- (if strings
- (string-join (parsebib--expand-strings (nreverse res) strings))
- (string-join (nreverse res) " # "))))
+ (setq res (if strings
+ (string-join (parsebib--expand-strings (nreverse res) strings))
+ (string-join (nreverse res) " # ")))
+ (if replace-TeX
+ (parsebib-clean-TeX-markup res)
+ res)))
;;;;;;;;;;;;;;;;;;;;;
;; Expanding stuff ;;
@@ -456,7 +655,7 @@ point."
into hashid-fields
finally return (mapconcat #'identity hashid-fields "")))
-(defun parsebib-read-entry (type &optional pos strings fields)
+(defun parsebib-read-entry (type &optional pos strings fields replace-TeX)
"Read a BibTeX entry of type TYPE at the line POS is on.
TYPE should be a string and should not contain the @
sign. The return value is the entry as an alist of (<field> .
@@ -484,7 +683,11 @@ FIELDS is a list of the field names (as strings) to be read and
included in the result. Fields not in the list are ignored,
except \"=key=\" and \"=type=\", which are always included. Case
is ignored when comparing fields to the list in FIELDS. If
-FIELDS is nil, all fields are returned."
+FIELDS is nil, all fields are returned.
+
+REPLACE-TEX indicates whether TeX markup should be replaced with
+ASCII/Unicode characters. See the variable
+`parsebib-TeX-markup-replace-alist' for details."
(unless (member-ignore-case type '("comment" "preamble" "string"))
(when pos (goto-char pos))
(beginning-of-line)
@@ -501,7 +704,7 @@ FIELDS is nil, all fields are returned."
(buffer-substring-no-properties beg (point)))))
(or key (setq key "")) ; If no key was found, we pretend it's empty and try to read the entry anyway.
(skip-chars-forward "^," limit) ; Move to the comma after the entry key.
- (let ((fields (cl-loop for field = (parsebib--parse-bibtex-field limit strings fields)
+ (let ((fields (cl-loop for field = (parsebib--parse-bibtex-field limit strings fields replace-TeX)
while field
if (consp field) collect field)))
(push (cons "=type=" type) fields)
@@ -510,7 +713,7 @@ FIELDS is nil, all fields are returned."
(push (cons "=hashid=" (secure-hash 'sha256 (parsebib--get-hashid-string fields))) fields))
(nreverse fields))))))
-(defun parsebib--parse-bibtex-field (limit &optional strings fields)
+(defun parsebib--parse-bibtex-field (limit &optional strings fields replace-TeX)
"Parse the field starting at point.
Do not search beyond LIMIT (a buffer position). Return a
cons (FIELD . VALUE), or nil if no field was found.
@@ -522,7 +725,11 @@ FIELDS is a list of the field names (as strings) to be read and
included in the result. Fields not in the list are ignored,
except \"=key=\" and \"=type=\", which are always included. Case
is ignored when comparing fields to the list in FIELDS. If
-FIELDS is nil, all fields are returned."
+FIELDS is nil, all fields are returned.
+
+REPLACE-TEX indicates whether TeX markup should be replaced with
+ASCII/Unicode characters. See the variable
+`parsebib-TeX-markup-replace-alist' for details."
(skip-chars-forward "\"#%'(),={} \n\t\f" limit) ; Move to the first char of the field name.
(unless (>= (point) limit) ; If we haven't reached the end of the entry.
(let ((beg (point)))
@@ -530,7 +737,7 @@ FIELDS is nil, all fields are returned."
(let ((field-type (buffer-substring-no-properties beg (point))))
(if (or (not fields)
(member-ignore-case field-type fields))
- (cons field-type (parsebib--parse-bib-value limit strings))
+ (cons field-type (parsebib--parse-bib-value limit strings replace-TeX))
(parsebib--parse-bib-value limit) ; Skip over the field value.
:ignore)))))) ; Ignore this field but keep the `cl-loop' in `parsebib-read-entry' going.
@@ -650,7 +857,7 @@ file. Return nil if no dialect is found."
(string-match (concat "bibtex-dialect: " (regexp-opt (mapcar #'symbol-name bibtex-dialect-list) t)) comment))
(intern (match-string 1 comment))))))))
-(cl-defun parsebib-parse-bib-buffer (&key entries strings expand-strings inheritance fields)
+(cl-defun parsebib-parse-bib-buffer (&key entries strings expand-strings inheritance fields replace-TeX)
"Parse the current buffer and return all BibTeX data.
Return a list of five elements: a hash table with the entries, a
hash table with the @String definitions, a list of @Preamble
@@ -685,7 +892,11 @@ FIELDS is a list of the field names (as strings) to be read and
included in the result. Fields not in the list are ignored,
except \"=key=\" and \"=type=\", which are always included. Case
is ignored when comparing fields to the list in FIELDS. If
-FIELDS is nil, all fields are returned."
+FIELDS is nil, all fields are returned.
+
+REPLACE-TEX indicates whether TeX markup should be replaced with
+ASCII/Unicode characters. See the variable
+`parsebib-TeX-markup-replace-alist' for details."
(save-excursion
(goto-char (point-min))
(or (and (hash-table-p entries)
@@ -710,7 +921,7 @@ FIELDS is nil, all fields are returned."
((cl-equalp item "comment")
(push (parsebib-read-comment) comments))
((stringp item)
- (let ((entry (parsebib-read-entry item nil (if expand-strings strings) fields)))
+ (let ((entry (parsebib-read-entry item nil (if expand-strings strings) fields replace-TeX)))
(when entry
(puthash (cdr (assoc-string "=key=" entry)) entry entries))))))
(when inheritance (parsebib-expand-xrefs entries (if (eq inheritance t) dialect inheritance)))
@@ -1015,7 +1226,8 @@ details. If FIELDS is nil, all fields are returned."
:strings strings
:expand-strings display
:inheritance display
- :fields fields))
+ :fields fields
+ :replace-TeX display))
((string= (file-name-extension file t) ".json")
(parsebib-parse-json-buffer :entries entries
:stringify display