summaryrefslogtreecommitdiff
path: root/parsebib.el
blob: 12b2cf2c275646d0ebf66a95551559729652e5c0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
;;; parsebib.el --- A library for parsing bib files  -*- lexical-binding: t -*-

;; Copyright (c) 2014 Joost Kremers
;; All rights reserved.

;; Author: Joost Kremers <joostkremers@fastmail.fm>
;; Maintainer: Joost Kremers <joostkremers@fastmail.fm>
;; Created: 2014
;; Version: 1.0
;; Keywords: text bibtex
;; Package-Requires: ((emacs "24.3"))

;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions
;; are met:
;;
;; 1. Redistributions of source code must retain the above copyright
;;    notice, this list of conditions and the following disclaimer.
;; 2. Redistributions in binary form must reproduce the above copyright
;;    notice, this list of conditions and the following disclaimer in the
;;    documentation and/or other materials provided with the distribution.
;; 3. The name of the author may not be used to endorse or promote products
;;    derived from this software without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
;; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
;; IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
;; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
;; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES ; LOSS OF USE,
;; DATA, OR PROFITS ; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
;; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;; Commentary:

;;

;;; Code:

(require 'bibtex)
(require 'cl-lib)
(require 'subr-x) ; for `string-join'.

(defconst parsebib--bibtex-identifier "[^^\"@\\&$#%',={}() \t\n\f]*" "Regexp describing a licit BibTeX identifier.")
(defconst parsebib--key-regexp "[^^\"@\\&$#%',={} \t\n\f]*" "Regexp describing a licit key.")
(defconst parsebib--entry-start "^[ \t]*@" "Regexp describing the start of an entry.")

;; Emacs 24.3 compatibility code.
(unless (fboundp 'define-error)
  ;; This definition is simply copied from the Emacs 24.4 sources
  (defun define-error (name message &optional parent)
    "Define NAME as a new error signal.
MESSAGE is a string that will be output to the echo area if such an error
is signaled without being caught by a `condition-case'.
PARENT is either a signal or a list of signals from which it inherits.
Defaults to `error'."
    (unless parent (setq parent 'error))
    (let ((conditions
           (if (consp parent)
               (apply #'nconc
                      (mapcar (lambda (parent)
                                (cons parent
                                      (or (get parent 'error-conditions)
                                          (error "Unknown signal `%s'" parent))))
                              parent))
             (cons parent (get parent 'error-conditions)))))
      (put name 'error-conditions
           (delete-dups (copy-sequence (cons name conditions))))
      (when message (put name 'error-message message)))))

(define-error 'parsebib-entry-type-error "Illegal entry type" 'error)

;;;;;;;;;;;;;;;;;;;;
;; matching stuff ;;
;;;;;;;;;;;;;;;;;;;;

(defun parsebib--looking-at-goto-end (str &optional match)
  "Like `looking-at' but move point to the end of the matching string STR.
MATCH acts just like the argument to MATCH-END, and defaults to
0. Comparison is done case-insensitively."
  (or match (setq match 0))
  (let ((case-fold-search t))
    (if (looking-at str)
        (goto-char (match-end match)))))

(defun parsebib--match-paren-forward ()
  "Move forward to the closing paren matching the opening paren at point.
This function handles parentheses () and braces {}.  Return t if a
matching parenthesis was found.  Note that this function puts
point right before the closing delimiter (unlike e.g.,
`forward-sexp', which puts it right after.)"
  (let ((result (cond
                 ((eq (char-after) ?\{)
                  (parsebib--match-brace-forward))
                 ((eq (char-after) ?\()
                  ;; This is really a hack. We want to allow unbalanced parentheses in
                  ;; field values (BibTeX does), so we cannot use forward-sexp
                  ;; here. For the same reason, looking for the matching paren by hand
                  ;; is pretty complicated. However, balanced parentheses can only be
                  ;; used to enclose entire entries (or @STRINGs or @PREAMBLEs) so we
                  ;; can be pretty sure we'll find it right before the next @ at the
                  ;; start of a line, or right before the end of the file.
                  (let ((beg (point)))
                    (re-search-forward parsebib--entry-start nil 0)
                    (skip-chars-backward "@ \n\t\f")
                    (if (eq (char-after) ?\))
                        ;; if we've found a closing paren, return t
                        t
                      ;; otherwise put the cursor back and signal an error
                      (goto-char beg)
                      (signal 'scan-error (list "Unbalanced parentheses" beg (point-max)))))))))
    (when result
      ;; move point one char back to place it where the rest of parsebib expects it
      (forward-char -1)
      ;; make sure we return t
      result)))

(defun parsebib--match-delim-forward ()
  "Move forward to the closing delimiter matching the delimiter at point.
This function handles braces {} and double quotes \"\". Return t
if a matching delimiter was found. Note that this function puts
point right before the closing delimiter (unlike e.g.,
`forward-sexp', which puts it right after.)"
  (let ((result (cond
                 ((eq (char-after) ?\{)
                  (parsebib--match-brace-forward))
                 ((eq (char-after) ?\")
                  (parsebib--match-quote-forward)))))
    (when result
      ;; move point one char back to place it where the rest of parsebib expects it
      (forward-char -1)
      ;; make sure we return t
      result)))

(defun parsebib--match-brace-forward ()
  "Move forward to the closing brace matching the opening brace at point."
  (with-syntax-table bibtex-braced-string-syntax-table
    (forward-sexp 1)
    ;; if forward-sexp does not result in an error, we want to return t
    t))

(defun parsebib--match-quote-forward ()
  "Move to the closing double quote matching the quote at point."
  (with-syntax-table bibtex-quoted-string-syntax-table
    (forward-sexp 1)
    ;; if forward-sexp does not result in an error, we want to return t
    t))


;;;;;;;;;;;;;;;;;;;;;;;;
;; parsing a bib file ;;
;;;;;;;;;;;;;;;;;;;;;;;;

(defun parsebib-find-next-item (&optional pos)
  "Find the first (potential) BibTeX item following POS.

This function simply searches for an @ at the start of a line,
possibly preceded by spaces or tabs, followed by a string of
characters as defined by `parsebib--bibtex-identifier'.  When
successful, point is placed right after the item's type, i.e.,
generally on the opening brace or parenthesis following the entry
type, \"@Comment\", \"@Preamble\" or \"@String\".

The return value is the name of the item as a string, either
\"Comment\", \"Preamble\" or \"String\", or the entry
type (without the @). If an item name is found that includes an
illegal character, an error of type `parsebib-entry-type-error'
is raised. If no item is found, nil is returned and point is left
at the end of the buffer.

POS can be a number or a marker and defaults to point."
  (when pos (goto-char pos))
  (when (re-search-forward parsebib--entry-start nil 0)
    (if (parsebib--looking-at-goto-end (concat "\\(" parsebib--bibtex-identifier "\\)" "[[:space:]]*[\(\{]") 1)
        (match-string-no-properties 1)
      (signal 'parsebib-entry-type-error (list (point))))))

(defun parsebib-read-comment (&optional pos)
  "Read the @Comment beginning at the line POS is on.
Return value is the text of the @Comment or nil if no comment is
found.

POS can be a number or a marker.  It does not have to be at the
beginning of a line, but the @Comment entry must start at the
beginning of the line POS is on.  If POS is nil, it defaults to
point."
  (when pos (goto-char pos))
  (beginning-of-line)
  (when (parsebib--looking-at-goto-end (concat parsebib--entry-start "comment[[:space:]]*[\(\{]"))
    (let ((beg (point))) ; we are right after the opening brace / parenthesis
      (forward-char -1)  ; move back to the brace / paren
      (when (parsebib--match-paren-forward)
        (buffer-substring-no-properties beg (point))))))

(defun parsebib-replace-strings (val strings)
  "Replace strings in VAL using (key, value) pairs from hash table STRINGS."
  (replace-regexp-in-string "\"" "" (string-join (mapcar
                                                  (lambda (x) (if strings (gethash x strings x) x))
                                                  (split-string val " # " t)))))

(defun parsebib-read-string (&optional pos strings)
  "Read the @String definition beginning at the line POS is on.
If a proper abbreviation and expansion are found, they are
returned as a cons cell (<abbrev> . <expansion>).  Otherwise, nil
is returned.

POS can be a number or a marker.  It does not have to be at the
beginning of a line, but the @String entry must start at the
beginning of the line POS is on.  If POS is nil, it defaults to
point.

If STRINGS is provided it is assumed to be a hash table passed to
parsebib-replace-strings."
  (when pos (goto-char pos))
  (beginning-of-line)
  (when (parsebib--looking-at-goto-end (concat parsebib--entry-start "string[[:space:]]*[\(\{]"))
    (let ((limit (save-excursion        ; find the position of the matching end parenthesis
                   (forward-char -1)
                   (parsebib--match-paren-forward)
                   (point))))
      (skip-chars-forward "#%'(),={} \n\t\f" limit) ; move up to the abbrev
      (let* ((beg (point))                          ; read the abbrev
             (abbr (if (parsebib--looking-at-goto-end (concat "\\(" parsebib--bibtex-identifier "\\)[ \t\n\f]*=") 1)
                       (buffer-substring-no-properties beg (point))
                     nil)))
        (when (and abbr (> (length abbr) 0))            ; if we found an abbrev
          (skip-chars-forward "#%'(),={} \n\t\f" limit) ; move forward to the expansion
          (let* ((beg (point))                          ; read the expansion
                 (expansion (parsebib-replace-strings
                             (buffer-substring-no-properties beg limit)
                             strings)))
            (cons abbr expansion)))))))

(defun parsebib-read-preamble (&optional pos)
  "Read the @Preamble definition at the line POS is on.
Return the preamble as a string, or nil if none was found.

POS can be a number or a marker.  It does not have to be at the
beginning of a line, but the @Preamble must start at the
beginning of the line POS is on.  If POS is nil, it defaults to
point."
  (when pos (goto-char pos))
  (beginning-of-line)
  (when (parsebib--looking-at-goto-end (concat parsebib--entry-start "preamble[[:space:]]*[\(\{]"))
    (let ((beg (point)))
      (forward-char -1)
      (when (parsebib--match-paren-forward)
        (buffer-substring-no-properties beg (point))))))

(defun parsebib-read-entry (type &optional pos strings)
  "Read a BibTeX entry of type TYPE at the line POS is on.

TYPE should be a string and should not contain the @
sign.  The return value is the entry as an alist of (<field> .
<contents>) cons pairs, or nil if no entry was found.  In this
alist, the entry key is provided in the field \"=key=\" and the
entry type in the field \"=type=\".

POS can be a number or a marker.  It does not have to be at the
beginning of a line, but the entry must start at the beginning of
the line POS is on.  If POS is nil, it defaults to point.

ENTRY should not be \"Comment\", \"Preamble\" or \"String\", but
is otherwise not limited to any set of possible entry types. If
so required, the calling function has to ensure that the entry
type is valid.

If STRINGS is provided it is assumed to be a hash table passed to
parsebib-replace-strings."
  (unless (member-ignore-case type '("comment" "preamble" "string"))
    (when pos (goto-char pos))
    (beginning-of-line)
    (when (parsebib--looking-at-goto-end (concat parsebib--entry-start type "[[:space:]]*[\(\{]"))
      ;; find the end of the entry and the beginning of the entry key
      (let* ((limit (save-excursion
                      (backward-char)
                      (parsebib--match-paren-forward)
                      (point)))
             (beg (progn
                    (skip-chars-forward " \n\t\f") ; note the space!
                    (point)))
             (key (when (parsebib--looking-at-goto-end (concat "\\(" parsebib--key-regexp "\\)[ \t\n\f]*,") 1)
                    (buffer-substring-no-properties beg (point)))))
        (or key (setq key "")) ; if no key was found, we pretend it's empty and try to read the entry anyway
        (skip-chars-forward "^," limit) ; move to the comma after the entry key
        (let ((fields (cl-loop for field = (parsebib--find-bibtex-field limit strings)
                               while field collect field)))
          (push (cons "=type=" type) fields)
          (push (cons "=key=" key) fields)
          (nreverse fields))))))

(defun parsebib--find-bibtex-field (limit &optional strings)
  "Find the field after point.
Do not search beyond LIMIT (a buffer position).  Return a
cons (FIELD . VALUE), or nil if no field was found.

If STRINGS is provided it is assumed to be a hash table passed to
parsebib-replace-strings."
  (skip-chars-forward "\"#%'(),={} \n\t\f" limit) ; move to the first char of the field name
  (unless (>= (point) limit)                      ; if we haven't reached the end of the entry
    (let ((beg (point)))
      (if (parsebib--looking-at-goto-end (concat "\\(" parsebib--bibtex-identifier "\\)[ \t\n\f]*=") 1)
          (let ((field-type (buffer-substring-no-properties beg (point))))
            (skip-chars-forward "#%'()=} \n\t\f" limit) ; move to the field contents
            (let* ((beg (point))
                   (field-contents (buffer-substring-no-properties beg (parsebib--find-end-of-field limit))))
              (cons field-type (parsebib-replace-strings field-contents strings))))))))

(defun parsebib--find-end-of-field (limit)
  "Move point to the end of a field's contents and return point.
The contents of a field is delimited by a comma or by the closing brace of
the entry.  The latter should be at position LIMIT."
  (while (and (not (eq (char-after) ?\,))
              (< (point) limit))
    (parsebib--match-delim-forward) ; check if we're on a delimiter and if so, jump to the matching closing delimiter
    (forward-char 1))
  (if (= (point) limit)
      (skip-chars-backward " \n\t\f"))
  (point))

(defun parsebib-collect-strings (&optional hash)
  "Collect all @String definitions in the current buffer.
Return value is a hash with the abbreviations as keys and the
expansions as values.  If HASH is a hash table with test function
`equal', it is used to store the @String definitions."
  (save-excursion
    (goto-char (point-min))
    (let* ((res (if (and (hash-table-p hash)
                         (eq 'equal (hash-table-test hash)))
                    hash
                  (make-hash-table :test #'equal)))
           string)
      (cl-loop for item = (parsebib-find-next-item)
               while item do
               (when (cl-equalp item "string")
                 (setq string (parsebib-read-string))
                 (puthash (car string) (cdr string) res)))
      hash)))

(defun parsebib-collect-preambles ()
  "Collect all @Preamble definitions in the current buffer.
Return a list of strings, each string a separate @Preamble."
  (save-excursion
    (goto-char (point-min))
    (let (res)
      (cl-loop for item = (parsebib-find-next-item)
               while item do
               (when (cl-equalp item "preamble")
                 (push (parsebib-read-preamble) res)))
      (nreverse res))))

(defun parsebib-collect-comments ()
  "Collect all @Comment definitions in the current buffer.
Return a list of strings, each string a separate @Comment."
  (save-excursion
    (goto-char (point-min))
    (let (res)
      (cl-loop for item = (parsebib-find-next-item)
               while item do
               (when (cl-equalp item "comment")
                 (push (parsebib-read-comment) res)))
      (nreverse res))))

(defun parsebib-collect-entries (&optional hash strings)
  "Collect all entries is the current buffer.
Return value is a hash table containing the entries.  If HASH is
a hash table, with test function `equal', it is used to store the
entries.  If STRINGS is non-nil, it should be a hash table of
string definitions, which are used to expand abbreviations used
in the entries."
  (save-excursion
    (goto-char (point-min))
    (let* ((res (if (and (hash-table-p hash)
                         (eq 'equal (hash-table-test hash)))
                    hash
                  (make-hash-table :test #'equal)))
           entry)
      (cl-loop for entry-type = (parsebib-find-next-item)
               while entry-type do
               (unless (member-ignore-case entry-type '("preamble" "string" "comment"))
                 (setq entry (parsebib-read-entry entry-type nil strings))
                 (if entry
                     (puthash (cdr (assoc-string "=key=" entry)) entry res))))
      hash)))

(defun parsebib-parse-buffer (&optional entries-hash strings-hash expand-strings)
  "Parse the current buffer and return all BibTeX data.
Return list of four elements: a hash table with the entries, a
hash table with the @String definitions, a list of @Preamble
definitions, and a list of @Comments.

If ENTRIES-HASH is a hash table with test function `equal', it is
used to store the entries.  Any existing entries with identical
keys are overwritten.  Similarly, if STRINGS-HASH is a hash table
with test function `equal', the @String definitions are stored in
it.

If EXPAND-STRINGS is non-nil, abbreviations in the entries and
@String definitions are expanded using the @String definitions
already in STRINGS."
  (save-excursion
    (goto-char (point-min))
    (let ((entries (if (and (hash-table-p entries-hash)
                            (eq (hash-table-test entries-hash) 'equal))
                       entries-hash
                     (make-hash-table :test #'equal)))
          (strings (if (and (hash-table-p strings-hash)
                            (eq (hash-table-test strings-hash) 'equal))
                       strings-hash
                     (make-hash-table :test #'equal)))
          preambles comments)
      (cl-loop for item = (parsebib-find-next-item)
               while item do
               (cond
                ((cl-equalp item "string") ; `cl-equalp' compares strings case-insensitively.
                 (let ((string (parsebib-read-string nil (if expand-strings strings))))
                   (if string
                       (puthash (car string) (cdr string) strings))))
                ((cl-equalp item "preamble")
                 (push (parsebib-read-preamble) preambles))
                ((cl-equalp item "comment")
                 (push (parsebib-read-comment) comments))
                ((stringp item)
                 (let ((entry (parsebib-read-entry item nil (if expand-strings strings))))
                   (when entry
                     (puthash (cdr (assoc-string "=key=" entry)) entry entries))))))
      (list entries strings preambles comments))))

(defun parsebib-find-bibtex-dialect ()
  "Find the BibTeX dialect of a file if one is set.
This function looks for a local value of the variable
`bibtex-dialect' in the local variable block at the end of the
file.  Return nil if no dialect is found."
  (save-excursion
    (goto-char (point-max))
    (let ((case-fold-search t))
      (when (re-search-backward (concat parsebib--entry-start "comment") (- (point-max) 3000) t)
        (let ((comment (parsebib-read-comment)))
          (when (and comment
                     (string-match-p "\\`[ \n\t\r]*Local Variables:" comment)
                     (string-match-p "End:[ \n\t\r]*\\'" comment)
                     (string-match (concat "bibtex-dialect: " (regexp-opt (mapcar #'symbol-name bibtex-dialect-list) t)) comment))
            (intern (match-string 1 comment))))))))

(provide 'parsebib)

;;; parsebib.el ends here