summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNuno Carvalho <smash@cpan.org>2017-08-31 14:21:51 -0300
committerNuno Carvalho <smash@cpan.org>2017-08-31 14:21:51 -0300
commit12e858cd4a5378f5fb4b08a650df7d9a8da49bcc (patch)
tree49fc7484d2234e5c5a4171f3155da54f617a8c8e
Import libtext-bibtex-perl_0.85.orig.tar.gz
[dgit import orig libtext-bibtex-perl_0.85.orig.tar.gz]
-rw-r--r--Build.PL121
-rw-r--r--Changes327
-rw-r--r--MANIFEST156
-rw-r--r--MANIFEST.SKIP99
-rw-r--r--META.json109
-rw-r--r--META.yml73
-rw-r--r--README69
-rw-r--r--README.OLD168
-rw-r--r--THANKS15
-rw-r--r--btool_faq.pod121
-rw-r--r--btparse/doc/bt_format_names.pod297
-rw-r--r--btparse/doc/bt_input.pod175
-rw-r--r--btparse/doc/bt_language.pod277
-rw-r--r--btparse/doc/bt_macros.pod152
-rw-r--r--btparse/doc/bt_misc.pod147
-rw-r--r--btparse/doc/bt_post_processing.pod249
-rw-r--r--btparse/doc/bt_postprocess.pod265
-rw-r--r--btparse/doc/bt_split_names.pod280
-rw-r--r--btparse/doc/bt_traversal.pod181
-rw-r--r--btparse/doc/btparse.pod542
-rw-r--r--btparse/pccts/antlr.h625
-rw-r--r--btparse/pccts/ast.c283
-rw-r--r--btparse/pccts/ast.h115
-rw-r--r--btparse/pccts/config.h230
-rw-r--r--btparse/pccts/dlgauto.h474
-rw-r--r--btparse/pccts/dlgdef.h124
-rw-r--r--btparse/pccts/err.h872
-rw-r--r--btparse/progs/args.c108
-rw-r--r--btparse/progs/args.h34
-rw-r--r--btparse/progs/biblex.c87
-rw-r--r--btparse/progs/bibparse.c311
-rw-r--r--btparse/progs/dumpnames.c154
-rw-r--r--btparse/progs/getopt.c813
-rw-r--r--btparse/progs/getopt.h132
-rw-r--r--btparse/progs/getopt1.c187
-rw-r--r--btparse/src/attrib.h35
-rw-r--r--btparse/src/bibtex.c351
-rw-r--r--btparse/src/bibtex.g413
-rw-r--r--btparse/src/bibtex_ast.c63
-rw-r--r--btparse/src/bibtex_ast.h21
-rw-r--r--btparse/src/bt_config.h.in115
-rw-r--r--btparse/src/bt_debug.h38
-rw-r--r--btparse/src/btparse.h341
-rw-r--r--btparse/src/err.c75
-rw-r--r--btparse/src/error.c350
-rw-r--r--btparse/src/error.h65
-rw-r--r--btparse/src/file_header.c20
-rw-r--r--btparse/src/format_name.c974
-rw-r--r--btparse/src/function_header.c12
-rw-r--r--btparse/src/init.c42
-rw-r--r--btparse/src/input.c515
-rw-r--r--btparse/src/lex_auxiliary.c942
-rw-r--r--btparse/src/lex_auxiliary.h71
-rw-r--r--btparse/src/line_offsets.c91
-rw-r--r--btparse/src/line_offsets.h13
-rw-r--r--btparse/src/macros.c369
-rw-r--r--btparse/src/mode.h3
-rw-r--r--btparse/src/modify.c75
-rw-r--r--btparse/src/my_alloca.h35
-rw-r--r--btparse/src/my_dmalloc.h17
-rw-r--r--btparse/src/names.c916
-rw-r--r--btparse/src/parse_auxiliary.c391
-rw-r--r--btparse/src/parse_auxiliary.h32
-rw-r--r--btparse/src/parser.dlg215
-rw-r--r--btparse/src/postprocess.c500
-rw-r--r--btparse/src/prototypes.h50
-rw-r--r--btparse/src/scan.c615
-rw-r--r--btparse/src/stdpccts.h32
-rw-r--r--btparse/src/string_util.c698
-rw-r--r--btparse/src/sym.c376
-rw-r--r--btparse/src/sym.h48
-rw-r--r--btparse/src/tex_tree.c414
-rw-r--r--btparse/src/tokens.h80
-rw-r--r--btparse/src/traversal.c187
-rw-r--r--btparse/src/util.c1180
-rw-r--r--btparse/src/util.h7
-rw-r--r--btparse/tests/case_test.c50
-rw-r--r--btparse/tests/data/TESTS7
-rw-r--r--btparse/tests/data/commas.bib21
-rw-r--r--btparse/tests/data/comment.bib6
-rw-r--r--btparse/tests/data/empty.bib0
-rw-r--r--btparse/tests/data/foreign.bib12
-rw-r--r--btparse/tests/data/macro.bib2
-rw-r--r--btparse/tests/data/names17
-rw-r--r--btparse/tests/data/preamble.bib2
-rw-r--r--btparse/tests/data/purify.strings50
-rw-r--r--btparse/tests/data/regular.bib8
-rw-r--r--btparse/tests/data/simple.bib18
-rw-r--r--btparse/tests/macro_test.c90
-rw-r--r--btparse/tests/name_test.c155
-rw-r--r--btparse/tests/namebug.c28
-rw-r--r--btparse/tests/postprocess_test.c40
-rw-r--r--btparse/tests/purify_test.c37
-rw-r--r--btparse/tests/read_test.c84
-rw-r--r--btparse/tests/simple_test.c596
-rw-r--r--btparse/tests/testlib.c43
-rw-r--r--btparse/tests/testlib.h46
-rw-r--r--btparse/tests/tex_test.c42
-rwxr-xr-xexamples/append_entries78
-rw-r--r--inc/MyBuilder.pm403
-rw-r--r--lib/Text/BibTeX.pm809
-rw-r--r--lib/Text/BibTeX/Bib.pm476
-rw-r--r--lib/Text/BibTeX/BibFormat.pm500
-rw-r--r--lib/Text/BibTeX/BibSort.pm245
-rw-r--r--lib/Text/BibTeX/Entry.pm1087
-rw-r--r--lib/Text/BibTeX/File.pm265
-rw-r--r--lib/Text/BibTeX/Name.pm426
-rw-r--r--lib/Text/BibTeX/NameFormat.pm325
-rw-r--r--lib/Text/BibTeX/Structure.pm1206
-rw-r--r--lib/Text/BibTeX/Value.pm333
-rwxr-xr-xscripts/btcheck31
-rwxr-xr-xscripts/btformat128
-rwxr-xr-xscripts/btsort33
-rw-r--r--t/00_system_info.t12
-rw-r--r--t/bib.t157
-rw-r--r--t/common.pl68
-rw-r--r--t/corpora.bib264
-rw-r--r--t/errors.bib7
-rw-r--r--t/from_file.t82
-rw-r--r--t/macro.t135
-rw-r--r--t/modify.t88
-rw-r--r--t/nameformat.t197
-rw-r--r--t/namelist.t58
-rw-r--r--t/names.t138
-rw-r--r--t/output.t96
-rw-r--r--t/parse.t52
-rw-r--r--t/parse_f.t87
-rw-r--r--t/parse_s.t119
-rw-r--r--t/purify.t142
-rw-r--r--t/split_names28
-rw-r--r--t/unlimited.bib3
-rw-r--r--t/unlimited.t29
-rw-r--r--typemap31
-rw-r--r--xscode/BibTeX.xs572
-rw-r--r--xscode/btxs_support.c488
-rw-r--r--xscode/btxs_support.h45
136 files changed, 29756 insertions, 0 deletions
diff --git a/Build.PL b/Build.PL
new file mode 100644
index 0000000..ab9ddba
--- /dev/null
+++ b/Build.PL
@@ -0,0 +1,121 @@
+use strict;
+use warnings;
+
+use lib 'inc';
+use MyBuilder;
+use Config;
+use File::Spec::Functions qw.catdir catfile.;
+use File::Copy;
+
+
+# Distribution version, parsed out of lib/Text/BibTeX.pm by get_version().
+my $version = get_version();
+
+# People credited as authors in the generated metadata.
+my @authors =
+  ( 'Alberto Simões <ambs@cpan.org>', 'Greg Ward <gward@python.net>' );
+
+# Prerequisites, one hash per phase.
+my %configure_prereqs = (
+    'Module::Build'        => '0.36',
+    'Config::AutoConf'     => '0.16',
+    'ExtUtils::LibBuilder' => '0.02',
+);
+my %runtime_prereqs = (
+    'Scalar::Util'       => '1.42',
+    'Unicode::Normalize' => '0',
+    'Encode'             => '0',
+);
+my %build_prereqs = (
+    'File::Copy'           => '0',
+    'Config::AutoConf'     => '0.16',
+    'ExtUtils::LibBuilder' => '0.02',
+    'Capture::Tiny'        => '0.06',
+    'ExtUtils::CBuilder'   => '0.27',
+    'Module::Build'        => '0.3603',
+    'Cwd'                  => '0',
+);
+
+# Paths and glob patterns handed to add_to_cleanup: distribution archives,
+# shared libraries, object files, and the compiled helper and test programs.
+# btparse/src/bt_config.h is intentionally not listed (upstream: "not sure yet").
+my @cleanup_targets = qw(
+  Text-BibTeX-*
+  btparse/src/*.so
+  btparse/src/*.dylib
+  btparse/src/*.dll
+  btparse/src/*.o
+  xscode/*.o
+  btparse/tests/*.o
+  btparse/progs/*.o
+  btparse/progs/dumpnames
+  btparse/progs/bibparse
+  btparse/progs/biblex
+  btparse/tests/postprocess_test
+  btparse/tests/read_test
+  btparse/tests/simple_test
+  btparse/tests/macro_test
+  btparse/tests/case_test
+  btparse/tests/name_test
+  btparse/tests/purify_test
+);
+
+# The builder object; MyBuilder comes from inc/MyBuilder.pm.
+my $builder = MyBuilder->new(
+    module_name    => 'Text::BibTeX',
+    license        => 'perl',
+    dist_author    => \@authors,
+    needs_compiler => 1,
+    meta_merge     => {
+        resources => { repository => 'http://github.com/ambs/Text-BibTeX' },
+    },
+    configure_requires => \%configure_prereqs,
+    requires           => \%runtime_prereqs,
+    build_requires     => \%build_prereqs,
+    add_to_cleanup     => \@cleanup_targets,
+);
+
+## HACK HACK HACK HACK -- work out $libdir, the install location for the 'usrlib' element, starting from the "bin" install destination.
+my $libdir = $builder->install_destination("bin");
+if ( $^O =~ /mswin32/i ) {    # Windows: the "bin" destination is discarded and a folder from PATH is used instead
+ $libdir = undef;
+ # Each PATH folder is probed in order by copying MANIFEST into it under a throwaway name.
+ # Find a place where we can write.
+ my @folders = split /;/, $ENV{PATH};
+ my $installed = 0;    # set to 1 once a probe copy is found on disk
+ my $target = "text-bibtex.$$";    # probe file name; $$ is the current process id
+ while ( @folders && !$installed ) {
+ $libdir = shift @folders;
+ # The return value of copy() is ignored; the -f test on the next line decides success.
+ copy( "MANIFEST", catfile( $libdir, $target ) );
+ $installed = 1 if -f catfile( $libdir, $target );
+ }
+ if ( !$installed ) {    # NOTE(review): $libdir is then the last folder tried (undef if PATH gave no folders)
+ warn("Wasn't able to find a suitable place for libbtparse.dll!");
+ }
+ else {
+ print STDERR "libbtparse.dll will be installed in $libdir\n";
+ unlink catfile( $libdir, $target );    # remove the probe file again
+ }
+}
+else {
+ if ( $Config{archname} =~ /^x86_64|^ppc64|^s390x|^aarch64/ ) {    # architectures for which a lib64 directory is tried first
+ $libdir =~ s/\bbin\b/lib64/;    # without /g only the first whole-word "bin" in the path is rewritten
+ if ( !-d $libdir ) {    # lib64 does not exist: switch to the lib variant, but only if that directory exists
+ my $test = $libdir;
+ $test =~ s/lib64/lib/;
+ $libdir = $test if -d $test;
+ }
+ }
+ else {
+ $libdir =~ s/\bbin\b/lib/;    # every other architecture: plain lib next to bin
+ }
+}
+$builder->notes( 'btparse_version' => $version );    # recorded as a build note; presumably read by inc/MyBuilder.pm (not shown here)
+$builder->notes( 'lib_path' => $libdir );    # same mechanism, for the chosen library directory
+$builder->add_build_element('usrlib');    # registers a custom 'usrlib' build element
+$builder->install_path( 'usrlib' => $libdir );    # and installs that element into $libdir
+
+$builder->create_build_script;    # writes the Build script
+
+# get_version()
+#
+# Scans lib/Text/BibTeX.pm (relative to the current directory) for the first
+# line of the form
+#     our $VERSION = '...';
+# and returns the single-quoted string.
+#
+# Dies if the file cannot be opened, or if no such line is found. A version
+# string that is false in Perl terms (such as '0') also counts as not found.
+sub get_version {
+    my $pm_file = 'lib/Text/BibTeX.pm';
+
+    # Three-argument open on a lexical handle, so no package-global PM
+    # handle is left behind and the mode cannot be affected by the name.
+    open my $pm_fh, '<', $pm_file
+      or die "Cannot open '$pm_file' for reading: $!\n";
+
+    my $version;
+
+    # A named loop variable keeps the caller's $_ untouched.
+    while ( my $line = <$pm_fh> ) {
+        if ( $line =~ m!^our\s+\$VERSION\s*=\s*'([^']+)'! ) {
+            $version = $1;
+            last;
+        }
+    }
+    close $pm_fh;
+
+    die "Could not find VERSION on your .pm file. Weirdo!\n" unless $version;
+
+    # Explicit return: previously the result was only the implicit value of
+    # the final statement's condition.
+    return $version;
+}
+
diff --git a/Changes b/Changes
new file mode 100644
index 0000000..0558653
--- /dev/null
+++ b/Changes
@@ -0,0 +1,327 @@
+Revision history for Perl module Text::BibTeX
+
+0.85 2017-08-31
+ * FreeBSD includes a definition of strlcat, so no need to redefine it.
+
+0.84 2017-08-31
+ * Further buffer overflow fixes.
+ * Spellchecking fixes by Julián Moreno Patiño, Lucas Kanashiro, and
+ Gregor Herrmann (debian community)
+
+0.83 2017-08-28
+ * Remove unnecessary dependency on YAML.
+ * Fix further buffer overflow situations.
+
+0.82 2017-08-27
+ * Fix buffer overflow (thanks to Hamid Ebadi).
+ * Hide error messages on tests, and use them for testing purposes.
+
+0.81 2017-07-19
+ * Fix issue with NameFormat and uninitialized join-tokens.
+ (thanks to Karl Wette for the bug report).
+
+0.80 2017-03-25
+ * Fix tests in order to work without dot in @INC (thanks Kent Fredric for the bug report)
+
+0.79 2017-03-13
+ * Further fixes to allow the parse of multiple files (Karl Wette).
+
+0.78 2017-01-10
+ * Fixed some issues with uninitialized arrays and s390
+ * Fixed test with fileno (thanks to Karl Wette).
+ * Allow state of btparse parser to be reset, for parsing multiple files (Karl Wette):
+ - bt_parse_entry(): reset parser state if infile == NULL
+ - BibTeX.xs: add _reset_parse(), _reset_parse_s() methods to Text::BibTeX::Entry
+ - Text::BibTeX::Entry: allow new() or parse() with undefined filehandle; calls _reset_parse()
+ - Text::BibTeX::Entry: allow new() or parse_s() with undefined text; calls _reset_parse_s()
+ - Text::BibTeX::File: close() calls Text::BibTeX::Entry->new($filename, undef) to reset parser
+
+0.77 2016-09-20
+ * Fixes for testing and installing on Darwin (install_name issues).
+ Thanks to Nuno "smash" Carvalho for the report and debug help.
+
+0.76 2016-07-06
+ * Added 'reset_macros' option to Text::BibTeX::File, in order
+ to remove all defined macros (except months)
+
+0.76_02 2016-07-05
+ * Fix issue with binmode not being copied in Clone method.
+ * Make month abbreviations available always, and not only when
+ using Text::BibTeX::Bib.
+ * Added docs to supported options for Text::BibTeX::Entry.
+
+0.76_01 2016-07-04
+ * Solved nasty bug when using lvalues as parameters (substr).
+ * Added tests.
+
+0.75 2016-07-03
+ * Stable version with bytes/utf-8 support.
+
+0.75_05 2016-07-02
+ * Get 5.8.x back aboard;
+
+0.75_04 2016-07-01
+ * Fixed reference to empty function name;
+
+0.75_03 2016-06-30
+ * Rename split_list to isplit_list, and created split_list wrapper;
+ * Added normalization option;
+
+0.75_02 2016-06-25
+ * Minor fix for some perl version parsing problems.
+
+0.75_01 2016-06-24
+ * Added binmode option. Should allow unicode handling directly.
+
+0.74 2016-06-15
+ * Get perl 5.8.x back.
+
+0.73 2016-06-14
+ * Change some documentation in order to use ->new method, instead
+ of older 'new Class()' approach;
+ * Add sensible default values to Text::BibTeX::NameFormat to reduce
+ the amount of segmentation faults for users forgetting arguments.
+
+0.72 2016-04-19
+ * Added clone() method to Text::BibTeX::Entry
+
+0.71 2015-05-28
+ * Fix segmentation fault when btparse fails parsing a long
+ entry. Thanks to Dale Evans.
+ * Stop using UNIVERSAL (5.21.3 requirement).
+ Thanks to Jitka Plesnikova
+
+0.70 2014-09-01
+ * Added metadata for metacpan.
+
+0.69 2013-02-27
+ * Fixes in the distribution, namely adding versions to unversioned packages.
+
+0.68 2013-02-27
+ * Replacement for islower() which understands all Unicode 6.2.0
+ chars with "LOWERCASE" property. Now we can detect prefixes
+ not just in ASCII ... this also seems to have fixed the strange
+ windows XP test failures, probably due to undefined islower(). See RT #92864
+ * Fixed strange problem with Solaris coredumping due to 0.67 changes.
+
+0.67 2013-02-20
+ * Reformatted changelog (thanks to Sergey Romanov)
+ * Remove accents from C source file for Sun compiler probs.
+ * Add 'gnu' as a system name.
+ * Fixes for UTF-8 handling of combining marks
+
+0.66 2012-10-29
+ * Fix a segmentation fault with strcat and no string limit.
+
+0.65 2012-09-26
+ * Patch to support @ and ~ in names (thanks to Philip Kime)
+
+0.64 2012-07-08
+ * Patch to expand macro size limit (thanks to Philip Kime)
+
+0.63 2012-05-12
+ * Patch on what to consider whitespaces (thanks to Philip Kime)
+
+0.62 2012-01-11
+ * Releasing stable
+
+0.62_01 2012-01-07
+ * Better installation under Windows, and specifically, Win 7.
+
+0.61 2011-10-20
+ * Thanks to Philip Kime (again), we have STDERR working on Windows
+ (hopefully)
+ * Make Windows tests no longer ignored.
+
+0.60 2011-07-31
+ * Fix test in t/output.t
+
+0.59 2011-07-28
+ * Add comma at the end of each and any line in BibTeX record
+ as all modern parsers support them.
+ * Rewrote some tests in t/output.t
+
+0.58 2011-06-21
+ * Remove some old documentation about btparse.
+ * Warn users when installing on non standard library path.
+
+0.57 2011-06-17
+ * Mark two tests that fail from time to time as TODO for now.
+
+0.56 2011-06-08
+ * Use File::Temp instead of POSIX. Fixes some Win32 builds.
+ Thanks to Alex Gough for reporting this build problem.
+
+0.55 2011-04-25
+ * Added some extra checks for lib64 dir.
+ * Rewrote some code on MyBuilder.pm
+ * Add soname to library build.
+
+0.54 2011-04-17
+ * Applied patch to work under lib64 archs (hopefully)
+ (Guillaume Rousse)
+
+0.53 2011-03-10
+ * Working under Cygwin (Philip Kime)
+ * Changed Windows testing mode (based on Cygwin approach)
+ - getting rid of 00_ and zz_ test files. Great!
+
+0.52 2011-02-15
+ * More name abbreviation tweaks (Philip Kime)
+
+0.51 2011-02-01
+ * Fixed environment during Linux/Unix testing
+
+0.50 2011-02-01
+ * Fixed name abbreviation when name has hyphens (Philip Kime)
+
+0.49 2011-01-27
+ * Small patch by Philip Kime to enlarge macros buffer size.
+ * Fixed Build.PL to clean some object files.
+
+0.48 2010-09-23
+ * depend on ExtUtils::LibBuilder for library compilation.
+
+0.47 2010-09-18
+ * fixed a problem with 160 char being considered whitespace on many unixes.
+ Thanks, again, to Philip Kime :)
+
+0.46 2010-08-24
+ * support entry keys in unicode. Thanks to Philip Kime.
+
+0.45 2010-06-08
+ * fixed ldconfig call under linux, now only if the user is root.
+ * fixed library installation path when user specifies different
+ installbase during install action.
+ (thanks to François for both fixes)
+
+0.44 2010-05-09
+ * RPath information on link - Thanks to Jens Rehsack
+ * removed dependency on 'UNIVERSAL' as it is now built-in
+ (change for perl 5.12 deprecation)
+
+0.43 2010-03-18
+ * small fix on warnings issue
+ * run ldconfig after installing the .so file under linux
+
+0.42 2010-03-16
+ * names with more than two commas are left untouched if protected by
+ braces.
+ * can now deal with names with hyphens and generate the correct BibTeX
+ compatible abbreviations.
+ * can generate initials for protected name parts, like BibTeX can.
+ * can deal with generating initials for names which contains Unicode
+ characters.
+ * fixed compilation on machines needing alloca.h
+ * added new README file with more up-to-date information.
+
+0.41 2010-03-14
+ * tested under Solaris and FreeBSD.
+ * fixed compilation/installation under Mac OS X.
+ * fixed some C code on format-security (thanks to Jerome Quelin).
+ * fixed build system to check for dependencies.
+ * added patch from Philip Kime to fix names with commas handling.
+
+0.40 2010-03-07
+ * this release is very similar to 0.40_3, but now indexable by cpan.
+
+0.40_3 2010-02-18
+ * fixed Build.PL with correct build_requirements;
+ * fixed test to use still not installed library;
+
+0.40_2 2010-02-15
+ * merged btparse library into Text::BibTeX code. I am sorry for all
+ other languages that might be using this library. For them my
+ suggestion is that they make Text::BibTeX as a dependency.
+ * ported compilation and configuration tools from ExtUtils::MakeMaker
+ to Module::Build.
+ * back on track, trying to make Text::BibTeX work and compile easily
+ on main platforms (sorry, for main platforms I assume Linux, Mac OS
+ X and Windows running Strawberry Perl).
+ * Thanks to Philip Kime for continuous poke so I work on this!!
+ Created a THANKS file for this purpose.
+
+0.38 2008-03-08
+ * fixed problem when creating an empty Entry.
+ Thanks to Frank Wiegand.
+
+0.37 2006-09-21
+ * added support for 'conference' key (alias for inproceedings)
+
+0.36 2005-07-24
+ * fixed url where btparse should be downloaded from. (stupid bug)
+
+0.35 2005-07-23
+ * changed from 0.33 to 0.35 to maintain relation with btparse.
+ * some tests now use Test::More (more tests will change soon)
+ * the way the module initializes changed. Now the module can be
+ included to check its version.
+
+0.33 2000-03-22
+ * fixed long-standing bug with import/inheritance code; should fix the
+ mysterious "method redefined" and "can't locate object method" errors
+ that various people have seen on various platforms over the years.
+ Thanks to Nikolay Pelov <Nikolay.Pelov@cs.kuleuven.ac.be> for
+ fixing the bug!
+ * fixed some small bugs spotted by Horst Tellioglu
+ <telliogl@h2hobel.phl.univie.ac.at> that messed up creating a brand-
+ new entry from scratch
+ * fixed even smaller bug spotted by Horst Tellioglu in the BibTeX 0.99
+ emulation code
+
+0.32 1999-11-28
+ * made Makefile.PL able to download and configure btparse, instead
+ of leaving it up to the user
+ * fixed entry output method (print_s) so strings are wrapped in braces,
+ not quotes -- that way we don't generate bogus BibTeX files if there
+ are quotes at top-level in a string
+
+0.31 1999-10-28
+ * fixed small bug in Text::BibTeX::BibFormat
+ * better documentation for Text::BibTeX::Name example
+ * better adherence to POD standard
+
+0.30 1999-03-12
+ * the "structure module" system is in place -- lets you write classes
+ analogous to BibTeX style files, but with all the advantages of
+ object-oriented programming in Perl (see Text::BibTeX::Structure
+ man page)
+ * the Bib structure, meant to emulate the standard style files of
+ BibTeX 0.99, is partially complete: can generate sort keys for
+ any of the 13 standard entry types, and format 'article' and 'book'
+ entries -- no support for crossrefs or alphabetic labels yet though
+ * can now (optionally) access the data in more "raw" form (i.e. with
+ macros not expanded, strings not concatenated, whitespace not collapsed)
+ (see Text::BibTeX::Value man page)
+ * moved support for parsing individual names to the Text::BibTeX::Name class
+ * added support for name-formatting via Text::BibTeX::NameFormat class
+ (parallels work in btparse -- see also bt_format_names man page)
+ * added BibTeX-style string purification
+ * added three example programs: btcheck, btsort, and btformat (btformat
+ is *very* preliminary!)
+ * lots of documentation added/revised in the existing modules
+
+0.21 1997-10-20
+ * companion to btparse 0.21 -- mainly due to bug fixes and one
+ minor interface change (bt_cite_key -> bt_entry_key) in btparse
+ * documentation/portability/warning fixes
+ * fixed XS code to not make an accidental second "strip quote
+ characters" pass
+ * fixed Entry 'print_s' method to handle all entry metatypes
+ * changed Entry 'delete' method to handle a list of fields
+ * started introducing changes to support the new way of
+ doing 'structure modules' -- nothing documented yet, though
+ * deprecated old Structure module
+
+0.2 1997-09-08
+ * fixed a bunch of little memory leaks in the btparse C library
+ * rationalized the Text::BibTeX::Entry method names
+ * added (incomplete) Text::BibTeX::Structure module
+ * completely overhauled the parser and the interface to it;
+ this necessitated many small changes to BibTeX.xs
+ * greatly expanded the test suite and fixed a few little bugs
+ found in the process
+ * fixed the XS code to handle comment and preamble entries
+
+0.1 1997-03-08
+ * initial release
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..1f4628d
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,156 @@
+MANIFEST
+THANKS
+README
+Changes
+META.yml
+
+typemap
+inc/MyBuilder.pm
+Build.PL
+
+btool_faq.pod
+
+scripts/btcheck
+scripts/btsort
+scripts/btformat
+
+xscode/BibTeX.xs
+xscode/btxs_support.c
+xscode/btxs_support.h
+
+lib/Text/BibTeX.pm
+lib/Text/BibTeX/File.pm
+lib/Text/BibTeX/Entry.pm
+lib/Text/BibTeX/Value.pm
+lib/Text/BibTeX/Structure.pm
+lib/Text/BibTeX/Name.pm
+lib/Text/BibTeX/NameFormat.pm
+lib/Text/BibTeX/Bib.pm
+lib/Text/BibTeX/BibFormat.pm
+lib/Text/BibTeX/BibSort.pm
+
+t/00_system_info.t
+t/common.pl
+t/bib.t
+t/macro.t
+t/modify.t
+t/nameformat.t
+t/namelist.t
+t/names.t
+t/output.t
+t/parse.t
+t/parse_f.t
+t/parse_s.t
+t/purify.t
+t/split_names
+t/unlimited.bib
+t/unlimited.t
+t/corpora.bib
+t/errors.bib
+t/from_file.t
+
+examples/append_entries
+
+## files included from btparse/src
+btparse/pccts/antlr.h
+btparse/pccts/ast.c
+btparse/pccts/ast.h
+btparse/pccts/config.h
+btparse/pccts/dlgauto.h
+btparse/pccts/dlgdef.h
+btparse/pccts/err.h
+
+## btparse internals documentation
+btparse/doc/bt_format_names.pod
+btparse/doc/bt_input.pod
+btparse/doc/bt_language.pod
+btparse/doc/bt_macros.pod
+btparse/doc/bt_misc.pod
+btparse/doc/bt_post_processing.pod
+btparse/doc/bt_postprocess.pod
+btparse/doc/bt_split_names.pod
+btparse/doc/bt_traversal.pod
+btparse/doc/btparse.pod
+
+## btparse source files
+btparse/src/bibtex.c
+btparse/src/bibtex_ast.c
+btparse/src/err.c
+btparse/src/error.c
+btparse/src/file_header.c
+btparse/src/format_name.c
+btparse/src/function_header.c
+btparse/src/init.c
+btparse/src/input.c
+btparse/src/lex_auxiliary.c
+btparse/src/line_offsets.c
+btparse/src/macros.c
+btparse/src/modify.c
+btparse/src/names.c
+btparse/src/parse_auxiliary.c
+btparse/src/postprocess.c
+btparse/src/scan.c
+btparse/src/string_util.c
+btparse/src/sym.c
+btparse/src/tex_tree.c
+btparse/src/traversal.c
+btparse/src/util.c
+btparse/src/attrib.h
+btparse/src/bibtex_ast.h
+btparse/src/bt_debug.h
+btparse/src/btparse.h
+btparse/src/error.h
+btparse/src/lex_auxiliary.h
+btparse/src/line_offsets.h
+btparse/src/mode.h
+btparse/src/my_alloca.h
+btparse/src/my_dmalloc.h
+btparse/src/parse_auxiliary.h
+btparse/src/prototypes.h
+btparse/src/stdpccts.h
+btparse/src/sym.h
+btparse/src/tokens.h
+btparse/src/util.h
+
+btparse/src/parser.dlg
+btparse/src/bibtex.g
+
+## Extra C programs
+btparse/progs/args.c
+btparse/progs/args.h ## NOINST
+btparse/progs/biblex.c
+btparse/progs/bibparse.c
+btparse/progs/dumpnames.c
+btparse/progs/getopt.c
+btparse/progs/getopt.h ## NOINST
+btparse/progs/getopt1.c
+
+## Test code.
+btparse/tests/case_test.c
+btparse/tests/macro_test.c
+btparse/tests/name_test.c
+btparse/tests/namebug.c
+btparse/tests/postprocess_test.c
+btparse/tests/purify_test.c
+btparse/tests/read_test.c
+btparse/tests/simple_test.c
+btparse/tests/testlib.c
+btparse/tests/tex_test.c
+btparse/tests/data/TESTS
+btparse/tests/data/commas.bib
+btparse/tests/data/comment.bib
+btparse/tests/data/empty.bib
+btparse/tests/data/foreign.bib
+btparse/tests/data/macro.bib
+btparse/tests/data/names
+btparse/tests/data/preamble.bib
+btparse/tests/data/purify.strings
+btparse/tests/data/regular.bib
+btparse/tests/data/simple.bib
+btparse/tests/testlib.h
+
+btparse/src/bt_config.h.in
+
+README.OLD
+META.json
+MANIFEST.SKIP
diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP
new file mode 100644
index 0000000..8375821
--- /dev/null
+++ b/MANIFEST.SKIP
@@ -0,0 +1,99 @@
+
+#!start included /opt/perl/lib/5.18.0/ExtUtils/MANIFEST.SKIP
+# Avoid version control files.
+\bRCS\b
+\bCVS\b
+\bSCCS\b
+,v$
+\B\.svn\b
+\B\.git\b
+\B\.gitignore\b
+\b_darcs\b
+\B\.cvsignore$
+
+# Avoid VMS specific MakeMaker generated files
+\bDescrip.MMS$
+\bDESCRIP.MMS$
+\bdescrip.mms$
+
+# Avoid Makemaker generated and utility files.
+\bMANIFEST\.bak
+\bMakefile$
+\bblib/
+\bMakeMaker-\d
+\bpm_to_blib\.ts$
+\bpm_to_blib$
+\bblibdirs\.ts$ # 6.18 through 6.25 generated this
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\b_build/
+\bBuild.bat$
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+
+# Avoid temp and backup files.
+~$
+\.old$
+\#$
+\b\.#
+\.bak$
+\.tmp$
+\.#
+\.rej$
+
+# Avoid OS-specific files/dirs
+# Mac OSX metadata
+\B\.DS_Store
+# Mac OSX SMB mount metadata files
+\B\._
+
+# Avoid Devel::Cover and Devel::CoverX::Covered files.
+\bcover_db\b
+\bcovered\b
+
+# Avoid MYMETA files
+^MYMETA\.
+#!end included /opt/perl/lib/5.18.0/ExtUtils/MANIFEST.SKIP
+
+# Avoid configuration metadata file
+^MYMETA\.
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\bBuild.bat$
+\b_build
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+^MANIFEST\.SKIP
+
+# Avoid archives of this distribution
+\bText-BibTeX-[\d\.\_]+
+
+.*\.o$
+
+
+btparse/BUGS
+btparse/ChangeLog
+btparse/COPYING
+btparse/progs/biblex
+btparse/progs/bibparse
+btparse/progs/dumpnames
+btparse/README
+btparse/src/bt_config.h
+btparse/src/libbtparse.dylib
+btparse/tests/macro_test
+btparse/tests/name_test
+btparse/tests/postprocess_test
+btparse/tests/purify_test
+btparse/tests/read_test
+btparse/tests/simple_test
+btparse/tests/tex_test
+btparse/thoughts
+btparse/TODO
+config.log
+crash.bib
+Other/pccts133mr.zip
+xscode/BibTeX.c
diff --git a/META.json b/META.json
new file mode 100644
index 0000000..3d076a9
--- /dev/null
+++ b/META.json
@@ -0,0 +1,109 @@
+{
+ "abstract" : "interface to read and parse BibTeX files",
+ "author" : [
+ "Alberto Simões <ambs@cpan.org>",
+ "Greg Ward <gward@python.net>"
+ ],
+ "dynamic_config" : 1,
+ "generated_by" : "Module::Build version 0.4224",
+ "license" : [
+ "perl_5"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : 2
+ },
+ "name" : "Text-BibTeX",
+ "prereqs" : {
+ "build" : {
+ "requires" : {
+ "Capture::Tiny" : "0.06",
+ "Config::AutoConf" : "0.16",
+ "Cwd" : "0",
+ "ExtUtils::CBuilder" : "0.27",
+ "ExtUtils::LibBuilder" : "0.02",
+ "File::Copy" : "0",
+ "Module::Build" : "0.3603"
+ }
+ },
+ "configure" : {
+ "requires" : {
+ "Config::AutoConf" : "0.16",
+ "ExtUtils::LibBuilder" : "0.02",
+ "Module::Build" : "0.36"
+ }
+ },
+ "runtime" : {
+ "requires" : {
+ "Encode" : "0",
+ "Scalar::Util" : "1.42",
+ "Unicode::Normalize" : "0"
+ }
+ }
+ },
+ "provides" : {
+ "Text::BibTeX" : {
+ "file" : "lib/Text/BibTeX.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::BibEntry" : {
+ "file" : "lib/Text/BibTeX/Bib.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::BibFormat" : {
+ "file" : "lib/Text/BibTeX/BibFormat.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::BibSort" : {
+ "file" : "lib/Text/BibTeX/BibSort.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::BibStructure" : {
+ "file" : "lib/Text/BibTeX/Bib.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::Entry" : {
+ "file" : "lib/Text/BibTeX/Entry.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::File" : {
+ "file" : "lib/Text/BibTeX/File.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::Name" : {
+ "file" : "lib/Text/BibTeX/Name.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::NameFormat" : {
+ "file" : "lib/Text/BibTeX/NameFormat.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::SimpleValue" : {
+ "file" : "lib/Text/BibTeX/Value.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::Structure" : {
+ "file" : "lib/Text/BibTeX/Structure.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::StructuredEntry" : {
+ "file" : "lib/Text/BibTeX/Structure.pm",
+ "version" : "0.85"
+ },
+ "Text::BibTeX::Value" : {
+ "file" : "lib/Text/BibTeX/Value.pm",
+ "version" : "0.85"
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "license" : [
+ "http://dev.perl.org/licenses/"
+ ],
+ "repository" : {
+ "url" : "http://github.com/ambs/Text-BibTeX"
+ }
+ },
+ "version" : "0.85",
+ "x_serialization_backend" : "JSON::PP version 2.94"
+}
diff --git a/META.yml b/META.yml
new file mode 100644
index 0000000..d388caf
--- /dev/null
+++ b/META.yml
@@ -0,0 +1,73 @@
+---
+abstract: 'interface to read and parse BibTeX files'
+author:
+ - 'Alberto Simões <ambs@cpan.org>'
+ - 'Greg Ward <gward@python.net>'
+build_requires:
+ Capture::Tiny: '0.06'
+ Config::AutoConf: '0.16'
+ Cwd: '0'
+ ExtUtils::CBuilder: '0.27'
+ ExtUtils::LibBuilder: '0.02'
+ File::Copy: '0'
+ Module::Build: '0.3603'
+configure_requires:
+ Config::AutoConf: '0.16'
+ ExtUtils::LibBuilder: '0.02'
+ Module::Build: '0.36'
+dynamic_config: 1
+generated_by: 'Module::Build version 0.4224, CPAN::Meta::Converter version 2.150010'
+license: perl
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: '1.4'
+name: Text-BibTeX
+provides:
+ Text::BibTeX:
+ file: lib/Text/BibTeX.pm
+ version: '0.85'
+ Text::BibTeX::BibEntry:
+ file: lib/Text/BibTeX/Bib.pm
+ version: '0.85'
+ Text::BibTeX::BibFormat:
+ file: lib/Text/BibTeX/BibFormat.pm
+ version: '0.85'
+ Text::BibTeX::BibSort:
+ file: lib/Text/BibTeX/BibSort.pm
+ version: '0.85'
+ Text::BibTeX::BibStructure:
+ file: lib/Text/BibTeX/Bib.pm
+ version: '0.85'
+ Text::BibTeX::Entry:
+ file: lib/Text/BibTeX/Entry.pm
+ version: '0.85'
+ Text::BibTeX::File:
+ file: lib/Text/BibTeX/File.pm
+ version: '0.85'
+ Text::BibTeX::Name:
+ file: lib/Text/BibTeX/Name.pm
+ version: '0.85'
+ Text::BibTeX::NameFormat:
+ file: lib/Text/BibTeX/NameFormat.pm
+ version: '0.85'
+ Text::BibTeX::SimpleValue:
+ file: lib/Text/BibTeX/Value.pm
+ version: '0.85'
+ Text::BibTeX::Structure:
+ file: lib/Text/BibTeX/Structure.pm
+ version: '0.85'
+ Text::BibTeX::StructuredEntry:
+ file: lib/Text/BibTeX/Structure.pm
+ version: '0.85'
+ Text::BibTeX::Value:
+ file: lib/Text/BibTeX/Value.pm
+ version: '0.85'
+requires:
+ Encode: '0'
+ Scalar::Util: '1.42'
+ Unicode::Normalize: '0'
+resources:
+ license: http://dev.perl.org/licenses/
+ repository: http://github.com/ambs/Text-BibTeX
+version: '0.85'
+x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff --git a/README b/README
new file mode 100644
index 0000000..ee6d78e
--- /dev/null
+++ b/README
@@ -0,0 +1,69 @@
+
+ Text::BibTeX
+---------------------------------------------------------------------------
+
+ Text::BibTeX is a Perl library for reading, parsing, and processing
+ BibTeX files. It is the Perl half of btOOL, a pair of libraries for
+ dealing with BibTeX data.
+
+ Text::BibTeX gives you access to the data at many different levels:
+ you may work with BibTeX entries as simple "field -> string"
+ mappings, or get at the original form of the data as a list of
+ simple values (strings, macros, or numbers) pasted together. You
+ can choose not to impose any restrictions on the allowed/expected
+ entry types or fields, or you can use the structure defined by
+ BibTeX 0.99's standard style files, or you can invent your own.
+
+ The library is copiously documented. After installing the module,
+ see the Text::BibTeX man page for a comprehensive introduction to
+ the system. If you wish to dive straight in without regards for
+ niceties such as splitting and formatting names, defining or
+ imposing database structures, getting access to unprocessed field
+ values, and other nifty features, then you can probably get away
+ with just reading the Text::BibTeX::Entry man page.
+ Comments/criticism of the documentation are welcome.
+
+
+ In addition to the man pages embedded in each module (and available
+ after installation), Greg Ward has written a technical report
+ describing btOOL, with the btparse and Text::BibTeX documentation
+ included as appendices. The whole report is just over 100 pages,
+ around 45 of which make up the Text::BibTeX documentation (the
+ btparse documentation is a further 30 pages). You can find it at
+ the btOOL home page:
+
+ http://www.gerg.ca/software/btOOL/
+
+
+ INSTALLATION
+---------------------------------------------------------------------------
+
+ To install Text::BibTeX you need a recent Perl, an ANSI-compliant C
+ compiler and a bunch of Perl modules.
+
+ The build system changed with version 0.40, and has been tested on
+ several platforms: Linux, Mac OS X, Solaris and Windows.
+ Unfortunately, on Windows only the Strawberry Perl distribution
+ has worked.
+
+ To build the module use the usual set of commands for a
+ Module::Build distribution:
+
+ perl Build.PL
+ ./Build
+ ./Build test
+
+ And then, as super user, install it
+
+ ./Build install
+
+ Note that if your Perl is installed in a non standard path you can
+ end up with libbtparse library in the wrong place. If the tests pass
+ successfully, but after installing you can not issue
+
+ perl -MText::BibTeX
+
+ without an error message, then probably libbtparse is not reachable
+ by the library loader. In this case you can either copy the library
+ to the proper place or add the path to the library in the
+ LD_LIBRARY_PATH environment variable.
diff --git a/README.OLD b/README.OLD
new file mode 100644
index 0000000..f827c4d
--- /dev/null
+++ b/README.OLD
@@ -0,0 +1,168 @@
+ Text::BibTeX
+ version 0.34
+ Greg Ward (gward@python.net)
+ 22 December, 2000
+
+INTRODUCTION
+------------
+
+Text::BibTeX is a Perl library for reading, parsing, and processing
+BibTeX files. It is the Perl half of btOOL, a pair of libraries for
+dealing with BibTeX data; the C half, btparse, is needed to compile
+Text::BibTeX (see below).
+
+Text::BibTeX gives you access to the data at many different levels: you
+may work with BibTeX entries as simple "field -> string" mappings, or
+get at the original form of the data as a list of simple values
+(strings, macros, or numbers) pasted together. You can choose not to
+impose any restrictions on the allowed/expected entry types or fields,
+or you can use the structure defined by BibTeX 0.99's standard style
+files, or you can invent your own.
+
+The library is copiously documented. After installing the module, see
+the Text::BibTeX man page for a comprehensive introduction to the
+system. If you wish to dive straight in without regards for niceties
+such as splitting and formatting names, defining or imposing database
+structures, getting access to unprocessed field values, and other nifty
+features, then you can probably get away with just reading the
+Text::BibTeX::Entry man page. Comments/criticism of the documentation
+are welcome.
+
+In addition to the man pages embedded in each module (and available
+after installation), I have written a technical report describing btOOL,
+with the btparse and Text::BibTeX documentation included as appendices.
+The whole report is just over 100 pages, around 45 of which make up the
+Text::BibTeX documentation (the btparse documentation is a further 30
+pages). You can find it at the btOOL home page:
+
+ http://starship.python.net/~gward/btOOL/
+
+I may also make the btparse and Text::BibTeX manuals available as
+separate PostScript files, but they aren't there as of this writing.
+
+
+DEPENDENCIES
+------------
+
+Text::BibTeX requires Perl 5.004 or later and an ANSI-compliant C
+compiler.
+
+You must also have the btparse distribution, the C library underlying
+Text::BibTeX. If you didn't download btparse with Text::BibTeX, don't
+sweat it; the Makefile.PL is smart enough to go out and find it for you
+(using your preferred CPAN mirror if you have set one up,
+http://www.cpan.org/ otherwise). If you did download btparse, just put
+it in the same directory as the Text::BibTeX tarball. Text::BibTeX's
+Makefile.PL will unpack and configure it for you.
+
+
+BUILDING
+--------
+
+Start by generating the Makefile and other preparatory steps (most notably,
+preparing the btparse package for building).
+
+ perl Makefile.PL
+
+This goes to some lengths to ensure that btparse, the companion C library,
+is found and available. It will configure btparse for you, so you
+shouldn't have to do anything in the 'btparse' directory unless things go
+wrong there.
+
+Note that if Perl was built with a non-ANSI C compiler, you will need to
+specify an ANSI compiler when running Makefile.PL. For instance, if you're
+running SunOS and Perl was built with 'cc' (non-ANSI), you will need to
+build Text::BibTeX (and btparse as well, for that matter) with an ANSI
+compiler such as 'gcc' or Sun's 'acc'. This can be done for Text::BibTeX
+as follows:
+
+ perl Makefile.PL CC=gcc # or acc, or whatever works on your system
+
+However, this is fraught with danger and not recommended. The only safe
+thing to do is to build Perl with an ANSI-compliant compiler. This will
+probably save you trouble with other extensions that require an ANSI C
+compiler.
+
+Now, build everything:
+
+ make
+
+This takes care of building btparse as well as the Text::BibTeX modules, so
+again you don't have to go grubbing around in the btparse directory unless
+things go wrong.
+
+Optionally, you can test and install btparse:
+
+ cd btparse
+ make test
+ make install
+ cd ..
+
+(You do not have to install btparse to use Text::BibTeX; that's only needed
+if you plan to do C programming using btparse. See btparse/README for
+details.)
+
+Finally, test and install the Text::BibTeX Perl modules:
+
+ make test
+ make install
+
+Note that you will need to be root on most systems in order to install
+either btparse or Text::BibTeX. Consult the 'perlmodinstall' man page (as
+of Perl 5.005 or later) for more information on building and installing
+Perl module distributions.
+
+Please let me know if anything goes wrong with building either btparse
+or Text::BibTeX.
+
+
+BUGS AND LIMITATIONS
+--------------------
+
+See the btparse(3) and Text::BibTeX(3) man pages for details. Executive
+summary:
+
+ * Text::BibTeX is not thread-safe, due to a heavy dependence on
+ global variables in the parser and lexical scanner components
+ of btparse
+
+ * for the same reason, you can't have multiple files open and
+ being parsed at the same time; attempting to do so is an
+ invalid use of btparse
+
+ * Text::BibTeX cannot be used with an sfio-based Perl, because
+ the Perl I/O API is apparently not yet up to the task of replacing
+ stdio in third-party C libraries
+
+
+AVAILABILITY
+------------
+
+You can find the latest version of Text::BibTeX, as well as
+documentation, information on known bugs, etc. at the btOOL web site:
+
+ http://starship.python.net/~gward/btOOL/
+
+Here you will find HTML versions of the documentation and technical
+report describing the project, links to download the code, and whatever
+other goodies I can come up with over time.
+
+
+CREDITS
+-------
+
+For spotting bugs and sometimes even providing patches:
+
+ Dirk Vleugels <vleugels@do.isst.fhg.de>
+ Kjetil Kjernsmo <kjetil.kjernsmo@astro.uio.no>
+ Andrew Cassin <acassin@cs.mu.oz.au>
+ Thomas Kamphusmann <thomas@delphi.ping.de>
+ Horst Tellioglu <telliogl@h2hobel.phl.univie.ac.at>
+ Nikolay Pelov <Nikolay.Pelov@cs.kuleuven.ac.be>
+
+...and probably a few others. Please send me email if you feel you
+belong on this list and I forgot to include you.
+
+Also, see the btparse README file for contributors to the C library.
+
+$Id$
diff --git a/THANKS b/THANKS
new file mode 100644
index 0000000..ddc9341
--- /dev/null
+++ b/THANKS
@@ -0,0 +1,15 @@
+
+ ****************
+****** Text::BibTeX ******************************************************
+ ****************
+
+Alberto would like to thank you:
+
+* Greg Ward for his work on btparse and Text::BibTeX. He is not
+ maintaining this code anymore but without his work none of these
+ software packages would be possible.
+
+* Philip Kime for his continuous support on my work when porting from
+ 0.38 to 0.40
+
+* Jens Rehsack and Jerome Quelin for building patches.
diff --git a/btool_faq.pod b/btool_faq.pod
new file mode 100644
index 0000000..4310811
--- /dev/null
+++ b/btool_faq.pod
@@ -0,0 +1,121 @@
+=head1 NAME
+
+btool_faq - Frequently-Asked Questions about btparse and Text::BibTeX
+
+=head1 DESCRIPTION
+
+This document attempts to address questions that I have been asked
+several times, and are easy to answer -- but not by perusing the
+documentation. For various reasons, the answers tend to be thinly
+distributed across several man pages, making it difficult to figure out
+what's going on. Hence, this man page will attempt to tie together
+various strands of thought, providing quick, focused, "How do I do X?"
+answers as opposed to lengthy descriptions of the capabilities and
+conventions of the btOOL libraries.
+
+=head1 PERL LIBRARY
+
+This section covers questions that users of C<Text::BibTeX>, the Perl
+component of B<btOOL>, have asked.
+
+=head2 Why aren't the BibTeX "month" macros defined?
+
+Because they're bibliography-specific, and C<Text::BibTeX> by default
+doesn't impose any assumptions about a particular type of database or
+data-processing domain on your entries. The problem arises when you
+parse entries from a file, say F<foo.bib>, that quite sensibly use the
+month macros (C<jan>, C<feb>, etc.) provided by the BibTeX standard
+style files:
+
+ $bibfile = Text::BibTeX::File->new('foo.bib') # open file
+ or die "foo.bib: $!\n";
+ $entry = Text::BibTeX::Entry->new($bibfile); # parse first entry
+
+Using this code, you might get an "undefined macro" warning for every
+entry parsed from F<foo.bib>. Apart from the superficial annoyance of
+all those warning messages, the undefined macros are expanded as empty
+strings, meaning you lose any information about them---not good.
+
+You could always kludge it and forcibly define the month macros
+yourself. Prior to release 0.30, this had to be done by parsing a set
+of fake entries, but now C<Text::BibTeX> provides a direct interface to
+the underlying macro table. You I<could> just do this before parsing any
+entries:
+
+ use Text::BibTeX qw(:macrosubs);
+ # ...
+ my %month = (jan => 'January', feb => 'February', ... );
+ add_macro_text ($macro, $value)
+ while (($macro, $value) = each %month);
+
+But there's a better way that's more in keeping with how things are done
+under BibTeX (where default macros are defined in the style file): use
+C<Text::BibTeX>'s object-oriented analogue to style files, called
+structure modules. C<Text::BibTeX> provides a structure module,
+C<Text::BibTeX::Bib>, that (partially) emulates the standard style files
+of BibTeX 0.99, including the definition of month macros. Structure
+modules are specified on a per-file basis by using the C<set_structure>
+method on a C<Text::BibTeX::File> object. It's quite simple to tell
+C<Text::BibTeX> that entries from C<$bibfile> are expected to conform to
+the C<Bib> structure (which is implemented by the C<Text::BibTeX::Bib>
+module, but you don't really need to know that):
+
+ $bibfile = Text::BibTeX::File->new('foo.bib')
+ or die "foo.bib: $!\n";
+ $bibfile->set_structure ('Bib');
+
+You probably shouldn't hardcode the name of a particular structure in
+your programs, though, as there will eventually be a multitude of
+structure modules to choose from (just as there are a multitude of
+BibTeX style files to choose from). My preferred approach is to make
+the structure a command-line option which defaults to C<Bib> (since
+that's the only structure actually implemented as of this writing).
+
+=head2 How do I append to a BibTeX file?
+
+Just open it in append mode, and write entries to it as usual.
+Remember, a C<Text::BibTeX::File> object is mainly a wrapper around an
+C<IO::File> object, and the C<Text::BibTeX::File::open> method (and thus
+C<new> as well) is just a front-end to C<IO::File::open>.
+C<IO::File::open>, in turn, is a front-end either to Perl's builtin
+C<open> (if called with one argument) or C<sysopen> (two or three
+arguments). To save you the trouble of going off and reading all those
+man pages, here's the trick: if you pass just a filename to
+C<Text::BibTeX::File>'s C<new> method, then it's treated just like a
+filename passed to Perl's builtin C<open>:
+
+ my $append_file = Text::BibTeX::File->new(">>$filename")
+ or die "couldn't open $filename for appending: $!\n";
+
+opens C<$filename> for appending. If, later on, you have an entry from
+another file (say C<$entry>), then you can append it to C<$append_file>
+by just writing it as usual:
+
+ $entry->write ($append_file);
+
+See C<append_entries> in the F<examples/> subdirectory of the
+C<Text::BibTeX> distribution for a complete example.
+
+=head1 C LIBRARY
+
+This section covers frequently-asked questions about B<btparse>, the C
+component of B<btOOL>.
+
+=head2 Is there a Python binding for B<btparse> yet?
+
+Not that I know of. I haven't written one. If you do so, please let me
+know about it.
+
+=head1 SEE ALSO
+
+L<btparse>, L<Text::BibTeX>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/btparse/doc/bt_format_names.pod b/btparse/doc/bt_format_names.pod
new file mode 100644
index 0000000..b257040
--- /dev/null
+++ b/btparse/doc/bt_format_names.pod
@@ -0,0 +1,297 @@
+=head1 NAME
+
+bt_format_names - formatting BibTeX names for consistent output
+
+=head1 SYNOPSIS
+
+ bt_name_format * bt_create_name_format (char * parts,
+ boolean abbrev_first);
+ void bt_free_name_format (bt_name_format * format);
+ void bt_set_format_text (bt_name_format * format,
+ bt_namepart part,
+ char * pre_part,
+ char * post_part,
+ char * pre_token,
+ char * post_token);
+ void bt_set_format_options (bt_name_format * format,
+ bt_namepart part,
+ boolean abbrev,
+ bt_joinmethod join_tokens,
+ bt_joinmethod join_part);
+ char * bt_format_name (bt_name * name, bt_name_format * format);
+
+=head1 DESCRIPTION
+
+After splitting a name into its component parts (represented as a
+C<bt_name> structure), you often want to put it back together again as a
+single string in a consistent way. B<btparse> provides a very flexible
+way to do this, generally in two stages: first, you create a "name
+format" which describes how to put the tokens and parts of any name back
+together, and then you apply the format to a particular name.
+
+The "name format" is encapsulated in a C<bt_name_format> structure,
+which is created with C<bt_create_name_format()>. This function
+includes some clever trickery that means you can usually get away with
+calling it alone, and not need to do any customization of the format.
+If you do need to customize the format, though, C<bt_set_format_text()>
+and C<bt_set_format_options()> provide that capability.
+
+The format controls the following:
+
+=over 4
+
+=item *
+
+which name parts are printed, and in what order (e.g. "first von last
+jr", or "von last jr first")
+
+=item *
+
+the text that precedes and follows each part (e.g. if the first name
+follows the last name, you probably want a comma before the `first'
+part: "Smith, John" rather than "Smith John")
+
+=item *
+
+the text that precedes and follows each token (e.g. if the first name is
+abbreviated, you may want a period after each token: "J. R. Smith"
+rather than "J R Smith")
+
+=item *
+
+the method used to join the tokens of each part together
+
+=item *
+
+the method used to join each part to the following part
+
+=back
+
+All of these except the list of parts to format are kept in arrays
+indexed by name part: for example, the structure has a field
+
+ char * post_token[BT_MAX_NAMEPARTS]
+
+and C<post_token[BTN_FIRST]> (C<BTN_FIRST> is from the C<bt_namepart>
+C<enum>) is the string to be added after each token in the first
+name---for example, C<"."> if the first name is to be abbreviated in the
+conventional way.
+
+Yet another C<enum>, C<bt_joinmethod>, describes the available methods
+for joining tokens together. Note that there are I<two> sets of join
+methods in a name format: between tokens within a single part, and
+between the tokens of two different parts. The first allows you, for
+example, to change C<"J R Smith"> (first name abbreviated with no
+post-token text but tokens joined by a space) to C<"JR Smith"> (the
+same, but first-name tokens jammed together). The second is mainly used
+to ensure that "von" and "last" name-parts may be joined with a tie:
+C<"de~Roche"> rather than C<"de Roche">.
+
+The token join methods are:
+
+=over 4
+
+=item BTJ_MAYTIE
+
+Insert a "discretionary tie" between tokens. That is, either a space or
+a "tie" is inserted, depending on context. (A "tie," otherwise known as
+unbreakable space, is currently hard-coded as C<"~">---from TeX.)
+
+The format is then applied to a particular name by C<bt_format_name()>,
+which returns a new string.
+
+=item BTJ_SPACE
+
+Always insert a space between tokens.
+
+=item BTJ_FORCETIE
+
+Always insert a "tie" (C<"~">) between tokens.
+
+=item BTJ_NOTHING
+
+Insert nothing between tokens---just jam them together.
+
+=back
+
+Tokens are joined together, and thus the choice of whether to insert a
+"discretionary tie" is made, at two places: within a part and between
+two parts. Naturally, this only applies when C<BTJ_MAYTIE> was supplied
+as the token-join method; C<BTJ_SPACE> and C<BTJ_FORCETIE> always insert
+either a space or tie, and C<BTJ_NOTHING> always adds nothing between
+tokens. Within a part, ties are added after the first token if it is
+less than three characters long, and before the last token. Between
+parts, a tie is added only if the preceding part consisted of a single
+token that was less than three characters long. In all other cases,
+spaces are inserted. (This implementation slavishly follows BibTeX.)
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item bt_create_name_format()
+
+ bt_name_format * bt_create_name_format (char * parts,
+ boolean abbrev_first)
+
+Creates a name format for a given set of parts, with variations for the
+most common forms of customization---the order of parts and whether to
+abbreviate the first name.
+
+The C<parts> parameter specifies which parts to include in a formatted
+name, as well as the order in which to format them. C<parts> must be a
+string of four or fewer characters, each of which denotes one of the
+four name parts: for instance, C<"vljf"> means to format all four parts
+in "von last jr first" order. No characters outside of the set
+C<"fvlj"> are allowed, and no characters may be repeated.
+C<abbrev_first> controls whether the `first' part will be abbreviated
+(i.e., only the first letter from each token will be printed).
+
+In addition to simply setting the list of parts to format and the
+"abbreviate" flag for the first name, C<bt_create_name_format()>
+initializes the entire format structure so as to minimize the need for
+further customizations:
+
+=over 4
+
+=item *
+
+The "token join method"---what to insert between tokens of the same
+part---is set to C<BTJ_MAYTIE> (discretionary tie) for all parts
+
+=item *
+
+The "part join method"---what to insert after the final token of a
+particular part, assuming there are more parts to come---is set to
+C<BTJ_SPACE> for the `first', `last', and `jr' parts. If the `von' part
+is present and immediately precedes the `last' part (which will almost
+always be the case), C<BTJ_MAYTIE> is used to join `von' to `last';
+otherwise, `von' also gets C<BTJ_SPACE> for the inter-part join method.
+
+=item *
+
+The abbreviation flag is set to C<FALSE> for the `von', `last', and `jr'
+parts; for `first', the abbreviation flag is set to whatever you pass in
+as C<abbrev_first>.
+
+=item *
+
+Initially, all "surrounding text" (pre-part, post-part, pre-token, and
+post-token) for all parts is set to the empty string. Then a few tweaks
+are done, depending on the C<abbrev_first> flag and the order of
+tokens. First, if C<abbrev_first> is C<TRUE>, the post-token text for
+first name is set to C<".">---this changes C<"J R Smith"> to
+C<"J. R. Smith">, which is usually the desired form. (If you I<don't>
+want the periods, you'll have to set the post-token text yourself with
+C<bt_set_format_text()>.)
+
+Then, if `jr' is present and immediately after `last' (almost always the
+case), the pre-part text for `jr' is set to C<", ">, and the inter-part
+join method for `last' is set to C<BTJ_NOTHING>. This changes
+C<"John Smith Jr"> (where the space following C<"Smith"> comes from
+formatting the last name with a C<BTJ_SPACE> inter-part join method) to
+C<"John Smith, Jr"> (where the C<", "> is now associated with
+C<"Jr">---that way, if there is no `jr' part, the C<", "> will
+not be printed.)
+
+Finally, if `first' is present and immediately follows either `jr' or
+`last' (which will usually be the case in "last-name first" formats),
+the same sort of trickery is applied: the pre-part text for `first' is
+set to C<", ">, and the part join method for the preceding part (either
+`jr' or `last') is set to C<BTJ_NOTHING>.
+
+=back
+
+While all these rules are rather complicated, they mean that you are
+usually freed from having to do any customization of the name format.
+Certainly this is the case if you only need C<"fvlj"> and C<"vljf"> part
+orders, only want to abbreviate the first name, want periods after
+abbreviated tokens, non-breaking spaces in the "right" places, and
+commas in the conventional places.
+
+If you want something out of the ordinary---for instance, abbreviated
+tokens jammed together with no punctuation, or abbreviated last
+names---you'll need to customize the name format a bit with
+C<bt_set_format_text()> and C<bt_set_format_options()>.
+
+=item bt_free_name_format()
+
+ void bt_free_name_format (bt_name_format * format)
+
+Frees a name format created by C<bt_create_name_format()>.
+
+=item bt_set_format_text()
+
+ void bt_set_format_text (bt_name_format * format,
+ bt_namepart part,
+ char * pre_part,
+ char * post_part,
+ char * pre_token,
+ char * post_token)
+
+Allows you to customize some or all of the surrounding text for a single
+name part. Supply C<NULL> for any chunk of text that you don't want to
+change.
+
+For instance, say you want a name format that will abbreviate first
+names, but without any punctuation after the abbreviated
+tokens. You could create and customize the format as follows:
+
+ format = bt_create_name_format ("fvlj", TRUE);
+ bt_set_format_text (format,
+ BTN_FIRST, /* name-part to customize */
+ NULL, NULL, /* pre- and post- part text */
+ NULL, ""); /* empty string for post-token */
+
+Without the C<bt_set_format_text()> call, C<format> would result in
+names formatted like C<"J. R. Smith">. After setting the post-token
+text for first names to C<"">, this name would become C<"J R Smith">.
+
+=item bt_set_format_options()
+
+ void bt_set_format_options (bt_name_format * format,
+ bt_namepart part,
+ boolean abbrev,
+ bt_joinmethod join_tokens,
+ bt_joinmethod join_part)
+
+Allows further customization of a name format: you can set the
+abbreviation flag and the two token-join methods. Alas, there is no
+mechanism for leaving a value unchanged; you must set everything with
+C<bt_set_format_options()>.
+
+For example, let's say that just dropping periods from abbreviated
+tokens in the first name isn't enough; you I<really> want to save
+space by jamming the abbreviated tokens together: C<"JR Smith"> rather
+than C<"J R Smith">. Assuming the two calls in the above example have
+been done, the following will finish the job:
+
+ bt_set_format_options (format, BTN_FIRST,
+ TRUE, /* keep same value for abbrev flag */
+ BTJ_NOTHING, /* jam tokens together */
+ BTJ_SPACE); /* space after final token of part */
+
+Note that we unfortunately had to know (and supply) the current values
+for the abbreviation flag and post-part join method, even though we were
+only setting the intra-part join method.
+
+=item bt_format_name()
+
+ char * bt_format_name (bt_name * name, bt_name_format * format)
+
+Once a name format has been created and customized to your heart's
+content, you can use it to format any number of names that have been
+split with C<bt_split_name> (see L<bt_split_names>). Simply pass the
+name structure and name format structure, and a newly-allocated string
+containing the formatted name will be returned to you. It is your
+responsibility to C<free()> this string.
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>, L<bt_split_names>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_input.pod b/btparse/doc/bt_input.pod
new file mode 100644
index 0000000..2cf6fa0
--- /dev/null
+++ b/btparse/doc/bt_input.pod
@@ -0,0 +1,175 @@
+=head1 NAME
+
+bt_input - input/parsing functions in B<btparse> library
+
+=head1 SYNOPSIS
+
+ void bt_set_stringopts (bt_metatype_t metatype, btshort options);
+ AST * bt_parse_entry_s (char * entry_text,
+ char * filename,
+ int line,
+ btshort options,
+ boolean * status);
+ AST * bt_parse_entry (FILE * infile,
+ char * filename,
+ btshort options,
+ boolean * status);
+ AST * bt_parse_file (char * filename,
+ btshort options,
+ boolean * overall_status);
+
+
+=head1 DESCRIPTION
+
+The functions described here are used to read and parse BibTeX data,
+converting it from raw text to abstract-syntax trees (ASTs).
+
+=over 4
+
+=item bt_set_stringopts ()
+
+ void bt_set_stringopts (bt_metatype_t metatype, btshort options);
+
+Set the string-processing options for a particular entry metatype. This
+affects the entry post-processing done by C<bt_parse_entry_s()>,
+C<bt_parse_entry()>, and C<bt_parse_file()>. If C<bt_set_stringopts()>
+is never called, the four metatypes default to the following sets of
+string options:
+
+ BTE_REGULAR BTO_CONVERT | BTO_EXPAND | BTO_PASTE | BTO_COLLAPSE
+ BTE_COMMENT 0
+ BTE_PREAMBLE 0
+ BTE_MACRODEF BTO_CONVERT | BTO_EXPAND | BTO_PASTE
+
+For example,
+
+ bt_set_stringopts (BTE_COMMENT, BTO_COLLAPSE);
+
+will cause the library to collapse whitespace in the value from all
+comment entries; the AST returned by one of the C<bt_parse_*> functions
+will reflect this change.
+
+=item bt_parse_entry ()
+
+ AST * bt_parse_entry (FILE * infile,
+ char * filename,
+ btshort options,
+ boolean * status);
+
+Scans and parses the next BibTeX entry in C<infile>. You should supply
+C<filename> to help B<btparse> generate accurate error messages; the
+library keeps track of C<infile>'s current line number internally, so you
+don't need to pass that in. C<options> should be a bitmap of
+non-string-processing options (currently, C<BTO_NOSTORE> to disable storing
+macro expansions is the only such option). C<*status> will be set to
+C<TRUE> if the entry parsed successfully or with only minor warnings, and
+C<FALSE> if there were any serious lexical or syntactic errors. If
+C<status> is C<NULL>, then the parse status will be unavailable to you.
+Both minor warnings and serious errors are reported on C<stderr>.
+
+Returns a pointer to the abstract-syntax tree (AST) describing the entry
+just parsed, or C<NULL> if no more entries were found in C<infile> (this
+will leave C<infile> at end-of-file). Do not attempt to second guess
+C<bt_parse_entry()> by detecting end-of-file yourself; it must be allowed
+to determine this on its own so it can clean up some static data that is
+preserved between calls on the same file.
+
+C<bt_parse_entry()> has two important restrictions that you should know
+about. First, you should let B<btparse> manage all the input on the
+file; this is for reasons both superficial (so the library knows the
+current line number in order to generate accurate error messages) and
+fundamental (the library must be allowed to detect end-of-file in order
+to clean up certain static variables and allow you to parse another
+file). Second, you cannot interleave the parsing of two different
+files; attempting to do so will result in a fatal error that will crash
+your program. This is a direct result of the static state maintained
+between calls of C<bt_parse_entry()>.
+
+Because of two distinct "failures" possible for C<bt_parse_entry()>
+(end-of-file, which is expected but means to stop processing the current
+file; and error-in-input, which is not expected but allows you to
+continue processing the same file), you should usually call it like
+this:
+
+ while (entry = bt_parse_entry (file, filename, options, &ok))
+ {
+ if (ok)
+ {
+ /* ... process entry ... */
+ }
+ }
+
+At the end of this loop, C<feof (file)> will be true.
+
+=item bt_parse_entry_s ()
+
+ AST * bt_parse_entry_s (char * entry_text,
+ char * filename,
+ int line,
+ btshort options,
+ boolean * status)
+
+Scans and parses a single complete BibTeX entry contained in a string,
+C<entry_text>. If you read this string from a file, you should help
+B<btparse> generate accurate error messages by supplying the name of the
+file as C<filename> and the line number of the beginning of the entry as
+C<line>; otherwise, set C<filename> to C<NULL> and C<line> to C<1>.
+C<options> and C<status> are the same as for C<bt_parse_entry()>.
+
+Returns a pointer to the abstract-syntax tree (AST) describing the entry
+just parsed, or C<NULL> if no entries were found in C<entry_text> or if
+C<entry_text> was C<NULL>.
+
+You should call C<bt_parse_entry_s()> once more than the total number of
+entries you wish to parse; on the final call, set C<entry_text> to
+C<NULL> so the function knows there's no more text to parse. This final
+call allows it to clean up some structures allocated on the first call.
+Thus, C<bt_parse_entry_s()> is usually used like this:
+
+ char * entry_text;
+ btshort options = 0;
+ boolean ok;
+ AST * entry_ast;
+
+ while (entry_text = get_more_text ())
+ {
+ entry_ast = bt_parse_entry_s (entry_text, NULL, 1, options, &ok);
+ if (ok)
+ {
+ /* ... process entry ... */
+ }
+ }
+
+ bt_parse_entry_s (NULL, NULL, 1, options, NULL); /* cleanup */
+
+assuming that C<get_more_text()> returns a pointer to the text of an
+entry to parse, or C<NULL> if there's no more text available.
+
+=item bt_parse_file ()
+
+ AST * bt_parse_file (char * filename,
+ btshort options,
+ boolean * status)
+
+Scans and parses an entire BibTeX file. If C<filename> is C<NULL> or
+C<"-">, then C<stdin> will be read; otherwise, attempts to open the named
+file. If this attempt fails, prints an error message to C<stderr> and
+returns C<NULL>. C<options> and C<status> are the same as for
+C<bt_parse_entry()>---note that C<*status> will be C<FALSE> if there were
+I<any> errors in the entire file; for finer granularity of error-checking,
+you should use C<bt_parse_entry()>.
+
+Returns a pointer to a linked list of ASTs representing the entries in the
+file, or C<NULL> if no entries were found in the file. This list can
+be traversed with C<bt_next_entry()>, and the individual entries then
+traversed as usual (see L<bt_traversal>).
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>, L<bt_postprocess>, L<bt_traversal>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_language.pod b/btparse/doc/bt_language.pod
new file mode 100644
index 0000000..c7465ae
--- /dev/null
+++ b/btparse/doc/bt_language.pod
@@ -0,0 +1,277 @@
+=head1 NAME
+
+bt_language - the BibTeX data language, as recognized by B<btparse>
+
+=head1 SYNOPSIS
+
+ # Lexical grammar, mode 1: top-level
+ AT \@
+ NEWLINE \n
+ COMMENT \%~[\n]*\n
+ WHITESPACE [\ \r\t]+
+ JUNK ~[\@\n\ \r\t]+
+
+ # Lexical grammar, mode 2: in-entry
+ NEWLINE \n
+ COMMENT \%~[\n]*\n
+ WHITESPACE [\ \r\t]+
+ NUMBER [0-9]+
+ NAME [a-z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|]+
+ LBRACE \{
+ RBRACE \}
+ LPAREN \(
+ RPAREN \)
+ EQUALS =
+ HASH \#
+ COMMA ,
+ QUOTE \"
+
+ # Lexical grammar, mode 3: strings
+ # (very hairy -- see text)
+
+ # Syntactic grammar:
+ bibfile : ( entry )*
+
+ entry : AT NAME body
+
+ body : STRING # for comment entries
+ | ENTRY_OPEN contents ENTRY_CLOSE
+
+ contents : ( NAME | NUMBER ) COMMA fields # for regular entries
+ | fields # for macro definition entries
+ | value # for preamble entries
+
+ fields : field { COMMA fields }
+ |
+
+ field : NAME EQUALS value
+
+ value : simple_value ( HASH simple_value )*
+
+ simple_value : STRING
+ | NUMBER
+ | NAME
+
+
+=head1 DESCRIPTION
+
+One of the problems with BibTeX is that there is no formal specification
+of the language. This means that users exploring the arcane corners of
+the language are largely on their own, and programmers implementing
+their own parsers are completely on their own---except for observing the
+behaviour of the original implementation.
+
+Other parser implementors (Nelson Beebe of C<bibclean> fame, in
+particular) have taken the trouble to explain the language accepted by
+their parser, and in that spirit the following is presented.
+
+If you are unfamiliar with the arcana of regular and context-free
+languages, you will not have an easy time understanding this. This is
+I<not> an introduction to the BibTeX language; any LaTeX book would be
+more suitable for learning the data language itself.
+
+=head1 LEXICAL GRAMMAR
+
+The lexical scanner has three distinct modes: top-level, in-entry, and
+string. Roughly speaking, top-level is the initial mode; we enter
+in-entry mode on seeing an C<@> at top-level; and on seeing the C<}> or
+C<)> that ends the entry, we return to top-level. We enter string mode
+on seeing a C<"> or non-entry-delimiting C<{> from in-entry mode. Note
+that the lexical language is both non-regular (because braces must
+balance) and context-sensitive (because C<{> can mean different things
+depending on its syntactic context). That said, we will use regular
+expressions to describe the lexical elements, because they are the
+starting point used by the lexical scanner itself. The rest of the
+lexical grammar will be informally explained in the text.
+
+From top-level, the following tokens are recognized according to the
+regular expressions on the right:
+
+ AT \@
+ NEWLINE \n
+ COMMENT \%~[\n]*\n
+ WHITESPACE [\ \r\t]+
+ JUNK ~[\@\n\ \r\t]+
+
+(Note that this is PCCTS regular expression syntax, which should be
+fairly familiar to users of other regex engines. One oddity is that a
+character class is negated as C<~[...]> rather than C<[^...]>.)
+
+On seeing C<at> at top-level, we enter in-entry mode. Whitespace, junk,
+newlines, and comments are all skipped, with the latter two incrementing
+a line counter. (Junk is explicitly recognized to allow for C<bibtex>'s
+"implicit comment" scheme.)
+
+From in-entry mode, we recognize newline, comment, and whitespace
+identically to top-level mode. In addition, the following tokens are
+recognized:
+
+ NUMBER [0-9]+
+ NAME [a-z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|]+
+ LBRACE \{
+ RBRACE \}
+ LPAREN \(
+ RPAREN \)
+ EQUALS =
+ HASH \#
+ COMMA ,
+ QUOTE \"
+
+At this point, the lexical scanner starts to sound suspiciously like a
+context-free grammar, rather than a collection of independent regular
+expressions. However, it is necessary to keep this complexity in the
+scanner because certain characters (C<{> and C<(> in particular) have
+very different lexical meanings depending on the tokens that have
+preceded them in the input stream.
+
+In particular, C<{> and C<(> are treated as "entry openers" if they
+follow one C<at> and one C<name> token, unless the value of the C<name>
+token is C<"comment">. (Note the switch from top-level to in-entry
+between the two tokens.) In the C<@comment> case, the delimiter is
+considered as starting a string, and we enter string mode. Otherwise,
+the delimiter is saved, and when we see a corresponding C<}> or C<)> it
+is considered an "entry closer". (Braces are balanced for free here
+because the string lexer takes care of counting brace-depth.)
+
+Anywhere else, C<{> is considered as starting a string, and we enter
+string mode. C<"> always starts a string, regardless of context. The
+other tokens (C<name>, C<number>, C<equals>, C<hash>, and C<comma>) are
+recognized unconditionally.
+
+Note that C<name> is a catch-all token used for entry types, citation
+keys, field names, and macro names; because BibTeX has slightly
+different (largely undocumented) rules for these various elements, a bit
+of trickery is needed to make things work. As a starting point,
+consider BibTeX's definition of what's allowed for an entry key:
+a sequence of any characters I<except>
+
+ " # % ' ( ) , = { }
+
+plus space. There are a couple of problems with this scheme. First,
+without specifying the character set from which those "magic 10"
+characters are drawn, it's a bit hard to know just what is allowed.
+Second, allowing C<@> characters could lead to confusing BibTeX syntax
+(it doesn't confuse BibTeX, but it might confuse a human reader).
+Finally, allowing certain characters that are special to TeX means that
+BibTeX can generate bogus TeX code: try putting a backslash (C<\>) or
+tilde (C<~>) in a citation key. (This last exception is rather specific
+to the "generating (La)TeX code from a BibTeX database" application, but
+since that's the major application for BibTeX databases, then it will
+presumably be the major application for B<btparse>, at least initially.
+Thus, it makes sense to pay attention to this problem.)
+
+In B<btparse>, then, a name is defined as any sequence of letters,
+digits, underscores, and the following characters:
+
+ ! $ & * + - . / : ; < > ? [ ] ^ _ ` |
+
+This list was derived by removing BibTeX's "magic 10" from the set of
+printable 7-bit ASCII characters (32-126), and then further removing
+C<@>, C<\>, and C<~>. This means that B<btparse> disallows some of the
+weirder entry keys that BibTeX would accept, such as C<\foo@bar>, but
+still allows a string with initial digits. In fact, from the above
+definition it appears that B<btparse> would accept a string of all
+digits as a "name;" this is not the case, though, as the lexical scanner
+recognizes such a digit string as a number first. There are two
+problems here: BibTeX entry keys may in fact be entirely numeric, and
+field names may not begin with a digit. (Those are two of the
+not-so-obvious differences in BibTeX's handling of keys and field
+names.) The tricks used to deal with these problems are implemented in
+the parser rather than the lexical scanner, so are described in
+L<"SYNTACTIC GRAMMAR"> below.
+
+The string lexer recognizes C<lbrace>, C<rbrace>, C<lparen>, and
+C<rparen> tokens in order to count brace- or parenthesis-depth. This is
+necessary so it knows when to accept a string delimited by braces or
+parentheses. (Note that a parenthesis-delimited string is only allowed
+after C<@comment>---this is not a normal BibTeX construct.) In
+addition, it converts each non-space whitespace character (newline,
+carriage-return, and tab) to a single space. (Sequences of whitespace
+are not collapsed; that's the domain of string post-processing, which is
+well removed from the scanner or parser.) Finally, it accepts C<"> to
+delimit quote-delimited strings. Apart from those restrictions, the
+string lexer accepts anything up to the end-of-string delimiter.
+
+=head1 SYNTACTIC GRAMMAR
+
+(The language used to describe the grammar here is the extended
+Backus-Naur Form (EBNF) used by PCCTS. Terminals are represented by
+uppercase strings, non-terminals by lowercase strings; terminal names
+are the same as those given in the lexical grammar above. C<( foo )*>
+means zero or more repetitions of the C<foo> production, and C<{ foo }>
+means an optional C<foo>.)
+
+A file is just a sequence of zero or more entries:
+
+ bibfile : ( entry )*
+
+An entry is an at-sign, a name (the "entry type"), and the entry body:
+
+ entry : AT NAME body
+
+A body is either a string (this alternative is only tried if the entry
+type is C<"comment">) or the entry contents:
+
+ body : STRING # for comment entries
+ | ENTRY_OPEN contents ENTRY_CLOSE
+
+(C<ENTRY_OPEN> and C<ENTRY_CLOSE> are either C<{> and C<}> or C<(> and
+C<)>, depending what is seen in the input for a particular entry.)
+
+There are three possible productions for the "contents" non-terminal.
+Only one applies to any given entry, depending on the entry metatype
+(which in turn depends on the entry type). Currently, B<btparse>
+supports four entry metatypes: comment, preamble, macro definition, and
+regular. The first two correspond to C<@comment> and C<@preamble>
+entries; "macro definition" is for C<@string> entries; and "regular" is
+for all other entry types. (The library will be extended to handle
+C<@modify> and C<@alias> entry types, and corresponding "modify" and
+"alias" metatypes, when BibTeX 1.0 is released and the exact syntax is
+known.) The "metatype" concept is necessary so that all entry types
+that aren't specifically recognized fall into the "regular" metatype.
+It's also convenient not to have to C<strcmp> the entry type all the
+time.
+
+ contents : ( NAME | NUMBER ) COMMA fields # for regular entries
+ | fields # for macro definition entries
+ | value # for preamble entries
+
+Note that the entry key is not just a C<NAME>, but C<( NAME | NUMBER )>.
+This is necessary because BibTeX allows all-numeric entry keys, but
+B<btparse>'s lexical scanner recognizes such digit strings as C<NUMBER>
+tokens.
+
+C<fields> is a comma-separated list of fields, with an optional single
+trailing comma:
+
+ fields : field { COMMA fields }
+ |
+
+A C<field> is a single "field = value" assignment:
+
+ field : NAME EQUALS value
+
+Note that C<NAME> here is a restricted version of the "name" token
+described in L<"LEXICAL GRAMMAR"> above. Any "name" token will be
+accepted by the parser, but it is immediately checked to ensure that it
+doesn't begin with a digit; if it does, an artificial syntax error is
+triggered. (This is for compatibility with BibTeX, which doesn't allow
+field names to start with a digit.)
+
+A C<value> is a series of simple values joined by C<'#'> characters:
+
+ value : simple_value ( HASH simple_value )*
+
+A simple value is a string, number, or name (for macro invocations):
+
+ simple_value : STRING
+ | NUMBER
+ | NAME
+
+=head1 SEE ALSO
+
+L<btparse>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_macros.pod b/btparse/doc/bt_macros.pod
new file mode 100644
index 0000000..81df6a7
--- /dev/null
+++ b/btparse/doc/bt_macros.pod
@@ -0,0 +1,152 @@
+=head1 NAME
+
+bt_macros - accessing and manipulating the btparse macro table
+
+=head1 SYNOPSIS
+
+ void bt_add_macro_value (AST * assignment,
+ btshort options);
+ void bt_add_macro_text (char * macro,
+ char * text,
+ char * filename,
+ int line);
+
+ void bt_delete_macro (char * macro);
+ void bt_delete_all_macros (void);
+
+ int bt_macro_length (char *macro);
+ char * bt_macro_text (char * macro,
+ char * filename,
+ int line);
+
+=head1 DESCRIPTION
+
+B<btparse> maintains a single table of all macros (abbreviations)
+encountered while parsing BibTeX entries. It updates this table
+whenever it encounters a "macro definition" (C<@string>) entry, and
+refers to it whenever a macro is used in an entry and needs to be
+expanded. (Macros are not necessarily expanded on input, although this
+is the default. See L<bt_postprocess>.) Macro definitions are only
+cleared when B<btparse>'s global cleanup function, C<bt_cleanup()>, is
+called. Thus, unless you explicitly call C<bt_delete_macro()> or
+C<bt_delete_all_macros()>, macro definitions persist for as long as you
+use the library---usually, the lifetime of your process.
+
+=head1 FUNCTIONS
+
+You can use the following functions to add macros, delete them, and
+query their values---thus interfering with B<btparse>'s normal operation
+on the fly.
+
+=over 4
+
+=item bt_add_macro_text ()
+
+ void bt_add_macro_text (char * macro,
+ char * text,
+ char * filename,
+ int line);
+
+Defines a new macro, or redefines an old one. C<macro> is the name of
+the macro, and C<text> is the text it should expand to. C<filename> and
+C<line> are just used to generate any warnings about the macro
+definition; if they don't apply, specify C<NULL> for C<filename> and
+C<0> for C<line>. The only such warning occurs when you redefine an old
+macro: its value is overridden, and C<bt_add_macro_text()> issues a
+warning saying so.
+
+For instance, when parsing this macro definition entry:
+
+ @string{fubar = "Fouled Up Beyond All Recognition"}
+
+the library (in particular, the post-processing code called after an
+entry is successfully parsed) will ultimately do this:
+
+ bt_add_macro_text ("fubar", "Fouled Up Beyond All Recognition",
+ filename, line);
+
+This in turn will cause the macro C<fubar> to be expanded appropriately
+whenever the post-processing code sees it in any future entries.
+
+=item bt_add_macro_value ()
+
+ void bt_add_macro_value (AST * assignment,
+ btshort options);
+
+This function is mainly for internal use by the library, but it's
+available to you if you ever find yourself with a little bit of AST
+representing a macro definition, and you want to set the macro yourself
+(rather than letting the library's post-processing code take care of it
+for you). C<assignment> must be an AST node as returned by
+C<bt_next_field()>. Unlike most other B<btparse> functions that take an
+C<options> argument, C<options> here tells how the value in
+C<assignment> was post-processed. This is needed because macro values
+have to be processed in a special way to be valid in future expansions;
+if this one wasn't processed like that, C<bt_add_macro_value()> will do
+it for you. If you don't know how the value was post-processed, just
+supply C<0> for C<options>---that's guaranteed to describe something
+different from "the right way" for macros, so the post-processing will
+be done correctly.
+
+The processing done to macro values is mainly to ensure that we can get
+away with storing just a string in the macro table: macros invoked by
+the macro are themselves expanded, and all sub-strings are concatenated.
+For instance, if B<btparse> parses these entries:
+
+ @string{and = " and "}
+ @string{jim_n_bob = "James Smith" # and # "Bob Jones"}
+
+then the value stored for C<jim_n_bob> should obviously be the string
+C<"James Smith and Bob Jones">. To ensure this, B<btparse> has to
+process the value of C<and> differently from most BibTeX strings: in
+particular, whitespace is I<not> collapsed before the string is stored.
+That way, the correct value, C<" and ">, is interpolated into the value
+of C<jim_n_bob>. Thus, all macro values have sub-macros expanded and
+strings concatenated before they are stored, but whitespace is not
+collapsed until the macro is used in a regular entry.
+
+This function calls C<bt_add_macro_text()>, so the same proviso about
+redefining old macros applies---a warning will be issued, and the old
+value lost.
+
+=item bt_delete_macro ()
+
+ void bt_delete_macro (char * macro);
+
+Deletes a macro from the macro table. If C<macro> isn't defined,
+takes no action.
+
+=item bt_delete_all_macros ()
+
+ void bt_delete_all_macros (void);
+
+Deletes all macros from the macro table.
+
+=item bt_macro_length ()
+
+ int bt_macro_length (char *macro);
+
+Returns the length of a macro's expansion text. If the macro is
+undefined, returns 0; no warning is issued.
+
+=item bt_macro_text ()
+
+ char * bt_macro_text (char * macro,
+ char * filename,
+ int line);
+
+Returns the expansion text of a macro. If the macro is not defined,
+issues a warning and returns C<NULL>. C<filename> and C<line> are used
+for generating this warning; if they don't apply (i.e. you're not
+expanding the macro as a result of finding it in some file), supply
+C<NULL> for C<filename> and C<0> for C<line>.
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_misc.pod b/btparse/doc/bt_misc.pod
new file mode 100644
index 0000000..d053c74
--- /dev/null
+++ b/btparse/doc/bt_misc.pod
@@ -0,0 +1,147 @@
+=head1 NAME
+
+bt_misc - miscellaneous BibTeX-like string-processing utilities
+
+=head1 SYNOPSIS
+
+ void bt_purify_string (char * string, btshort options);
+ void bt_change_case (char transform, char * string, btshort options);
+
+=head1 DESCRIPTION
+
+=over 4
+
+=item bt_purify_string()
+
+ void bt_purify_string (char * string, btshort options);
+
+"Purifies" a C<string> in the BibTeX way (usually used for generating
+sort keys). C<string> is modified in-place. C<options> is currently
+unused; just set it to zero for future compatibility. Purification
+consists of copying alphanumeric characters, converting hyphens and ties
+to space, copying spaces, and skipping (almost) everything else.
+
+"Almost" because "special characters" (used for accented and non-English
+letters) are handled specially. Recall that a BibTeX special character
+is any brace-group that starts at brace-depth zero whose first character
+is a backslash. For instance, the string
+
+ {\foo bar}Herr M\"uller went from {P{\r r}erov} to {\AA}rhus
+
+contains two special characters: C<"{\foo bar}"> and C<"{\AA}">. Neither
+the C<\"u> nor the C<\r r> are special characters, because they are not
+at the right brace depth.
+
+Special characters are handled as follows: if the control sequence (the
+TeX command that follows the backslash) is recognized as one of LaTeX's
+"foreign letters" (C<\oe>, C<\ae>, C<\o>, C<\l>, C<\aa>, C<\ss>, plus
+uppercase versions), then it is converted to a reasonable English
+approximation by stripping the backslash and converting the second
+character (if any) to lowercase; thus, C<{\AA}> in the above example
+would become simply C<Aa>. All other control sequences in a special
+character are stripped, as are all non-alphabetic characters.
+
+For example the above string, after "purification," becomes
+
+ barHerr Muller went from Pr rerov to Aarhus
+
+Obviously, something has gone wrong with the word C<P{\r r}erov> (a town
+in the Czech Republic). The accented `r' should be a special character,
+starting at brace-depth zero. If the original string were instead
+
+ {\foo bar}Herr M\"uller went from P{\r r}erov to {\AA}rhus
+
+then the purified result would be more sensible:
+
+ barHerr Muller went from Prerov to Aarhus
+
+Note the use of a "nonsense" special character C<{\foo bar}>: this trick
+is often used to put certain text in a string solely for generating sort
+keys; the text is then ignored when the document is processed by TeX (as
+long as C<\foo> is defined as a no-op TeX macro). This assumes, of
+course, that the output is eventually processed by TeX; if not, then
+this trick will backfire on you.
+
+Also, C<bt_purify_string()> is adequate for generating sort keys when
+you want to sort according to English-language conventions. To follow
+the conventions of other languages, though, a more sophisticated
+approach will be needed; hopefully, future versions of B<btparse> will
+address this deficiency.
+
+=item bt_change_case()
+
+ void bt_change_case (char transform, char * string, btshort options);
+
+Converts a string to lowercase, uppercase, or "non-book title
+capitalization", with special attention paid to BibTeX special
+characters and other brace-groups. The form of conversion is selected
+by the single character C<transform>: C<'u'> to convert to uppercase,
+C<'l'> for lowercase, and C<'t'> for "title capitalization". C<string>
+is modified in-place, and C<options> is currently unused; set it to zero
+for future compatibility.
+
+Lowercase and uppercase conversion are obvious, with the proviso that
+text in braces is treated differently (explained below). Title
+capitalization simply means that everything is converted to lowercase,
+except the first letter of the first word, and words immediately
+following a colon or sentence-ending punctuation. For instance,
+
+ Flying Squirrels: Their Peculiar Habits. Part One
+
+would be converted to
+
+ Flying squirrels: Their peculiar habits. Part one
+
+Text within braces is handled as follows. First, in a "special
+character" (see above for definition), control sequences that constitute
+one of LaTeX's non-English letters are converted appropriately---e.g.,
+when converting to lowercase, C<\AE> becomes C<\ae>. Any other control
+sequence in a special character (including accents) is preserved, and
+all text in a special character, regardless of depth and punctuation, is
+converted to lowercase or uppercase. (For "title capitalization," all
+text in a special character is converted to lowercase.)
+
+Brace groups that are not special characters are left completely
+untouched: neither text nor control sequences within non-special
+character braces are touched.
+
+For example, the string
+
+ A Guide to \LaTeXe: Document Preparation ...
+
+would, when C<transform> is C<'t'> (title capitalization), be converted
+to
+
+ A guide to \latexe: Document preparation ...
+
+which is probably not the desired result. A better attempt is
+
+ A Guide to {\LaTeXe}: Document Preparation ...
+
+which becomes
+
+ A guide to {\LaTeXe}: Document preparation ...
+
+However, if you go back and re-read the description of
+C<bt_purify_string()>, you'll discover that C<{\LaTeXe}> here is a
+special character, but not a non-English letter: thus, the control
+sequence is stripped. Thus, a sort key generated from this title would
+be
+
+ A Guide to Document Preparation
+
+...oops! The right solution (and this applies to any title with a TeX
+command that becomes actual text) is to bury the control sequence at
+brace-depth two:
+
+ A Guide to {{\LaTeXe}}: Document Preparation ...
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_post_processing.pod b/btparse/doc/bt_post_processing.pod
new file mode 100644
index 0000000..56323e1
--- /dev/null
+++ b/btparse/doc/bt_post_processing.pod
@@ -0,0 +1,249 @@
+=head1 NAME
+
+bt_post_processing - post-processing of BibTeX strings, values, and entries
+
+=head1 SYNOPSIS
+
+ void bt_postprocess_string (char * s,
+ btshort options)
+
+ char * bt_postprocess_value (AST * value,
+ btshort options,
+ boolean replace);
+
+ char * bt_postprocess_field (AST * field,
+ btshort options,
+ boolean replace);
+
+ void bt_postprocess_entry (AST * entry,
+ btshort options);
+
+=head1 DESCRIPTION
+
+When B<btparse> parses a BibTeX entry, it initially stores the results
+in an abstract syntax tree (AST), in a form exactly mirroring the parsed
+data. For example, the entry
+
+ @Article{Jones:1997a,
+ AuThOr = "Bob Jones" # and # "Jim Smith ",
+ TITLE = "Feeding Habits of
+ the Common Cockroach",
+ JoUrNaL = j_ent,
+ YEAR = 1997
+ }
+
+would parse to an AST that could be represented as follows:
+
+ (entry,"Article")
+ (key,"Jones:1997a")
+ (field,"AuThOr")
+ (string,"Bob Jones")
+ (macro,"and")
+ (string,"Jim Smith ")
+ (field,"TITLE")
+ (string,"Feeding Habits of the Common Cockroach")
+ (field,"JoUrNaL")
+ (macro,"j_ent")
+ (field,"YEAR")
+ (number,"1997")
+
+The advantage of this form is that all the important information in the
+entry is readily available by traversing the tree using the functions
+described in L<bt_traversal>. The obvious problem is that the data is
+a little too raw to be immediately useful: entry types and field names
+are inconsistently capitalized, strings are full of unwanted whitespace,
+field values are not reduced to single strings, and so forth.
+
+All of these problems are addressed by B<btparse>'s post-processing
+functions, described here. Normally, you won't have to call these
+functions---the library does the Right Thing for you after parsing each
+entry, and you can customize what exactly the Right Thing is for your
+application. (For instance, you can tell it to expand macros, but not
+to concatenate substrings together.) However, it's conceivable that you
+might wish to move the post-processing into your own code and out of the
+library's control. More likely, you could have strings that come from
+something other than BibTeX files that you would like to have treated as
+BibTeX strings; for that situation, the post-processing functions are
+essential. Finally, you might just be curious about what exactly
+happens to your data after it's parsed. If so, you've come to the right
+place for excruciatingly detailed explanations.
+
+=head1 FUNCTIONS
+
+B<btparse> offers four points of entry to its post-processing code. Of
+these, probably only the first and last---for processing individual
+strings and whole entries---will be commonly used.
+
+=head2 Post-processing entry points
+
+To understand why four entry points are offered, an explanation of the
+sample AST shown above will help. First of all, the whole entry is
+represented by the C<(entry,"Article")> node; this node has the entry
+key and all its field/value pairs as children. Entry nodes are returned
+by C<bt_parse_entry()> and C<bt_parse_entry_s()> (see L<bt_input>) as
+well as C<bt_next_entry()> (which traverses a list of entries returned
+from C<bt_parse_file()>---see L<bt_traversal>). Whole entries may be
+post-processed with C<bt_postprocess_entry()>.
+
+You may also need to post-process a single field, or just the value
+associated with it. (The difference is that processing the field can
+change the field name---e.g. to lowercase---in addition to the field
+value.) The C<(field,"AuThOr")> node above is an example of a field
+sub-AST, and C<(string,"Bob Jones")> is the first node in the list of
+simple values representing that field's value. (Recall that a field
+value is, in general, a list of simple values.) Field nodes are
+returned by C<bt_next_field()>, value nodes by C<bt_next_value()>. The
+former may be passed to C<bt_postprocess_field()> for post-processing,
+the latter to C<bt_postprocess_value()>.
+
+Finally, individual strings may wander into your program from many
+places other than a B<btparse> AST. For that reason,
+C<bt_postprocess_string()> is available for post-processing arbitrary
+strings.
+
+=head2 Post-processing options
+
+All of the post-processing routines have an C<options> parameter, which
+you can use to fine-tune the post-processing. (This is just like the
+per-metatype string-processing options that you can set before parsing
+entries; see C<bt_set_stringopts()> in L<bt_input>.) Like elsewhere in
+the library, C<options> is a bitmap constructed by or'ing together
+various predefined constants. These constants and their effects are
+documented in L<btparse/"String processing option macros">.
+
+=over 4
+
+=item bt_postprocess_string ()
+
+ void bt_postprocess_string (char * s,
+ btshort options)
+
+Post-processes an individual string, C<s>, which is modified in place.
+The only post-processing option that makes sense on individual strings
+is whether to collapse whitespace according to the BibTeX rules; thus,
+if C<options & BTO_COLLAPSE> is false, this function has no effect.
+(Although it makes a complete pass over the string anyways. This is for
+future expansion.)
+
+The exact rules for collapsing whitespace are simple: non-space
+whitespace characters (tabs and newlines mainly) are converted to space,
+any runs of more than one space within the string are collapsed to a
+single space, and any leading or trailing spaces are deleted. (Ensuring
+that all whitespace is spaces is actually done by B<btparse>'s lexical
+scanner, so strings in B<btparse> ASTs will never have whitespace apart
+from space. Likewise, any strings passed to C<bt_postprocess_string()>
+should not contain non-space whitespace characters.)
+
+=item bt_postprocess_value ()
+
+ char * bt_postprocess_value (AST * value,
+ btshort options,
+ boolean replace);
+
+Post-processes a single field value, which is the head of a list of
+simple values as returned by C<bt_next_value()>. All of the relevant
+string-processing options come into play here: conversion of numbers to
+strings (C<BTO_CONVERT>), macro expansion (C<BTO_EXPAND>), collapsing of
+whitespace (C<BTO_COLLAPSE>), and string pasting (C<BTO_PASTE>). Since
+pasting substrings together without first expanding macros and
+converting numbers would be nonsensical, attempting to do so is a fatal
+error.
+
+If C<replace> is true, then the list headed by C<value> will be replaced
+by a list representing the processed value. That is, if string pasting
+is turned on (C<options & BTO_PASTE> is true), then this list will be
+collapsed to a single node containing the single string that results
+from pasting together all the substrings. If string pasting is not on,
+then each node in the list will be left intact, but will have its
+text replaced by processed text.
+
+If C<replace> is false, then a new string will be built on the fly and
+returned by the function. Note that if pasting is not on in this case,
+you will only get the last string in the list. (It doesn't really make
+a lot of sense to post-process a value without pasting unless you're
+replacing it with the new value, though.)
+
+Returns the string that resulted from processing the whole value, which
+only makes sense if pasting was on or there was only one value in the
+list. If a multiple-value list was processed without pasting, the last
+string in the list is returned (after processing).
+
+Consider what might be done to the value of the C<author> field in the
+above example, which is the concatenation of a string, a macro, and
+another string. Assume that the macro C<and> expands to C<" and ">, and
+that the variable C<value> points to the sub-AST for this value.
+The original sub-AST corresponding to this value is
+
+ (string,"Bob Jones")
+ (macro,"and")
+ (string,"Jim Smith ")
+
+To fully process this value in-place, you would call
+
+ bt_postprocess_value (value, BTO_FULL, TRUE);
+
+(C<BTO_FULL> is just the combination of all possible string-processing
+options: C<BTO_CONVERT|BTO_EXPAND|BTO_PASTE|BTO_COLLAPSE>.) This would
+convert the value to a single-element list,
+
+ (string,"Bob Jones and Jim Smith")
+
+and return the fully-processed string C<"Bob Jones and Jim Smith">.
+Note that the C<and> macro has been expanded, interpolated between the
+two literal strings, everything pasted together, and finally whitespace
+collapsed. (Collapsing whitespace before concatenating the strings
+would be a bad idea.)
+
+Let's say you'd rather preserve the list nature of the value, while
+expanding macros and converting any numbers to strings. (This
+conversion is trivial: it just changes the type of the node from
+C<BTAST_NUMBER> to C<BTAST_STRING>. "Number" values are always stored
+as a string of digits, just as they appear in the file.) This would be
+done with the call
+
+ bt_postprocess_value
+ (value, BTO_CONVERT|BTO_EXPAND|BTO_COLLAPSE,TRUE);
+
+which would change the list to
+
+ (string,"Bob Jones")
+ (string,"and")
+ (string,"Jim Smith")
+
+Note that whitespace is collapsed here I<before> any concatenation can
+be done; this is probably a bad idea. But you can do it if you wish.
+(If you get any ideas about cooking up your own value post-processing
+scheme by doing it in little steps like this, take a look at the source
+to C<bt_postprocess_value()>; it should dissuade you from such a
+venture.)
+
+=item bt_postprocess_field ()
+
+ char * bt_postprocess_field (AST * field,
+ btshort options,
+ boolean replace);
+
+This is little more than a front-end to C<bt_postprocess_value()>; the
+only difference is that you pass it a "field" AST node (e.g. the
+C<(field,"AuThOr")> in the above example), and that it transforms the
+field name in addition to its value. In particular, the field name is
+forced to lowercase; this behaviour is (currently) not optional.
+
+Returns the string returned by C<bt_postprocess_value()>.
+
+=item bt_postprocess_entry ()
+
+ void bt_postprocess_entry (AST * entry,
+ btshort options);
+
+Post-processes all values in an entry. If C<entry> points to the AST
+for a "regular" or "macro definition" entry, then the values are just
+what you'd expect: everything on the right-hand side of a field or macro
+"assignment." You can also post-process comment and preamble entries,
+though. Comment entries are essentially one big string, so only
+whitespace collapsing makes sense on them. Preambles may have multiple
+strings pasted together, so all the string-processing options apply to
+them. (And there's nothing to prevent you from using macros in a
+preamble.)
+
+=back
diff --git a/btparse/doc/bt_postprocess.pod b/btparse/doc/bt_postprocess.pod
new file mode 100644
index 0000000..82ac07e
--- /dev/null
+++ b/btparse/doc/bt_postprocess.pod
@@ -0,0 +1,265 @@
+=head1 NAME
+
+bt_postprocess - post-processing of BibTeX strings, values, and entries
+
+=head1 SYNOPSIS
+
+ void bt_postprocess_string (char * s,
+ btshort options);
+
+ char * bt_postprocess_value (AST * value,
+ btshort options,
+ boolean replace);
+
+ char * bt_postprocess_field (AST * field,
+ btshort options,
+ boolean replace);
+
+ void bt_postprocess_entry (AST * entry,
+ btshort options);
+
+=head1 DESCRIPTION
+
+When B<btparse> parses a BibTeX entry, it initially stores the results
+in an abstract syntax tree (AST), in a form exactly mirroring the parsed
+data. For example, the entry
+
+ @Article{Jones:1997a,
+ AuThOr = "Bob Jones" # and # "Jim Smith ",
+ TITLE = "Feeding Habits of
+ the Common Cockroach",
+ JoUrNaL = j_ent,
+ YEAR = 1997
+ }
+
+would parse to an AST that could be represented as follows:
+
+ (entry,"Article")
+ (key,"Jones:1997a")
+ (field,"AuThOr")
+ (string,"Bob Jones")
+ (macro,"and")
+ (string,"Jim Smith ")
+ (field,"TITLE")
+ (string,"Feeding Habits of the Common Cockroach")
+ (field,"JoUrNaL")
+ (macro,"j_ent")
+ (field,"YEAR")
+ (number,"1997")
+
+The advantage of this form is that all the important information in the
+entry is readily available by traversing the tree using the functions
+described in L<bt_traversal>. The obvious problem is that the data is
+a little too raw to be immediately useful: entry types and field names
+are inconsistently capitalized, strings are full of unwanted whitespace,
+field values are not reduced to single strings, and so forth.
+
+All of these problems are addressed by B<btparse>'s post-processing
+functions, described here. Normally, you won't have to call these
+functions---the library does the Right Thing for you after parsing each
+entry, and you can customize what exactly the Right Thing is for your
+application. (For instance, you can tell it to expand macros, but not
+to concatenate substrings together.) However, it's conceivable that you
+might wish to move the post-processing into your own code and out of the
+library's control. More likely, you could have strings that come from
+something other than BibTeX files that you would like to have treated as
+BibTeX strings; for that situation, the post-processing functions are
+essential. Finally, you might just be curious about what exactly
+happens to your data after it's parsed. If so, you've come to the right
+place for excruciatingly detailed explanations.
+
+=head1 FUNCTIONS
+
+B<btparse> offers four points of entry to its post-processing code. Of
+these, probably only the first and last---for processing individual
+strings and whole entries---will be commonly used.
+
+=head2 Post-processing entry points
+
+To understand why four entry points are offered, an explanation of the
+sample AST shown above will help. First of all, the whole entry is
+represented by the C<(entry,"Article")> node; this node has the entry
+key and all its field/value pairs as children. Entry nodes are returned
+by C<bt_parse_entry()> and C<bt_parse_entry_s()> (see L<bt_input>) as
+well as C<bt_next_entry()> (which traverses a list of entries returned
+from C<bt_parse_file()>---see L<bt_traversal>). Whole entries may be
+post-processed with C<bt_postprocess_entry()>.
+
+You may also need to post-process a single field, or just the value
+associated with it. (The difference is that processing the field can
+change the field name---e.g. to lowercase---in addition to the field
+value.) The C<(field,"AuThOr")> node above is an example of a field
+sub-AST, and C<(string,"Bob Jones")> is the first node in the list of
+simple values representing that field's value. (Recall that a field
+value is, in general, a list of simple values.) Field nodes are
+returned by C<bt_next_field()>, value nodes by C<bt_next_value()>. The
+former may be passed to C<bt_postprocess_field()> for post-processing,
+the latter to C<bt_postprocess_value()>.
+
+Finally, individual strings may wander into your program from many
+places other than a B<btparse> AST. For that reason,
+C<bt_postprocess_string()> is available for post-processing arbitrary
+strings.
+
+=head2 Post-processing options
+
+All of the post-processing routines have an C<options> parameter, which
+you can use to fine-tune the post-processing. (This is just like the
+per-metatype string-processing options that you can set before parsing
+entries; see C<bt_set_stringopts()> in L<bt_input>.) Like elsewhere in
+the library, C<options> is a bitmap constructed by or'ing together
+various predefined constants. These constants and their effects are
+documented in L<btparse/"String processing option macros">.
+
+=over 4
+
+=item bt_postprocess_string ()
+
+ void bt_postprocess_string (char * s,
+ btshort options)
+
+Post-processes an individual string, C<s>, which is modified in place.
+The only post-processing option that makes sense on individual strings
+is whether to collapse whitespace according to the BibTeX rules; thus,
+if C<options & BTO_COLLAPSE> is false, this function has no effect.
+(Although it makes a complete pass over the string anyways. This is for
+future expansion.)
+
+The exact rules for collapsing whitespace are simple: non-space
+whitespace characters (tabs and newlines mainly) are converted to space,
+any strings of more than one space within are collapsed to a single
+space, and any leading or trailing spaces are deleted. (Ensuring that
+all whitespace is spaces is actually done by B<btparse>'s lexical
+scanner, so strings in B<btparse> ASTs will never have whitespace apart
+from space. Likewise, any strings passed to bt_postprocess_string()
+should not contain non-space whitespace characters.)
+
+=item bt_postprocess_value ()
+
+ char * bt_postprocess_value (AST * value,
+ btshort options,
+ boolean replace);
+
+Post-processes a single field value, which is the head of a list of
+simple values as returned by C<bt_next_value()>. All of the relevant
+string-processing options come into play here: conversion of numbers to
+strings (C<BTO_CONVERT>), macro expansion (C<BTO_EXPAND>), collapsing of
+whitespace (C<BTO_COLLAPSE>), and string pasting (C<BTO_PASTE>). Since
+pasting substrings together without first expanding macros and
+converting numbers would be nonsensical, attempting to do so is a fatal
+error.
+
+If C<replace> is true, then the list headed by C<value> will be replaced
+by a list representing the processed value. That is, if string pasting
+is turned on (C<options & BTO_PASTE> is true), then this list will be
+collapsed to a single node containing the single string that results
+from pasting together all the substrings. If string pasting is not on,
+then each node in the list will be left intact, but will have its
+text replaced by processed text.
+
+If C<replace> is false, then a new string will be built on the fly and
+returned by the function. Note that if pasting is not on in this case,
+you will only get the last string in the list. (It doesn't really make
+a lot of sense to post-process a value without pasting unless you're
+replacing it with the new value, though.)
+
+Returns the string that resulted from processing the whole value, which
+only makes sense if pasting was on or there was only one value in the
+list. If a multiple-value list was processed without pasting, the last
+string in the list is returned (after processing).
+
+Consider what might be done to the value of the C<author> field in the
+above example, which is the concatenation of a string, a macro, and
+another string. Assume that the macro C<and> expands to C<" and ">, and
+that the variable C<value> points to the sub-AST for this value.
+The original sub-AST corresponding to this value is
+
+ (string,"Bob Jones")
+ (macro,"and")
+ (string,"Jim Smith ")
+
+To fully process this value in-place, you would call
+
+ bt_postprocess_value (value, BTO_FULL, TRUE);
+
+This would convert the value to a single-element list,
+
+ (string,"Bob Jones and Jim Smith")
+
+and return the fully-processed string C<"Bob Jones and Jim Smith">.
+Note that the C<and> macro has been expanded, interpolated between the
+two literal strings, everything pasted together, and finally whitespace
+collapsed. (Collapsing whitespace before concatenating the strings
+would be a bad idea.)
+
+(Incidentally, C<BTO_FULL> is just a macro for the combination of all
+possible string-processing options, currently:
+
+ BTO_CONVERT | BTO_EXPAND | BTO_PASTE | BTO_COLLAPSE
+
+There are two other similar shortcut macros: C<BTO_MACRO> to express the
+special string-processing done on macro values, which is the same as
+C<BTO_FULL> except for the absence of C<BTO_COLLAPSE>; and
+C<BTO_MINIMAL>, which means no string-processing is to be done.)
+
+Let's say you'd rather preserve the list nature of the value, while
+expanding macros and converting any numbers to strings. (This
+conversion is trivial: it just changes the type of the node from
+C<BTAST_NUMBER> to C<BTAST_STRING>. "Number" values are always stored
+as a string of digits, just as they appear in the file.) This would be
+done with the call
+
+ bt_postprocess_value
+ (value, BTO_CONVERT|BTO_EXPAND|BTO_COLLAPSE,TRUE);
+
+which would change the list to
+
+ (string,"Bob Jones")
+ (string,"and")
+ (string,"Jim Smith")
+
+Note that whitespace is collapsed here I<before> any concatenation can
+be done; this is probably a bad idea. But you can do it if you wish.
+(If you get any ideas about cooking up your own value post-processing
+scheme by doing it in little steps like this, take a look at the source
+to C<bt_postprocess_value()>; it should dissuade you from such a
+venture.)
+
+=item bt_postprocess_field ()
+
+ char * bt_postprocess_field (AST * field,
+ btshort options,
+ boolean replace);
+
+This is little more than a front-end to C<bt_postprocess_value()>; the
+only difference is that you pass it a "field" AST node (eg. the
+C<(field,"AuThOr")> in the above example), and that it transforms the
+field name in addition to its value. In particular, the field name is
+forced to lowercase; this behaviour is (currently) not optional.
+
+Returns the string returned by C<bt_postprocess_value()>.
+
+=item bt_postprocess_entry ()
+
+ void bt_postprocess_entry (AST * entry,
+ btshort options);
+
+Post-processes all values in an entry. If C<entry> points to the AST
+for a "regular" or "macro definition" entry, then the values are just
+what you'd expect: everything on the right-hand side of a field or macro
+"assignment." You can also post-process comment and preamble entries,
+though. Comment entries are essentially one big string, so only
+whitespace collapsing makes sense on them. Preambles may have multiple
+strings pasted together, so all the string-processing options apply to
+them. (And there's nothing to prevent you from using macros in a
+preamble.)
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>, L<bt_input>, L<bt_traversal>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_split_names.pod b/btparse/doc/bt_split_names.pod
new file mode 100644
index 0000000..9fd2c58
--- /dev/null
+++ b/btparse/doc/bt_split_names.pod
@@ -0,0 +1,280 @@
+=head1 NAME
+
+bt_split_names - splitting up BibTeX names and lists of names
+
+=head1 SYNOPSIS
+
+ bt_stringlist * bt_split_list (char * string,
+ char * delim,
+ char * filename,
+ int line,
+ char * description);
+ void bt_free_list (bt_stringlist *list);
+ bt_name * bt_split_name (char * name,
+ char * filename,
+ int line,
+ int name_num);
+ void bt_free_name (bt_name * name);
+
+=head1 DESCRIPTION
+
+When BibTeX files are used for their original purpose---bibliographic
+entries describing scholarly publications---processing lists of names
+(authors and editors mostly) becomes important. Although such
+name-processing is outside the general-purpose database domain of most
+of the B<btparse> library, these splitting functions are provided as a
+concession to reality: most BibTeX data files use the BibTeX conventions
+for author names, and a library to process that data ought to be capable
+of processing the names.
+
+Name-processing comes in two stages: first, split up a list of names
+into individual strings; second, split up each name into "parts" (first,
+von, last, and jr). The first is actually quite general: you could pick
+a delimiter (such as C<'and'>, used for lists of names) and use it to
+divide any string into substrings. C<bt_split_list()> could then be
+called to break up the original string and extract the substrings.
+C<bt_split_name()>, however, is quite specific to four-part author names
+written using BibTeX conventions. (These conventions are described
+informally in any BibTeX documentation; the description you will find
+here is more formal and algorithmic---and thus harder to understand.)
+
+See L<bt_format_names> for information on turning split-up names back
+into strings in a variety of ways.
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item bt_split_list()
+
+ bt_stringlist * bt_split_list (char * string,
+ char * delim,
+ char * filename,
+ int line,
+ char * description)
+
+Splits C<string> into substrings delimited by C<delim> (a fixed string).
+The splitting is done according to the rules used by BibTeX for
+splitting up a list of names, in particular:
+
+=over 4
+
+=item *
+
+delimiters at beginning or end of string are ignored
+
+=item *
+
+delimiters must be surrounded by whitespace
+
+=item *
+
+matching of delimiters is case insensitive
+
+=item *
+
+delimiters at non-zero brace depth are ignored
+
+=back
+
+For instance, if the delimiter is C<"and">, then the string
+
+ Candy and Apples AnD {Green Eggs and Ham}
+
+splits into three substrings: C<"Candy">, C<"Apples">, and
+C<"{Green Eggs and Ham}">.
+
+If there are extra delimiters at the extremities of the string---say,
+an C<"and"> at the beginning of the string---then they are included in
+the first/last string; no warning is currently printed, but this may
+change. Successive delimiters (C<"and and">) result in a warning and a
+NULL string being added to the list of substrings. For instance, the
+string
+
+ and Joe Q. Blow and and Smith, Jr., John
+
+would split into three substrings: C<"and Joe Q. Blow">, C<NULL>, and
+C<"Smith, Jr., John">.
+
+(If these rules seem somewhat odd, don't blame me: I just implemented
+BibTeX's observed behaviour and added warning messages for one of the
+more obvious and easily-detected mistakes.)
+
+The substrings are returned as a C<bt_stringlist> structure:
+
+ typedef struct
+ {
+ char * string;
+ int num_items;
+ char ** items;
+ } bt_stringlist;
+
+There is currently no elegant interface to this structure: you just have
+to poke around in it yourself. The fields are:
+
+=over 4
+
+=item C<string>
+
+a copy of the C<string> parameter passed to C<bt_split_list()>, but with
+NUL characters replacing the space after each substring. (This is safe
+because delimiters must be surrounded by whitespace, which means that
+each substring is followed by whitespace which is not part of the
+substring.) You probably shouldn't fiddle with C<string>; it's just
+there so that C<bt_free_list()> has something to C<free()>.
+
+=item C<num_items>
+
+the number of substrings found in the string passed to
+C<bt_split_list()>.
+
+=item C<items>
+
+an array of C<num_items> pointers into C<string>. For instance,
+C<items[1]> points to the second substring. Since C<string> has been
+mangled with NUL characters, it is safe to treat C<items[i]> as a
+regular C string.
+
+=back
+
+C<filename>, C<line>, and C<description> are all used for generating
+warning messages. C<filename> and C<line> simply describe where the
+string came from, and C<description> is a brief (one word) description
+of the substrings. For instance, if you are splitting a list of names,
+supply C<"name"> for C<description>---that way, warnings will refer to
+"name X" rather than "substring x".
+
+=item bt_free_list()
+
+ void bt_free_list (bt_stringlist *list)
+
+Frees a C<bt_stringlist> structure as returned by C<bt_split_list()>.
+That is, it frees the copy of the string you passed to
+C<bt_split_list()>, and then frees the structure itself.
+
+=item bt_split_name()
+
+ bt_name * bt_split_name (char * name,
+ char * filename,
+ int line,
+ int name_num)
+
+Splits a single BibTeX-style author name into four parts: first, von,
+last, and jr. This can handle almost all names in the style of the
+major Western European languages, but not quite. (Alas!)
+
+A name is split by first dividing into tokens; tokens are separated by
+whitespace or commas at brace-level zero. Thus the name
+
+ van der Graaf, Horace Q.
+
+has five tokens, whereas the name
+
+ {Foo, Bar, and Sons}
+
+consists of a single token.
+
+How tokens are divided into parts depends on the form of the name. If
+the name has no commas at brace-level zero (as in the second example),
+then it is assumed to be in either "first last" or "first von last"
+form. If there are no tokens that start with a lower-case letter, then
+"first last" form is assumed: the final token is the last name, and all
+other tokens form the first name. Otherwise, the earliest contiguous
+sequence of tokens with initial lower-case letters is taken as the `von'
+part; if this sequence includes the final token, then a warning is
+printed and the final token is forced to be the `last' part.
+
+If a name has a single comma, then it is assumed to be in "von last,
+first" form. A leading sequence of tokens with initial lower-case
+letters, if any, forms the `von' part; tokens between the `von' and the
+comma form the `last' part; tokens following the comma form the `first'
+part. Again, if there are no tokens following a leading sequence of
+lowercase tokens, a warning is printed and the token immediately
+preceding the comma is taken to be the `last' part.
+
+If a name has more than two commas, a warning is printed and the name is
+treated as though only the first two commas were present.
+
+Finally, if a name has two commas, it is assumed to be in "von last, jr,
+first" form. (This is the only way to represent a name with a `jr'
+part.) The parsing of the name is the same as for a one-comma name,
+except that tokens between the two commas are taken to be the `jr' part.
+
+The one case not properly handled by BibTeX name conventions is a name
+with a 'jr' part not separated from the last name by a comma; for
+example:
+
+ Henry Ford Jr.
+ George Herbert Walker Bush III
+
+Both of these would be incorrectly interpreted by both BibTeX and
+bt_split_name(): the C<"Jr."> or C<"III"> token would be taken as the
+last name, and the other tokens as a two- or four-part first name.
+The workaround is to shoehorn the 'jr' into the last name:
+
+ Henry {Ford Jr.}
+ George Herbert Walker {Bush III}
+
+but this will make it impossible to extract the last name on its own,
+e.g. to generate "author-year" style citations. This design flaw may be
+fixed in a future version of B<btparse>.
+
+The split-up name is returned as a C<bt_name> structure:
+
+ typedef struct
+ {
+ bt_stringlist * tokens;
+ char ** parts[BT_MAX_NAMEPARTS];
+ int part_len[BT_MAX_NAMEPARTS];
+ } bt_name;
+
+Again, there's no nice interface to this structure; you'll just have to
+access the fields individually. They are:
+
+=over 4
+
+=item C<tokens>
+
+the name, broken down into a flat list of tokens. See above for a
+description of the C<bt_stringlist> structure.
+
+=item C<parts>
+
+an array of arrays of pointers into the token list. The major dimension
+of this beast is the "name part;" you should index this dimension using
+the C<bt_namepart> enum. For instance, C<parts[BTN_LAST]> is an array
+of pointers to the tokens comprising the last name;
+C<parts[BTN_LAST][1]> is a C<char *>: the second token of the 'last'
+part; and C<parts[BTN_LAST][1][0]> is the first character of the second
+token of the 'last' part.
+
+=item C<part_len>
+
+the length, in tokens, of each part. For instance, you might loop over
+all tokens in the 'first' part as follows (assuming C<name> is a
+C<bt_name *> returned by C<bt_split_name()>):
+
+ for (i = 0; i < name->part_len[BTN_FIRST]; i++)
+ {
+ printf ("token %d of first name: %s\n",
+ i, name->parts[BTN_FIRST][i]);
+ }
+
+=back
+
+=item bt_free_name()
+
+ void bt_free_name (bt_name * name)
+
+Frees the C<bt_name> structure created by C<bt_split_name()> (including
+the C<bt_stringlist> structure inside the C<bt_name>).
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>, L<bt_format_names>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/bt_traversal.pod b/btparse/doc/bt_traversal.pod
new file mode 100644
index 0000000..243a5f8
--- /dev/null
+++ b/btparse/doc/bt_traversal.pod
@@ -0,0 +1,181 @@
+=head1 NAME
+
+bt_traversal - AST traversal/query functions in B<btparse> library
+
+=head1 SYNOPSIS
+
+ AST * bt_next_entry (AST * entry_list,
+ AST * prev_entry)
+ AST * bt_next_field (AST * entry, AST * prev, char ** name)
+ AST * bt_next_value (AST * head,
+ AST * prev,
+ bt_nodetype_t * nodetype,
+ char ** text)
+
+ bt_metatype_t bt_entry_metatype (AST * entry)
+ char * bt_entry_type (AST * entry)
+ char * bt_entry_key (AST * entry)
+ char * bt_get_text (AST * node)
+
+=head1 DESCRIPTION
+
+The functions described here are all used to traverse and query the
+abstract-syntax trees (ASTs) returned by the input functions described
+in L<bt_input>. The three "bt_next" functions (C<bt_next_entry()>,
+C<bt_next_field()>, and C<bt_next_value()>) are used respectively to
+traverse a list of entries, the list of fields within a particular
+entry, and the list of simple values associated with a particular field.
+The other functions are just used to query various nodes in the tree for
+the useful information contained in them.
+
+=head2 Traversal functions
+
+=over 4
+
+=item bt_next_entry()
+
+ AST * bt_next_entry (AST * entry_list,
+ AST * prev_entry)
+
+Used to traverse the linked list of entries returned by
+C<bt_parse_file()> (see L<bt_input>). On the first call, you should
+supply C<NULL> for C<prev_entry>, and a pointer to the head of the list
+will be returned. On subsequent calls, pass the previous return value
+as C<prev_entry>; the function returns the next entry in the list, or
+C<NULL> if there are no more entries. Also returns C<NULL> if either
+C<entry_list> or C<prev_entry> is improper.
+
+For example (ignoring error handling and variable declarations):
+
+ entries = bt_parse_file (filename, options, &status);
+ entry = NULL;
+ while (entry = bt_next_entry (entries, entry))
+ {
+ /* process entry */
+ }
+
+=item bt_next_field()
+
+ AST * bt_next_field (AST * entry, AST * prev, char ** name)
+
+Used to traverse the list of fields in a regular or macro definition
+entry. (You should call C<bt_entry_metatype()> to determine if you have
+the right kind of entry before calling C<bt_next_field()>.) C<entry>
+should be a pointer to the AST for a single entry, as returned by
+C<bt_parse_entry()>, C<bt_parse_entry_s()>, or C<bt_next_entry()>. On
+the first call, supply C<NULL> for C<prev>; C<bt_next_field()> will
+return a pointer to the first field in C<entry>, or C<NULL> if C<entry>
+has no fields (for instance, if it's a comment or preamble entry). On
+subsequent calls, pass the previous return value as C<prev>;
+C<bt_next_field()> will keep returning pointers to field sub-ASTs as
+long as it makes sense. These pointers can then be passed to
+C<bt_next_value()> or C<bt_get_text()> to get the field's value.
+
+For example, the loop body in the previous example could be:
+
+ field = NULL;
+ while (field = bt_next_field (entry, field, &field_name))
+ {
+ /* process field */
+ }
+
+=item bt_next_value()
+
+ AST * bt_next_value (AST * head,
+ AST * prev,
+ bt_nodetype_t * nodetype,
+ char ** text)
+
+Traverses the list of simple values that make up the value of a single
+field. (Recall that a simple value is either a quoted string, a macro
+invocation, or a number. A compound value is a list of these separated
+by C<'#'> in the original input.) Depending on the string
+post-processing options used when the data was parsed, the "list of
+simple values" nature of the original data may be preserved in the AST
+that you're traversing, in which case you'll need a C<bt_next_value()>
+loop.
+
+C<bt_next_value()> works much like C<bt_next_entry()> and
+C<bt_next_field()>: on the first call, you supply C<NULL> for C<prev>,
+and on subsequent calls you supply the previous return value. Returns
+C<NULL> when there are no more simple values to return. Also sets
+C<*nodetype> and C<*text> to the corresponding information from the
+simple value node. C<*nodetype> will be one of C<BTAST_STRING>,
+C<BTAST_MACRO>, or C<BTAST_NUMBER>; C<*text> will point to the same
+string as the AST node does (it is not copied for you), so don't mess
+with it.
+
+For example, the loop body in the C<bt_next_field()> example could be
+replaced with:
+
+ value = NULL;
+ while (value = bt_next_value (field, value, &nodetype, &text))
+ {
+ switch (nodetype)
+ {
+ case BTAST_STRING: /* process the string */
+ case BTAST_MACRO: /* process the macro */
+ case BTAST_NUMBER: /* process the number */
+ }
+ }
+
+See also L</bt_get_text>.
+
+=back
+
+=head2 Query functions
+
+=over 4
+
+=item bt_entry_metatype()
+
+ bt_metatype_t bt_entry_metatype (AST * entry)
+
+Returns the metatype of an entry. (Recall that the I<metatype> is an
+enumerated type whose values are derived from the specific type of an
+entry; for instance, an C<@comment> entry has type C<"comment"> and
+metatype C<BTE_COMMENT>. The type-metatype relationship is similarly
+obvious for C<BTE_PREAMBLE>; C<BTE_MACRODEF> corresponds to C<@string>
+entries; and C<BTE_REGULAR> corresponds to any other type.)
+
+Returns C<BTE_UNKNOWN> if C<entry> is invalid (i.e., C<NULL> or not a
+pointer to an entry AST).
+
+=item bt_entry_type()
+
+ char * bt_entry_type (AST * entry)
+
+Returns the type of an entry. Recall that the type is the name that
+appears after the C<'@'> character in the original input. Returns
+C<NULL> if C<entry> is invalid (i.e., C<NULL> or not a pointer to an
+entry AST).
+
+=item bt_entry_key()
+
+ char * bt_entry_key (AST * entry)
+
+Returns the citation key of a regular entry. (The citation key is the
+name that appears after the entry-open delimiter in a regular entry.)
+Returns C<NULL> if C<entry> is invalid (i.e., C<NULL> or not a pointer
+to the AST for a regular entry).
+
+=item bt_get_text()
+
+ char * bt_get_text (AST * node)
+
+Performs all string post-processing (macro expansion, concatenation of
+simple values, and whitespace collapsing) of a compound value and
+returns the string that results. Can be called either on a field for a
+regular or macro definition entry (as returned by C<bt_next_field()>),
+or on a comment or preamble entry. Returns C<NULL> if called on an
+invalid AST node.
+
+=back
+
+=head1 SEE ALSO
+
+L<btparse>, L<bt_input>, L<bt_postprocess>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
diff --git a/btparse/doc/btparse.pod b/btparse/doc/btparse.pod
new file mode 100644
index 0000000..0bdbfab
--- /dev/null
+++ b/btparse/doc/btparse.pod
@@ -0,0 +1,542 @@
+=head1 NAME
+
+btparse - C library for parsing and processing BibTeX data files
+
+=head1 SYNOPSIS
+
+ #include <btparse.h>
+
+ /* Basic library initialization / cleanup */
+ void bt_initialize (void);
+ void bt_free_ast (AST *ast);
+ void bt_cleanup (void);
+
+ /* Input / interface to parser */
+ void bt_set_stringopts (bt_metatype_t metatype, btshort options);
+ AST * bt_parse_entry_s (char * entry_text,
+ char * filename,
+ int line,
+ btshort options,
+ boolean * status);
+ AST * bt_parse_entry (FILE * infile,
+ char * filename,
+ btshort options,
+ boolean * status);
+ AST * bt_parse_file (char * filename,
+ btshort options,
+ boolean * overall_status);
+
+ /* AST traversal/query */
+ AST * bt_next_entry (AST * entry_list,
+ AST * prev_entry);
+ AST * bt_next_field (AST *entry, AST *prev, char **name);
+ AST * bt_next_value (AST *head,
+ AST *prev,
+ bt_nodetype_t *nodetype,
+ char **text);
+
+ bt_metatype_t bt_entry_metatype (AST *entry);
+ char *bt_entry_type (AST *entry);
+ char *bt_entry_key (AST *entry);
+ char *bt_get_text (AST *node);
+
+ /* Splitting names and lists of names */
+ bt_stringlist * bt_split_list (char * string,
+ char * delim,
+ char * filename,
+ int line,
+ char * description);
+ void bt_free_list (bt_stringlist *list);
+ bt_name * bt_split_name (char * name,
+ char * filename,
+ int line,
+ int name_num);
+ void bt_free_name (bt_name * name);
+
+ /* Formatting names */
+ bt_name_format * bt_create_name_format (char * parts, boolean abbrev_first);
+ void bt_free_name_format (bt_name_format * format);
+ void bt_set_format_text (bt_name_format * format,
+ bt_namepart part,
+ char * pre_part,
+ char * post_part,
+ char * pre_token,
+ char * post_token);
+ void bt_set_format_options (bt_name_format * format,
+ bt_namepart part,
+ boolean abbrev,
+ bt_joinmethod join_tokens,
+ bt_joinmethod join_part);
+ char * bt_format_name (bt_name * name, bt_name_format * format);
+
+ /* Construct tree from TeX groups */
+ bt_tex_tree * bt_build_tex_tree (char * string);
+ void bt_free_tex_tree (bt_tex_tree **top);
+ void bt_dump_tex_tree (bt_tex_tree *node, int depth, FILE *stream);
+ char * bt_flatten_tex_tree (bt_tex_tree *top);
+
+ /* Miscellaneous string utilities */
+ void bt_purify_string (char * string, btshort options);
+ void bt_change_case (char transform, char * string, btshort options);
+
+=head1 DESCRIPTION
+
+B<btparse> is a C library for parsing and processing BibTeX files. It
+provides a lexical scanner and LL parser (constructed by PCCTS), both of
+which are efficient and offer good error detection and recovery; a set
+of functions for traversing the AST (abstract syntax tree) generated by
+the parser; and utility functions for manipulating strings according to
+BibTeX conventions. (Note that nothing in the library assumes that
+you're using BibTeX files for their original purpose of bibliographic
+data for scholarly publications; you could use the file format for any
+conceivable purpose that fits it. However, there is some code in the
+library that is really only appropriate for use with strings meant to be
+processed in the same way that BibTeX itself does. This is all entirely
+optional, though.)
+
+Note that the interface provided by B<btparse>, while complete, is
+fairly low-level. If you have more sophisticated needs, you might be
+interested in my C<Text::BibTeX> module for Perl 5 (available on CPAN).
+
+=head1 CONCEPTS AND TERMINOLOGY
+
+To understand this document and use B<btparse>, you should already be
+familiar with the BibTeX language---more specifically, the BibTeX data
+description language. (BibTeX being the complex beast that it is, one
+can conceive of the term applying to the program, the data language, the
+particular database structure described in the original BibTeX
+documentation, the ".bst" formatting language, and the set of
+conventions embodied in the standard styles included with the BibTeX
+distribution. In this document, I'll stick to the first two
+meanings---the data language because that's what B<btparse> deals with,
+and the program because it's occasionally necessary to explain
+differences between my parser and BibTeX's.)
+
+In particular, you should have a good idea what's going on in the
+following:
+
+ @string{and = { and },
+ joe = "Blow, Joe",
+ john = "John Smith"}
+
+ @book(ourbook,
+ author = joe # and # john,
+ title = {Our Little Book})
+
+If this looks like something you want to parse, but don't want to have
+to write your own parser for, you've come to the right place.
+
+Before going much further, though, you're going to have to learn some of
+the terminology I use for describing BibTeX data. Most of it's the same
+as you'll find in any BibTeX documentation, but it's important to be
+sure that we're talking about the same things here. So, some
+definitions:
+
+=over 4
+
+=item top-level
+
+All text in a BibTeX file from the start of the file to the start of the
+first entry, and between entries thereafter.
+
+=item name
+
+A string of letters, digits, and the following characters:
+
+ ! $ & * + - . / : ; < > ? [ ] ^ _ ` |
+
+A "name" is a catch-all used for entry types, entry keys, and field and
+macro names. For BibTeX compatibility, there are slightly different
+rules for these four entities; currently, the only such rule actually
+implemented is that field and macro names may not begin with a digit.
+Some names in the above example: C<string>, C<and>.
+
+=item entry
+
+A chunk of text starting with an "at" sign (C<@>) at top-level, followed
+by a name (the I<entry type>), an I<entry delimiter> (C<{> or C<(>), and
+proceeding to the matching closing delimiter. Also, the data structure
+that results from parsing this chunk of text. There are two entries in
+the above example.
+
+=item entry type
+
+The name that comes right after an C<@> at top-level. Examples from
+above: C<string>, C<book>.
+
+=item entry metatype
+
+A classification of entry types that allows us to group one or more
+entry types under the same heading. With the standard BibTeX database
+structure, C<article>, C<book>, C<inbook>, etc. all fall under the
+"regular entry" metatype. Other metatypes are "macro definition" (for
+C<string> entries), "preamble" (for C<preamble> entries), and "comment"
+(C<comment> entries). In fact, any entry whose type is not one of
+C<string>, C<preamble>, or C<comment> is called a "regular" entry.
+
+=item entry delimiters
+
+C<{> and C<}>, or C<(> and C<)>: the pair of characters that (almost)
+mark the boundaries of an entry. "Almost" because the start of an entry
+is marked by an C<@>, not by the "entry open" delimiter.
+
+=item entry key
+
+(Or just I<key> when it's clear what we're speaking of.) The name
+immediately following the entry open delimiter in a regular entry, which
+uniquely identifies the entry. Example from above: C<ourbook>. Only
+regular entries have keys.
+
+=item field
+
+A name to the left of an equals sign in a regular or macro-definition
+entry. In the latter context, might also be called a macro name.
+Examples from above: C<joe>, C<author>.
+
+=item field list
+
+In a regular entry, everything between the entry delimiters except for
+the entry key. In a macro definition entry, everything between the
+entry delimiters (possibly also called a macro list).
+
+=item compound value
+
+(Usually just "value".) The text that follows an equals sign (C<=>) in
+a regular or macro definition entry, up to a comma or the entry close
+delimiter; a list of one or more simple values joined by hash signs
+(C<#>).
+
+=item simple value
+
+A string, macro, or number.
+
+=item string
+
+(Or, sometimes, "quoted string.") A chunk of text between quotes (C<">)
+or braces (C<{> and C<}>). Braces must balance: C<{this is a {string}>
+is not a BibTeX string, but C<{this is a {string}}> is.
+(C<"this is a {string"> is also illegal, mainly to avoid the possibility
+of generating bogus TeX code--which BibTeX will do in certain cases.)
+
+=item macro
+
+A name that appears on the right-hand side of an equals sign (i.e. as
+one simple value in a compound value). Implies that this name was
+defined as a macro in an earlier macro definition entry, but this is
+only checked if B<btparse> is being asked to expand macros to their full
+definitions.
+
+=item number
+
+An unquoted string of digits.
+
+=back
+
+Working with B<btparse> generally consists of passing the library some
+BibTeX data (or a source for some BibTeX data, such as a filename or a
+file pointer), which it then lexically scans and parses to construct an
+abstract syntax tree (AST). It returns this AST to you, and you
+call other B<btparse> functions to traverse and query the tree.
+
+The contents of AST nodes are the private domain of the library, and you
+shouldn't go poking into them. This being C, though, there's nothing to
+prevent you from doing so except good manners and the possibility that I
+might change the AST structure in future releases, breaking any
+badly-behaved code. Also, it's not necessary to know the structural
+relationships between nodes in the AST---that's taken care of by the
+query/traversal functions.
+
+However, it's useful to know some of the things that B<btparse> deposits
+in the AST and returns to you through those query/traversal functions.
+First off, each node has a "node type," which records the syntactic
+element corresponding to each node. For instance, the entry
+
+ @book{mybook, author = "Joe Blow", title = "My Little Book"}
+
+is rooted by an "entry" node; under this would be found a "key" node
+(for the entry key), two "field" nodes (for the "author" and "title"
+fields); and associated with each field node would be a "string" node.
+The only time this concerns you is when you ask the library for a simple
+value; just looking at the text is not enough to distinguish quoted
+strings, numbers, and macro names, so B<btparse> returns the nodetype as
+well.
+
+In addition to the nodetype, B<btparse> records the metatype of each
+"entry" node. This allows you (and the library) to distinguish, say,
+regular entries from comment entries. Not only do they have very
+different structures and must therefore be traversed differently by the
+library, but certain traversal functions make no sense on certain entry
+metatypes---thus it's necessary for you to be able to make the
+distinction as well.
+
+That said, everything you need to know to work with the AST is explained
+in L<bt_traversal>.
+
+=head1 DATA TYPES AND MACROS
+
+B<btparse> defines several types required for the external interface.
+First, it trivially defines a C<boolean> type (along with C<TRUE> and
+C<FALSE> macros). This might affect you when including the F<btparse.h>
+header in your own code---since it's not possible for the code to detect
+if there is already a C<boolean> type defined, you might have to define
+the C<HAVE_BOOLEAN> pre-processor token to deactivate F<btparse.h>'s
+C<typedef> of C<boolean>.
+
+Next, two enumeration types are defined: C<bt_metatype> and
+C<bt_nodetype>. Both of these are used extensively in the library
+itself, and are made available to users of the library because they can
+be found in nodes of the C<btparse> AST (abstract syntax tree). (I.e.,
+querying the AST can give you C<bt_metatype> and C<bt_nodetype>
+values, so the C<typedef>s must be available to your code.)
+
+=head2 Entry metatype enum
+
+C<bt_metatype_t> has the following values:
+
+=over 4
+
+=item *
+
+C<BTE_UNKNOWN>
+
+=item *
+
+C<BTE_REGULAR>
+
+=item *
+
+C<BTE_COMMENT>
+
+=item *
+
+C<BTE_PREAMBLE>
+
+=item *
+
+C<BTE_MACRODEF>
+
+=back
+
+which are determined by the "entry type" token. (C<@string> entries
+have the C<BTE_MACRODEF> metatype; C<@comment> and C<@preamble>
+correspond to C<BTE_COMMENT> and C<BTE_PREAMBLE>; and any other entry
+type has the C<BTE_REGULAR> metatype.)
+
+=head2 AST nodetype enum
+
+C<bt_nodetype> has the following values:
+
+=over 4
+
+=item *
+
+C<BTAST_UNKNOWN>
+
+=item *
+
+C<BTAST_ENTRY>
+
+=item *
+
+C<BTAST_KEY>
+
+=item *
+
+C<BTAST_FIELD>
+
+=item *
+
+C<BTAST_STRING>
+
+=item *
+
+C<BTAST_NUMBER>
+
+=item *
+
+C<BTAST_MACRO>
+
+=back
+
+Of these, you'll only ever deal with the last three. They are returned
+when you query the AST for a simple value---just seeing the text isn't
+enough to distinguish between a quoted string, a number, and a macro, so
+the AST nodetype is supplied along with the text.
+
+=head2 String processing option macros
+
+Since BibTeX is essentially a system for glueing strings together in a
+wide variety of ways, the processing done to its strings is fairly
+important. Most of the string transformations are done outside of the
+lexer/parser; this reduces their complexity, and makes it easier to
+switch different transformations on and off. This switching is done
+with an "options" bitmap which can be specified on a per-entry-metatype
+basis. (That is, you can have one set of transformations done to the
+strings in all regular entries, another set done to the strings in all
+macro definition entries, and so on.) If you need finer control than
+that, it's currently unavailable outside of the library (but it's just a
+matter of making a couple functions available and documenting them---so
+bug me if you need this feature).
+
+There are four basic macros for constructing this bitmap:
+
+=over 4
+
+=item C<BTO_CONVERT>
+
+Convert "number" values to strings. (The conversion is trivial,
+involving changing the type of the AST node representing the number from
+C<BTAST_NUMBER> to C<BTAST_STRING>. "Number" values are stored as
+strings of digits, just as they are in the input data.)
+
+=item C<BTO_EXPAND>
+
+Expand macro invocations to the full macro text.
+
+=item C<BTO_PASTE>
+
+Paste simple values together.
+
+=item C<BTO_COLLAPSE>
+
+Collapse whitespace according to the BibTeX rules.
+
+=back
+
+For instance, supplying C<BTO_CONVERT | BTO_EXPAND> as the string
+options bitmap for the C<BTE_REGULAR> metatype means that all simple
+values in "regular" entries will be converted to strings: numbers will
+simply have their "nodetype" changed, and macros will be expanded.
+Nothing else will be done to the simple values, though---they will not
+be concatenated, nor will whitespace be collapsed. See the
+C<bt_set_stringopts()> and C<bt_parse_*()> functions in L<bt_input> for
+more information on the various options for parsing; see
+L<bt_postprocess> for details on the post-processing.
+
+=head1 USING THE LIBRARY
+
+The following code is a skeletal example of using the B<btparse>
+library:
+
+ #include <btparse.h>
+
+ int main (void)
+ {
+ bt_initialize ();
+
+ /* process some data */
+
+ bt_cleanup ();
+ exit (0);
+ }
+
+Please note the call to C<bt_initialize()>; this is very important!
+Without it, the library may crash or fail mysteriously. You I<must>
+call C<bt_initialize()> before calling any other B<btparse> functions.
+C<bt_cleanup()> just frees the memory allocated by C<bt_initialize()>;
+if you are careful to call it before exiting, and C<bt_free_ast()> on
+any abstract syntax trees generated by B<btparse> when you are done with
+them, then your program shouldn't have any memory leaks. (Unless
+they're due to your own code, of course!)
+
+=head1 BUGS AND LIMITATIONS
+
+B<btparse> has several inherent limitations that are due to the lexical
+scanner and parser generated by PCCTS 1.x. In short, the scanner and
+parser are both heavily dependent on global variables, meaning that
+thread safety -- or even the ability to have two files open and being
+parsed at the same time -- is well-nigh impossible. This will not
+change until I get with the times and adopt ANTLR 2.0, the successor to
+PCCTS -- presuming of course that it can generate more modular C
+scanners and parsers.
+
+Another limitation that is due to PCCTS: entries with a large number of
+fields (more than about 90, if each field value is just a single string)
+will cause the parser to crash. This is unavoidable due to the parser
+using statically-allocated stacks for attributes and abstract-syntax
+tree nodes. I could increase the static allocation, but that would just
+decrease the likelihood of encountering the problem, not make it go
+away. Again, the chances of this changing as long as I'm using PCCTS
+1.x are nil.
+
+Apart from those inherent limitations, there are no known bugs in
+B<btparse>. Any segmentation faults or bus errors from the library
+should be considered bugs. They probably result from using the library
+incorrectly (e.g., attempting to interleave the parsing of two files), but
+I do make an attempt to catch all such mistakes, and if I've missed any
+I'd like to know about it.
+
+Any memory leaks from the library are also a concern; as long as you are
+conscientious about calling the cleanup functions (C<bt_free_ast()> and
+C<bt_cleanup()>), then the library shouldn't leak.
+
+=head1 SEE ALSO
+
+To read and parse BibTeX data files, see L<bt_input>.
+
+To traverse the syntax tree that results, see L<bt_traversal>.
+
+To learn what is done to values in parsed entries, and how to customize
+that munging, see L<bt_postprocess>.
+
+To learn how B<btparse> deals with strings, see L<bt_strings> (oops, I
+haven't written this one yet!).
+
+To manipulate and access the B<btparse> macro table, see L<bt_macros>.
+
+For splitting author names and lists "the BibTeX way" using B<btparse>,
+see L<bt_split_names>.
+
+To put author names back together again, see L<bt_format_names>.
+
+Miscellaneous functions for processing strings "the BibTeX way":
+L<bt_misc>.
+
+A semi-formal language definition is in L<bt_language>.
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1996-97 by Gregory P. Ward.
+
+This library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Library General Public License as published
+by the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+This library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+=head1 AVAILABILITY
+
+The btOOL home page, where you can get up-to-date information about
+B<btparse> (and download the latest version) is
+
+ http://starship.python.net/~gward/btOOL/
+
+You will also find the latest version of B<Text::BibTeX>, the Perl
+library that provides a high-level front-end to B<btparse>, there.
+B<btparse> is needed to build C<Text::BibTeX>, and must be downloaded
+separately.
+
+Both libraries are also available on CTAN (the Comprehensive TeX Archive
+Network, C<http://www.ctan.org/tex-archive/>) and CPAN (the Comprehensive
+Perl Archive Network, C<http://www.cpan.org/>). Look in
+F<biblio/bibtex/utils/btOOL/> on CTAN, and F<authors/Greg_Ward/> on
+CPAN. For example,
+
+ http://www.ctan.org/tex-archive/biblio/bibtex/utils/btOOL/
+ http://www.cpan.org/authors/Greg_Ward
+
+will both get you to the latest version of C<Text::BibTeX> and B<btparse>
+-- but of course, you should always access busy sites like CTAN and CPAN
+through a mirror.
diff --git a/btparse/pccts/antlr.h b/btparse/pccts/antlr.h
new file mode 100644
index 0000000..d54220b
--- /dev/null
+++ b/btparse/pccts/antlr.h
@@ -0,0 +1,625 @@
+/* antlr.h
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * ANTLR 1.33
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+#ifndef ANTLR_H
+#define ANTLR_H
+
+#include "config.h"
+
+/*
+ * Define all of the stack setup and manipulation of $i, #i variables.
+ *
+ * Notes:
+ * The type 'Attrib' must be defined before entry into this .h file.
+ */
+
+#ifdef __USE_PROTOS
+#include <stdlib.h>
+#else
+#ifdef VAXC
+#include <stdlib.h>
+#else
+#include <malloc.h>
+#endif
+#endif
+#include <string.h>
+
+typedef int ANTLRTokenType; /* token type codes are plain ints */
+typedef unsigned char SetWordType; /* element type of the SetWordType* set arguments taken by the _zzsetmatch*() routines below */
+
+typedef char ANTLRChar; /* plain char alias; presumably for token text -- not used in the visible part of this header */
+
+ /* G u e s s S t u f f */
+
+#ifdef ZZCAN_GUESS /* guess mode implies infinite lookahead: force ZZINF_LOOK on */
+#ifndef ZZINF_LOOK
+#define ZZINF_LOOK
+#endif
+#endif
+
+#ifdef ZZCAN_GUESS
+typedef struct _zzjmp_buf { /* jmp_buf wrapped in a struct; zzGUESS / zzGUESS_FAIL setjmp/longjmp on .state */
+ jmp_buf state; /* NOTE(review): needs <setjmp.h>, which is not included in the visible part of this header -- confirm config.h or the includer provides it */
+ } zzjmp_buf;
+#endif
+
+
+/* can make this a power of 2 for more efficient lookup */
+#ifndef ZZLEXBUFSIZE /* overridable: define it before including this header */
+#define ZZLEXBUFSIZE 2000 /* bytes per token-text buffer (rows of zztextLA, zztoktext, initial zzbufsize) */
+#endif
+
+#define zzOvfChk /* overflow guard used by zzMakeAttr/zzMake0/zzaPush: pushes decrement zzasp, so zzasp <= 0 means the attribute stack is full */ \
+ if ( zzasp <= 0 ) \
+ { \
+ fprintf(stderr, zzStackOvfMsg, __FILE__, __LINE__); \
+ exit(PCCTS_EXIT_FAILURE); /* fatal: the process exits, there is no recovery path */ \
+ }
+
+#ifndef ZZA_STACKSIZE /* overridable before inclusion */
+#define ZZA_STACKSIZE 400 /* number of slots in zzaStack[] and initial value of zzasp (see ANTLR_INFO) */
+#endif
+#ifndef ZZAST_STACKSIZE /* overridable before inclusion */
+#define ZZAST_STACKSIZE 400 /* presumably the AST stack capacity; its user is not in the visible part of this header */
+#endif
+
+#ifndef zzfailed_pred /* the includer may pre-define its own handler */
+#define zzfailed_pred(_p) /* default: print the predicate text _p to stderr; execution continues */ \
+ fprintf(stderr, "semantic error; failed predicate: '%s'\n",_p)
+#endif
+
+#ifdef LL_K /* k-token lookahead: LL_K token codes and texts, indexed modulo LL_K via zzlap */
+#define LOOKAHEAD /* expands to the lookahead variable definitions */ \
+ int zztokenLA[LL_K]; \
+ char zztextLA[LL_K][ZZLEXBUFSIZE]; \
+ int zzlap = 0, zzlabase=0; /* labase only used for DEMAND_LOOK */
+#else /* no LL_K: a single current-token variable */
+#define LOOKAHEAD \
+ int zztoken;
+#endif
+
+#ifndef zzcr_ast /* only supplies a default when the includer has not defined zzcr_ast */
+#define zzcr_ast(ast,attr,tok,text) /* default expands to nothing */
+#endif
+
+#ifdef DEMAND_LOOK
+#define DemandLookData int zzdirty=1; /* zzdirty is read and updated by LOOK, zzCONSUME and zzPrimeLookAhead */
+#else
+#define DemandLookData /* nothing to define without DEMAND_LOOK */
+#endif
+
+ /* S t a t e S t u f f */
+
+#ifdef ZZCAN_GUESS
+#define zzGUESS_BLOCK zzantlr_state zzst; int zzrv; /* locals: saved parser state and the setjmp() result */
+#define zzGUESS zzsave_antlr_state(&zzst); \
+ zzguessing = 1; \
+ zzrv = setjmp(zzguess_start.state); /* zzGUESS_FAIL returns here with zzrv == 1 */
+#define zzGUESS_FAIL longjmp(zzguess_start.state, 1) /* abandon the guess: jump back to the setjmp in zzGUESS */
+#define zzGUESS_DONE zzrestore_antlr_state(&zzst); /* restore the state saved by zzGUESS */
+#define zzNON_GUESS_MODE if ( !zzguessing ) /* statement prefix: guards whatever statement follows it */
+#define zzGuessData /* definitions of the guess-mode globals */ \
+ zzjmp_buf zzguess_start; \
+ int zzguessing;
+#else /* guessing disabled: every macro expands to nothing, so a statement after zzNON_GUESS_MODE always runs */
+#define zzGUESS_BLOCK
+#define zzGUESS
+#define zzGUESS_FAIL
+#define zzGUESS_DONE
+#define zzNON_GUESS_MODE
+#define zzGuessData
+#endif
+
+typedef struct _zzantlr_state { /* parser state snapshot; the type of zzst passed to zzsave_antlr_state/zzrestore_antlr_state by zzGUESS/zzGUESS_DONE */
+#ifdef ZZCAN_GUESS
+ zzjmp_buf guess_start;
+ int guessing;
+#endif
+ int asp; /* field names mirror the zz-prefixed globals (zzasp, zzast_sp, ...) -- presumably one-to-one; the save/restore code is not in this header */
+ int ast_sp;
+#ifdef ZZINF_LOOK
+ int inf_lap; /* not sure we need to save this one */
+ int inf_labase;
+ int inf_last;
+#endif
+#ifdef DEMAND_LOOK
+ int dirty;
+#endif
+
+#ifdef LL_K /* same shapes as the LOOKAHEAD variables for the LL_K case */
+ int tokenLA[LL_K];
+ char textLA[LL_K][ZZLEXBUFSIZE];
+ int lap;
+ int labase;
+#else /* single-token case: one token code and one text buffer */
+ int token;
+ char text[ZZLEXBUFSIZE];
+#endif
+ } zzantlr_state;
+
+
+ /* I n f i n i t e L o o k a h e a d */
+
+
+#ifdef ZZINF_LOOK
+#define InfLookData /* definitions of the infinite-lookahead globals */ \
+ int *zzinf_tokens; /* indexed by ZZINF_LA(); free()d by zzleaveANTLR* */ \
+ char **zzinf_text; /* indexed by ZZINF_LATEXT(); free()d by zzleaveANTLR* */ \
+ char *zzinf_text_buffer; /* free()d by zzleaveANTLR* */ \
+ int *zzinf_line; /* free()d by zzleaveANTLR* */ \
+ int zzinf_labase; /* base index used by the ZZINF_LA* accessors */ \
+ int zzinf_last; /* upper bound tested by ZZINF_LA_VALID() */
+#else
+#define InfLookData /* nothing to define without ZZINF_LOOK */
+#endif
+
+#ifdef ZZINF_LOOK
+
+#ifndef ZZINF_DEF_TEXT_BUFFER_SIZE /* all four sizes below are overridable before inclusion */
+#define ZZINF_DEF_TEXT_BUFFER_SIZE 20000 /* presumably the initial text buffer size; the allocation code is not in this header */
+#endif
+#ifndef ZZINF_DEF_TOKEN_BUFFER_SIZE
+#define ZZINF_DEF_TOKEN_BUFFER_SIZE 2000 /* presumably the initial token buffer size; the allocation code is not in this header */
+#endif
+/* WARNING!!!!!!
+ * ZZINF_BUFFER_TEXT_CHUNK_SIZE must be > sizeof(text) largest possible token.
+ */
+#ifndef ZZINF_BUFFER_TEXT_CHUNK_SIZE
+#define ZZINF_BUFFER_TEXT_CHUNK_SIZE 5000 /* compared against ZZLEXBUFSIZE just below */
+#endif
+#ifndef ZZINF_BUFFER_TOKEN_CHUNK_SIZE
+#define ZZINF_BUFFER_TOKEN_CHUNK_SIZE 1000 /* presumably the token buffer growth increment -- confirm against the buffer code */
+#endif
+
+/* Enlarge the text chunk when a single token buffer would not fit in it. */
+#if ZZLEXBUFSIZE > ZZINF_BUFFER_TEXT_CHUNK_SIZE
+#undef ZZINF_BUFFER_TEXT_CHUNK_SIZE /* always defined by this point; redefining with a different body needs #undef first */
+#define ZZINF_BUFFER_TEXT_CHUNK_SIZE ((ZZLEXBUFSIZE)+5) /* parenthesized so an expression-valued ZZLEXBUFSIZE and surrounding operators cannot regroup it */
+#endif
+
+/* make inf_look user-access macros */
+#ifdef LL_K /* with LL_K the index is shifted back by LL_K-1 relative to the non-LL_K form */
+#define ZZINF_LA_VALID(i) (((zzinf_labase+i-1)-LL_K+1) <= zzinf_last) /* true when lookahead i lies at or before zzinf_last */
+#define ZZINF_LA(i) zzinf_tokens[(zzinf_labase+i-1)-LL_K+1] /* token code of lookahead i (1-based) */
+#define ZZINF_LATEXT(i) zzinf_text[(zzinf_labase+i-1)-LL_K+1] /* text pointer of lookahead i (1-based) */
+/* #define ZZINF_LINE(i) zzinf_line[(zzinf_labase+i-1)-LL_K+1]*/
+#else
+#define ZZINF_LA_VALID(i) (((zzinf_labase+i-1)) <= zzinf_last)
+#define ZZINF_LA(i) zzinf_tokens[(zzinf_labase+i-1)] /* NOTE(review): i is not parenthesized in any of these accessors (LA(i) further down does use (i)); pass simple expressions only */
+#define ZZINF_LATEXT(i) zzinf_text[(zzinf_labase+i-1)]
+#endif
+
+#define inf_zzgettok _inf_zzgettok() /* object-like macro: written without parentheses at the use site */
+extern void _inf_zzgettok(); /* defined outside this header */
+
+#endif /* ZZINF_LOOK */
+
+
+#ifdef LL_K /* NOTE(review): the LL_K and non-LL_K branches below define textually identical ANTLR_INFO bodies */
+
+#ifdef __USE_PROTOS /* ANSI prototypes; the #else branch is the same code with K&R-style definitions */
+#define ANTLR_INFO /* expands to the DEFINITIONS of the parser globals and two Attrib helper functions */ \
+ Attrib zzempty_attr(void) {static Attrib a; return a;} /* returns a copy of a never-assigned static Attrib */ \
+ Attrib zzconstr_attr(int _tok, char *_text)\
+ {Attrib a; zzcr_attr((&a),_tok,_text); return a;} /* builds an Attrib through the zzcr_attr macro */ \
+ int zzasp=ZZA_STACKSIZE; /* attribute stack pointer starts one past the last slot; pushes decrement it */ \
+ char zzStackOvfMsg[]="fatal: attrib/AST stack overflow %s(%d)!\n"; /* format used by zzOvfChk with __FILE__ and __LINE__ */ \
+ Attrib zzaStack[ZZA_STACKSIZE]; DemandLookData \
+ InfLookData \
+ zzGuessData
+#else
+#define ANTLR_INFO \
+ Attrib zzempty_attr() {static Attrib a; return a;} \
+ Attrib zzconstr_attr(_tok, _text) int _tok; char *_text;\
+ {Attrib a; zzcr_attr((&a),_tok,_text); return a;} \
+ int zzasp=ZZA_STACKSIZE; \
+ char zzStackOvfMsg[]="fatal: attrib/AST stack overflow %s(%d)!\n"; \
+ Attrib zzaStack[ZZA_STACKSIZE]; DemandLookData \
+ InfLookData \
+ zzGuessData
+#endif
+
+#else
+
+#ifdef __USE_PROTOS
+#define ANTLR_INFO \
+ Attrib zzempty_attr(void) {static Attrib a; return a;} \
+ Attrib zzconstr_attr(int _tok, char *_text)\
+ {Attrib a; zzcr_attr((&a),_tok,_text); return a;} \
+ int zzasp=ZZA_STACKSIZE; \
+ char zzStackOvfMsg[]="fatal: attrib/AST stack overflow %s(%d)!\n"; \
+ Attrib zzaStack[ZZA_STACKSIZE]; DemandLookData \
+ InfLookData \
+ zzGuessData
+#else
+#define ANTLR_INFO \
+ Attrib zzempty_attr() {static Attrib a; return a;} \
+ Attrib zzconstr_attr(_tok, _text) int _tok; char *_text;\
+ {Attrib a; zzcr_attr((&a),_tok,_text); return a;} \
+ int zzasp=ZZA_STACKSIZE; \
+ char zzStackOvfMsg[]="fatal: attrib/AST stack overflow %s(%d)!\n"; \
+ Attrib zzaStack[ZZA_STACKSIZE]; DemandLookData \
+ InfLookData \
+ zzGuessData
+#endif
+
+#endif /* LL_k */
+
+
+#ifdef ZZINF_LOOK /* zzPrimeLookAhead: initial lookahead setup, one variant per ZZINF_LOOK x LL_K x DEMAND_LOOK combination */
+
+#ifdef LL_K
+#ifdef DEMAND_LOOK
+#define zzPrimeLookAhead {zzdirty=LL_K; zzlap = zzlabase = 0;} /* demand mode: fetch nothing now, just mark all LL_K slots dirty */
+#else
+#define zzPrimeLookAhead {zzlap = zzlabase = 0; zzfill_inf_look();\
+ {int _i; for(_i=1;_i<=LL_K; _i++) /* consume LL_K times, then reset the indices */ \
+ {zzCONSUME;} zzlap = zzlabase = 0;}}
+#endif
+
+#else /* LL_K */
+
+#ifdef DEMAND_LOOK
+#define zzPrimeLookAhead zzfill_inf_look(); zzdirty=1 /* two statements, not wrapped in braces */
+#else
+#define zzPrimeLookAhead zzfill_inf_look(); inf_zzgettok
+
+#endif
+#endif /* LL_K */
+
+#else /* ZZINF_LOOK */
+
+#ifdef LL_K
+#ifdef DEMAND_LOOK
+#define zzPrimeLookAhead {zzdirty=LL_K; zzlap = zzlabase = 0;}
+#else
+#define zzPrimeLookAhead {int _i; zzlap = 0; for(_i=1;_i<=LL_K; _i++) /* consume LL_K times, then reset zzlap */ \
+ {zzCONSUME;} zzlap = 0;}
+#endif
+
+#else
+
+#ifdef DEMAND_LOOK
+#define zzPrimeLookAhead zzdirty=1 /* defer the first fetch to LOOK() */
+#else
+#define zzPrimeLookAhead zzgettok() /* fetch the first token immediately */
+#endif
+#endif /* LL_K */
+
+#endif /* ZZINF_LOOK */
+
+
+#ifdef LL_K /* enter macros: point zzlextext at a text buffer, select the input source, prime the lookahead */
+#define zzenterANTLRs(s) /* input selected with zzrdstr(s) */ \
+ zzlextext = &(zztextLA[0][0]); zzrdstr( s ); zzPrimeLookAhead;
+#define zzenterANTLRf(f) /* input selected with zzrdfunc(f) */ \
+ zzlextext = &(zztextLA[0][0]); zzrdfunc( f ); zzPrimeLookAhead;
+#define zzenterANTLR(f) /* input selected with zzrdstream(f) */ \
+ zzlextext = &(zztextLA[0][0]); zzrdstream( f ); zzPrimeLookAhead;
+#ifdef ZZINF_LOOK /* leave macros free the four infinite-lookahead buffers; the argument f is unused */
+#define zzleaveANTLR(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#define zzleaveANTLRf(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#define zzleaveANTLRs(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#else /* nothing to release without ZZINF_LOOK */
+#define zzleaveANTLR(f)
+#define zzleaveANTLRf(f)
+#define zzleaveANTLRs(f)
+#endif
+
+#else
+
+#define zzenterANTLRs(s) \
+ {static char zztoktext[ZZLEXBUFSIZE]; /* static: this buffer outlives the braces, since zzlextext keeps pointing at it */ \
+ zzlextext = zztoktext; zzrdstr( s ); zzPrimeLookAhead;}
+#define zzenterANTLRf(f) \
+ {static char zztoktext[ZZLEXBUFSIZE]; \
+ zzlextext = zztoktext; zzrdfunc( f ); zzPrimeLookAhead;}
+#define zzenterANTLR(f) \
+ {static char zztoktext[ZZLEXBUFSIZE]; \
+ zzlextext = zztoktext; zzrdstream( f ); zzPrimeLookAhead;}
+#ifdef ZZINF_LOOK /* same leave macros as in the LL_K branch */
+#define zzleaveANTLR(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#define zzleaveANTLRf(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#define zzleaveANTLRs(f) free(zzinf_text_buffer); free(zzinf_text); free(zzinf_tokens); free(zzinf_line);
+#else
+#define zzleaveANTLR(f)
+#define zzleaveANTLRf(f)
+#define zzleaveANTLRs(f)
+#endif
+
+#endif
+
+#define ANTLR(st, f) zzbufsize = ZZLEXBUFSIZE; /* entry point: set up via zzenterANTLR(f), run start-rule statement st, clean up */ \
+ zzenterANTLR(f); \
+ st; ++zzasp; /* move the attribute stack pointer back up one slot after st */ \
+ zzleaveANTLR(f);
+
+#define ANTLRm(st, f, _m) zzbufsize = ZZLEXBUFSIZE; /* as ANTLR, but calls zzmode(_m) first */ \
+ zzmode(_m); \
+ zzenterANTLR(f); \
+ st; ++zzasp; \
+ zzleaveANTLR(f);
+
+#define ANTLRf(st, f) zzbufsize = ZZLEXBUFSIZE; /* as ANTLR, using the zzenterANTLRf/zzleaveANTLRf pair */ \
+ zzenterANTLRf(f); \
+ st; ++zzasp; \
+ zzleaveANTLRf(f);
+
+#define ANTLRs(st, s) zzbufsize = ZZLEXBUFSIZE; /* as ANTLR, using the zzenterANTLRs/zzleaveANTLRs pair */ \
+ zzenterANTLRs(s); \
+ st; ++zzasp; \
+ zzleaveANTLRs(s);
+
+#ifdef LL_K
+#define zztext (&(zztextLA[zzlap][0])) /* text buffer of the slot currently indexed by zzlap */
+#else
+#define zztext zzlextext /* single-token case: alias for the lexer text pointer */
+#endif
+
+
+ /* A r g u m e n t A c c e s s */
+
+#define zzaCur (zzaStack[zzasp]) /* slot at the current attribute stack pointer */
+#define zzaRet (*zzaRetPtr) /* return attribute of the rule; zzaRetPtr is declared by zzRULE */
+#define zzaArg(v,n) zzaStack[(v)-(n)] /* slot n below marker v; arguments parenthesized so e.g. zzaArg(b, x-1) indexes b-(x-1), not b-x-1 */
+#define zzMakeAttr { zzNON_GUESS_MODE {zzOvfChk; --zzasp; zzcr_attr(&(zzaStack[zzasp]),LA(1), (char*)LATEXT(1));}} /* push an attribute built from the current lookahead token; skipped while guessing */
+#ifdef zzdef0 /* optional user hook that initializes the freshly pushed slot */
+#define zzMake0 { zzOvfChk; --zzasp; zzdef0(&(zzaStack[zzasp]));}
+#else
+#define zzMake0 { zzOvfChk; --zzasp;} /* push a slot without initializing it */
+#endif
+#define zzaPush(_v) { zzOvfChk; zzaStack[--zzasp] = _v;} /* push the value _v */
+#ifndef zzd_attr
+#define zzREL(t) zzasp=(t); /* Restore state of stack */
+#else /* user-supplied destructor: call zzd_attr on every slot from zzasp up to, not including, t */
+#define zzREL(t) for (; zzasp<(t); zzasp++) \
+ { zzd_attr(&(zzaStack[zzasp])); }
+#endif
+
+#define zzsetmatch(_es) /* on failure jumps to a label named fail, which the expanding function must provide; error details go into the zzRULE locals */ \
+ if ( !_zzsetmatch(_es, &zzBadText, &zzMissText, &zzMissTok, &zzBadTok, &zzMissSet) ) goto fail;
+#define zzsetmatch_wsig(_es, handler) /* signal variant: on failure sets _signal and jumps to the given handler label */ \
+ if ( !_zzsetmatch_wsig(_es) ) {_signal=MismatchedToken; goto handler;}
+
+#ifdef __USE_PROTOS /* prototypes when available, otherwise old-style declarations; definitions live outside this header */
+extern int _zzsetmatch(SetWordType *, char **, char **, int *, int *, SetWordType **);
+extern int _zzsetmatch_wsig(SetWordType *);
+#else
+extern int _zzsetmatch();
+extern int _zzsetmatch_wsig();
+#endif
+
+#define zzmatch(_t) /* single-token counterpart of zzsetmatch: same fail label and error locals */ \
+ if ( !_zzmatch(_t, &zzBadText, &zzMissText, &zzMissTok, &zzBadTok, &zzMissSet) ) goto fail;
+#define zzmatch_wsig(_t,handler) /* single-token counterpart of zzsetmatch_wsig */ \
+ if ( !_zzmatch_wsig(_t) ) {_signal=MismatchedToken; goto handler;}
+
+#ifdef __USE_PROTOS
+extern int _zzmatch(int, char **, char **, int *, int *, SetWordType **);
+extern int _zzmatch_wsig(int);
+#else
+extern int _zzmatch();
+extern int _zzmatch_wsig();
+#endif
+
+#define zzmatch_wdfltsig(_t,_f) /* default-signal variant: on failure only sets _signal, no jump */ \
+ if ( !_zzmatch_wdfltsig(_t,_f) ) _signal=MismatchedToken;
+#define zzsetmatch_wdfltsig(tw,tt,wf) /* set counterpart of zzmatch_wdfltsig */ \
+ if ( !_zzsetmatch_wdfltsig(tw,tt,wf) ) _signal=MismatchedToken;
+
+#ifdef __USE_PROTOS
+extern int _zzmatch_wdfltsig(int, SetWordType *);
+extern int _zzsetmatch_wdfltsig(SetWordType *tokensWanted,
+ int tokenTypeOfSet,
+ SetWordType *whatFollows);
+#else
+extern int _zzmatch_wdfltsig();
+extern int _zzsetmatch_wdfltsig();
+#endif
+
+#ifdef GENAST /* AST-building variant: same locals plus whatever zzASTVars declares (defined outside this header) */
+#define zzRULE Attrib *zzaRetPtr = &(zzaStack[zzasp-1]); /* per-rule locals: return slot (used by zzaRet) and the error-report variables passed to zzmatch/zzsetmatch */ \
+ SetWordType *zzMissSet=NULL; int zzMissTok=0; \
+ int zzBadTok=0; char *zzBadText=""; \
+ int zzErrk=1; \
+ char *zzMissText=""; zzASTVars
+#else
+#define zzRULE Attrib *zzaRetPtr = &(zzaStack[zzasp-1]); /* as above, without the AST variables */ \
+ int zzBadTok=0; char *zzBadText=""; \
+ int zzErrk=1; \
+ SetWordType *zzMissSet=NULL; int zzMissTok=0; char *zzMissText=""
+#endif
+
+#ifdef GENAST
+#define zzBLOCK(i) int i = zzasp - 1; int zztsp = zzast_sp /* record attribute and AST stack markers on block entry */
+#define zzEXIT(i) zzREL(i); zzastREL; zzNON_GUESS_MODE { zzastPush(*_root); } /* release both stacks, then push *_root unless guessing */
+#define zzLOOP(i) zzREL(i); zzastREL /* release both stacks between loop iterations */
+#else
+#define zzBLOCK(i) int i = zzasp - 1 /* record the attribute stack marker on block entry */
+#define zzEXIT(i) zzREL(i) /* release the attribute stack back to marker i */
+#define zzLOOP(i) zzREL(i)
+#endif
+
+#ifdef LL_K
+
+#ifdef DEMAND_LOOK
+#define LOOK(_k) {int i,stop=_k-(LL_K-zzdirty); for (i=1; i<=stop; i++) \
+ zzCONSUME;}
+#define zzCONSUME {zzgettok(); zzdirty--; \
+ zzlap = (zzlap+1)&(LL_K-1); \
+ zzlextext = &(zztextLA[zzlap][0]);}
+#else
+#ifdef ZZINF_LOOK
+#define zzCONSUME {inf_zzgettok; \
+ zzlap = (zzlap+1)&(LL_K-1); \
+ zzlextext = &(zztextLA[zzlap][0]); \
+ }
+#else
+#define zzCONSUME {zzgettok(); \
+ zzlap = (zzlap+1)&(LL_K-1); \
+ zzlextext = &(zztextLA[zzlap][0]);}
+#endif /* ZZINF_LOOK */
+#endif /* DEMAND_LOOK */
+
+#else /* LL_K */
+
+#ifdef DEMAND_LOOK
+#define LOOK(_k) if ( zzdirty) zzCONSUME;
+#ifdef ZZINF_LOOK
+#define zzCONSUME inf_zzgettok; zzdirty=0;
+#else
+#define zzCONSUME zzgettok(); zzdirty=0;
+#endif /* ZZINF_LOOK */
+
+#else /* DEMAND_LOOK */
+
+#ifdef ZZINF_LOOK
+#define zzCONSUME inf_zzgettok
+#else
+#define zzCONSUME zzgettok();
+#endif
+
+#endif /* DEMAND_LOOK */
+
+#endif /* LL_K */
+
+#ifdef LL_K
+#define NLA zztokenLA[zzlap&(LL_K-1)] /* --> next LA */
+#define NLATEXT zztextLA[zzlap&(LL_K-1)] /* --> next text of LA */
+#ifdef DEMAND_LOOK
+#define LA(i) zztokenLA[(zzlabase+(i)-1)&(LL_K-1)]
+#define LATEXT(i) (&(zztextLA[(zzlabase+(i)-1)&(LL_K-1)][0]))
+#else
+#define LA(i) zztokenLA[(zzlap+(i)-1)&(LL_K-1)]
+#define LATEXT(i) (&(zztextLA[(zzlap+(i)-1)&(LL_K-1)][0]))
+#endif
+#else
+#define NLA zztoken
+#define NLATEXT zztext
+#define LA(i) zztoken
+#define LATEXT(i) zztext
+#endif
+
+
+ /* S t a n d a r d S i g n a l s */
+
+#define NoSignal 0
+#define MismatchedToken 1
+#define NoViableAlt 2
+#define NoSemViableAlt 3
+
+
+ /* F u n c t i o n T r a c i n g */
+
+#ifndef zzTRACEIN
+#define zzTRACEIN(r) fprintf(stderr, "enter rule \"%s\"\n", r);
+#endif
+#ifndef zzTRACEOUT
+#define zzTRACEOUT(r) fprintf(stderr, "exit rule \"%s\"\n", r);
+#endif
+
+#ifdef ZZWCHAR_T
+#define zzchar_t wchar_t /* wchar_t cannot be combined with "unsigned": that is not a valid type */
+#else
+#define zzchar_t unsigned char
+#endif
+
+ /* E x t e r n D e f s */
+
+#ifdef __USE_PROTOS
+extern Attrib zzempty_attr(void);
+extern Attrib zzconstr_attr(int, char *);
+extern void zzsyn(char *, int, char *, SetWordType *, int, int, char *);
+extern int zzset_el(unsigned, SetWordType *);
+extern int zzset_deg(SetWordType *);
+extern void zzedecode(SetWordType *);
+extern void zzFAIL(int k, ...);
+extern void zzresynch(SetWordType *, SetWordType);
+extern void zzsave_antlr_state(zzantlr_state *);
+extern void zzrestore_antlr_state(zzantlr_state *);
+extern void zzfill_inf_look(void);
+#ifdef EXCEPTION_HANDLING
+extern void zzdflthandlers(int, int *);
+#endif
+#else
+extern Attrib zzempty_attr();
+extern Attrib zzconstr_attr();
+extern void zzsyn();
+extern int zzset_el();
+extern int zzset_deg();
+extern void zzedecode();
+extern void zzFAIL();
+extern void zzresynch();
+extern void zzsave_antlr_state();
+extern void zzrestore_antlr_state();
+extern void zzfill_inf_look();
+#ifdef EXCEPTION_HANDLING
+extern void zzdflthandlers();
+#endif
+#endif
+
+ /* G l o b a l V a r i a b l e s */
+
+/* Define a parser; user should do a "#parser myname" in their grammar file */
+/*extern struct pccts_parser zzparser;*/
+
+extern char *zztokens[];
+#ifdef LL_K
+extern int zztokenLA[];
+extern char zztextLA[][ZZLEXBUFSIZE];
+extern int zzlap;
+extern int zzlabase;
+#else
+extern int zztoken;
+#endif
+
+extern char zzStackOvfMsg[];
+extern int zzasp;
+extern Attrib zzaStack[];
+#ifdef ZZINF_LOOK
+extern int *zzinf_tokens;
+extern char **zzinf_text;
+extern char *zzinf_text_buffer;
+extern int *zzinf_line;
+extern int zzinf_labase;
+extern int zzinf_last;
+#endif
+#ifdef DEMAND_LOOK
+extern int zzdirty;
+#endif
+#ifdef ZZCAN_GUESS
+extern int zzguessing;
+extern zzjmp_buf zzguess_start;
+#endif
+
+/* Define global variables that refer to values exported by the scanner.
+ * These declarations duplicate those in dlgdef.h, but are needed
+ * if ANTLR is not to generate a .dlg file (-gx); PS, this is a hack.
+ */
+extern zzchar_t *zzlextext; /* text of most recently matched token */
+extern int zzbufsize; /* how long zzlextext is */
+
+#endif
diff --git a/btparse/pccts/ast.c b/btparse/pccts/ast.c
new file mode 100644
index 0000000..da6b8de
--- /dev/null
+++ b/btparse/pccts/ast.c
@@ -0,0 +1,283 @@
+/* Abstract syntax tree manipulation functions
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * ANTLR 1.33
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+#ifdef __STDC__
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+/* ensure that tree manipulation variables (*_root, *_sibling, *_tail)
+ * are current after a rule reference
+ */
+void
+#ifdef __STDC__
+zzlink(AST **_root, AST **_sibling, AST **_tail)
+#else
+zzlink(_root, _sibling, _tail)
+AST **_root, **_sibling, **_tail;
+#endif
+{
+ if ( *_sibling == NULL ) return; /* no sibling list: nothing to link */
+ if ( *_root == NULL ) *_root = *_sibling; /* no root yet: first sibling becomes the root */
+ else if ( *_root != *_sibling ) (*_root)->down = *_sibling; /* otherwise hang the list under the root */
+ if ( *_tail==NULL ) *_tail = *_sibling;
+ while ( (*_tail)->right != NULL ) *_tail = (*_tail)->right; /* advance *_tail to the last sibling */
+}
+
+AST * /* allocate one AST node; a message goes to stderr and NULL is returned on failure */
+#ifdef __STDC__
+zzastnew(void)
+#else
+zzastnew()
+#endif
+{
+ AST *p = (AST *) calloc(1, sizeof(AST)); /* calloc: the node starts zero-filled */
+ if ( p == NULL ) fprintf(stderr,"%s(%d): cannot allocate AST node\n",__FILE__,__LINE__);
+ return p; /* still NULL when the allocation failed */
+}
+
+/* add a child node to the current sibling list (all work is inside zzNON_GUESS_MODE) */
+void
+#ifdef __STDC__
+zzsubchild(AST **_root, AST **_sibling, AST **_tail)
+#else
+zzsubchild(_root, _sibling, _tail)
+AST **_root, **_sibling, **_tail;
+#endif
+{
+ AST *n;
+ zzNON_GUESS_MODE {
+ n = zzastnew(); /* NOTE(review): zzastnew() can return NULL; n is not checked before use */
+#ifdef DEMAND_LOOK
+ zzcr_ast(n, &(zzaCur), LA(0), LATEXT(0));
+#else
+ zzcr_ast(n, &(zzaCur), LA(1), LATEXT(1));
+#endif
+ zzastPush( n );
+ if ( *_tail != NULL ) (*_tail)->right = n; /* append after the current last sibling */
+ else {
+ *_sibling = n; /* list was empty: n starts it */
+ if ( *_root != NULL ) (*_root)->down = *_sibling;
+ }
+ *_tail = n; /* n is now the last sibling */
+ if ( *_root == NULL ) *_root = *_sibling;
+ }
+}
+
+/* make a new AST node. Make the newly-created
+ * node the root for the current sibling list. If a root node already
+ * exists, make the newly-created node the root of the current root.
+ */
+void
+#ifdef __STDC__
+zzsubroot(AST **_root, AST **_sibling, AST **_tail)
+#else
+zzsubroot(_root, _sibling, _tail)
+AST **_root, **_sibling, **_tail;
+#endif
+{
+ AST *n;
+ zzNON_GUESS_MODE {
+ n = zzastnew(); /* NOTE(review): zzastnew() can return NULL; n is not checked before use */
+#ifdef DEMAND_LOOK
+ zzcr_ast(n, &(zzaCur), LA(0), LATEXT(0));
+#else
+ zzcr_ast(n, &(zzaCur), LA(1), LATEXT(1));
+#endif
+ zzastPush( n );
+ if ( *_root != NULL )
+ if ( (*_root)->down == *_sibling ) *_sibling = *_tail = *_root; /* old root becomes the only sibling */
+ *_root = n;
+ (*_root)->down = *_sibling; /* the sibling list now hangs under the new root */
+ }
+}
+
+/* Preorder walk: apply func to each node, descend via ->down, then follow ->right
+ * example: print tree in child-sibling LISP-format (AST has token field)
+ *
+ * void show(tree)
+ * AST *tree;
+ * {
+ * if ( tree == NULL ) return;
+ * printf(" %s", zztokens[tree->token]);
+ * }
+ *
+ * void before() { printf(" ("); }
+ * void after() { printf(" )"); }
+ *
+ * LISPdump() { zzpre_ast(tree, show, before, after); }
+ * before/after are called only for nodes that have children (tree->down != NULL).
+ */
+void
+#ifdef __STDC__
+zzpre_ast(
+ AST *tree,
+ void (*func)(AST *), /* apply this to each tree node */
+ void (*before)(AST *), /* apply this to root of subtree before preordering it */
+ void (*after)(AST *)) /* apply this to root of subtree after preordering it */
+#else
+zzpre_ast(tree, func, before, after)
+AST *tree;
+void (*func)(), /* apply this to each tree node */
+ (*before)(), /* apply this to root of subtree before preordering it */
+ (*after)(); /* apply this to root of subtree after preordering it */
+#endif
+{
+ while ( tree!= NULL ) /* iterate over the sibling list */
+ {
+ if ( tree->down != NULL ) (*before)(tree);
+ (*func)(tree);
+ zzpre_ast(tree->down, func, before, after); /* recurse into the children */
+ if ( tree->down != NULL ) (*after)(tree);
+ tree = tree->right;
+ }
+}
+
+/* free all AST nodes in tree; each node is released through zztfree() */
+void
+#ifdef __STDC__
+zzfree_ast(AST *tree)
+#else
+zzfree_ast(tree)
+AST *tree;
+#endif
+{
+ if ( tree == NULL ) return;
+ zzfree_ast( tree->down ); /* children first */
+ zzfree_ast( tree->right ); /* then the following siblings */
+ zztfree( tree ); /* finally the node itself */
+}
+
+/* build a tree (root child1 child2 ... NULL)
+ * If root is NULL, simply make the children siblings and return ptr
+ * to 1st sibling (child1). If root is not single node, return NULL.
+ *
+ * Siblings that are actually sibling lists themselves are handled
+ * correctly. For example #( NULL, #( NULL, A, B, C), D) results
+ * in the tree ( NULL A B C D ).
+ *
+ * Requires at least two parameters with the last one being NULL. If
+ * both are NULL, return NULL.
+ */
+#ifdef __STDC__
+AST *zztmake(AST *rt, ...)
+#else
+AST *zztmake(va_alist)
+va_dcl
+#endif
+{
+ va_list ap;
+ register AST *child, *sibling=NULL, *tail=NULL, *w;
+ AST *root;
+
+#ifdef __STDC__
+ va_start(ap, rt);
+ root = rt;
+#else
+ va_start(ap);
+ root = va_arg(ap, AST *);
+#endif
+
+ if ( root != NULL )
+ if ( root->down != NULL ) { va_end(ap); return NULL; } /* every va_start needs its va_end */
+ child = va_arg(ap, AST *);
+ while ( child != NULL )
+ {
+ for (w=child; w->right!=NULL; w=w->right) {;} /* find end of child */
+ if ( sibling == NULL ) {sibling = child; tail = w;}
+ else {tail->right = child; tail = w;}
+ child = va_arg(ap, AST *);
+ }
+ if ( root==NULL ) root = sibling;
+ else root->down = sibling;
+ va_end(ap);
+ return root;
+}
+
+/* tree duplicate: copies t, its children (->down) and its following siblings (->right) */
+AST *
+#ifdef __STDC__
+zzdup_ast(AST *t)
+#else
+zzdup_ast(t)
+AST *t;
+#endif
+{
+ AST *u;
+ if ( t == NULL ) return NULL;
+ u = zzastnew();
+ if ( u == NULL ) return NULL; /* zzastnew() already reported the failure; do not write through NULL */
+ *u = *t; /* struct copy: any AST_FIELDS members are copied by value */
+#ifdef zzAST_DOUBLE
+ u->up = NULL; /* set by calling invocation */
+ u->left = NULL;
+#endif
+ u->right = zzdup_ast(t->right);
+ u->down = zzdup_ast(t->down);
+#ifdef zzAST_DOUBLE
+ if ( u->right!=NULL ) u->right->left = u;
+ if ( u->down!=NULL ) u->down->up = u;
+#endif
+ return u;
+}
+
+void /* release a single AST node (not its children or siblings) */
+#ifdef __STDC__
+zztfree(AST *t)
+#else
+zztfree(t)
+AST *t;
+#endif
+{
+#ifdef zzd_ast
+ zzd_ast( t ); /* optional user-supplied node destructor macro */
+#endif
+ free( t );
+}
+
+#ifdef zzAST_DOUBLE
+/*
+ * Set the 'up', and 'left' pointers of all nodes in 't'.
+ * Initial call is zzdouble_link(your_tree, NULL, NULL).
+ */
+void
+#ifdef __STDC__
+zzdouble_link(AST *t, AST *left, AST *up)
+#else
+zzdouble_link(t, left, up)
+AST *t, *left, *up;
+#endif
+{
+ if ( t==NULL ) return;
+ t->left = left;
+ t->up = up;
+ zzdouble_link(t->down, NULL, t); /* first child: no left neighbour, parent is t */
+ zzdouble_link(t->right, t, up); /* next sibling: left neighbour is t, same parent */
+}
+#endif
diff --git a/btparse/pccts/ast.h b/btparse/pccts/ast.h
new file mode 100644
index 0000000..2f5deda
--- /dev/null
+++ b/btparse/pccts/ast.h
@@ -0,0 +1,115 @@
+/* Abstract syntax tree
+ *
+ * Macros, definitions
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * ANTLR 1.33
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+
+#ifndef ZZAST_H
+#define ZZAST_H
+
+#define zzastOvfChk \
+ if ( zzast_sp <= 0 ) \
+ { \
+ fprintf(stderr, zzStackOvfMsg, __FILE__, __LINE__); \
+ exit(PCCTS_EXIT_FAILURE); \
+ }
+
+#ifndef USER_DEFINED_AST
+#ifndef AST_FIELDS /* extra node members may be supplied by defining AST_FIELDS beforehand */
+#define AST_FIELDS
+#endif
+
+typedef struct _ast {
+ struct _ast *right, *down; /* next sibling (see zzsibling) and first child (see zzchild) */
+#ifdef zzAST_DOUBLE
+ struct _ast *left, *up; /* back links, filled in by zzdouble_link() */
+#endif
+ AST_FIELDS
+} AST;
+
+#else /* USER_DEFINED_AST: no AST typedef is emitted here, only the required link members */
+
+#ifdef zzAST_DOUBLE
+#define AST_REQUIRED_FIELDS struct _ast *right, *down, *left, *up;
+#else
+#define AST_REQUIRED_FIELDS struct _ast *right, *down;
+#endif
+
+#endif
+
+
+/* N o d e a c c e s s m a c r o s */
+#define zzchild(t) (((t)==NULL)?NULL:((t)->down)) /* first child of t, or NULL; t is evaluated twice */
+#define zzsibling(t) (((t)==NULL)?NULL:((t)->right)) /* next sibling of t, or NULL; t is evaluated twice */
+
+
+/* define global variables needed by #i stack */
+#define zzASTgvars \
+ AST *zzastStack[ZZAST_STACKSIZE]; \
+ int zzast_sp = ZZAST_STACKSIZE;
+
+#define zzASTVars AST *_ast = NULL, *_sibling = NULL, *_tail = NULL
+#define zzSTR ( (_tail==NULL)?(&_sibling):(&(_tail->right)) )
+#define zzastCur (zzastStack[zzast_sp])
+#define zzastArg(i) (zzastStack[zztsp-(i)]) /* (i) parenthesized so zzastArg(n+1) indexes zztsp-(n+1) */
+#define zzastPush(p) zzastOvfChk; zzastStack[--zzast_sp] = p;
+#define zzastDPush --zzast_sp
+#define zzastMARK zztsp=zzast_sp; /* Save state of stack */
+#define zzastREL zzast_sp=zztsp; /* Return state of stack */
+#define zzrm_ast {zzfree_ast(*_root); _tail = _sibling = (*_root)=NULL;}
+
+extern int zzast_sp;
+extern AST *zzastStack[];
+
+#ifdef __STDC__
+void zzlink(AST **, AST **, AST **);
+void zzsubchild(AST **, AST **, AST **);
+void zzsubroot(AST **, AST **, AST **);
+void zzpre_ast(AST *, void (*)(), void (*)(), void (*)());
+void zzfree_ast(AST *);
+AST *zztmake(AST *, ...);
+AST *zzdup_ast(AST *);
+void zztfree(AST *);
+void zzdouble_link(AST *, AST *, AST *);
+AST *zzastnew(void);
+
+#else
+
+void zzlink();
+AST *zzastnew();
+void zzsubchild();
+void zzsubroot();
+void zzpre_ast();
+void zzfree_ast();
+AST *zztmake();
+AST *zzdup_ast();
+void zztfree();
+void zzdouble_link();
+#endif
+
+#endif
diff --git a/btparse/pccts/config.h b/btparse/pccts/config.h
new file mode 100644
index 0000000..83db19b
--- /dev/null
+++ b/btparse/pccts/config.h
@@ -0,0 +1,230 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+/*
+ * config.h (for ANTLR, DLG, and SORCERER)
+ *
+ * This is a simple configuration file that doesn't have config stuff
+ * in it, but it's a start.
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * Used by PCCTS 1.33 (SORCERER 1.00B11 and up)
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+
+/* This file knows about the following ``environments''
+ UNIX (default)
+ DOS (use #define PC)
+ MAC (use #define MPW; has a few things for THINK C, Metrowerks)
+ */
+
+/* should test __STDC__ for 1, but some compilers don't set value, just def */
+#ifdef __STDC__
+#define __USE_PROTOS
+#endif
+#ifdef __cplusplus
+#ifndef __USE_PROTOS
+#define __USE_PROTOS
+#endif
+#endif
+
+/*
+* Define PC32 if in a 32-bit PC environment (e.g. extended DOS or Win32).
+* The macros tested here are defined by Watcom, Microsoft, Borland,
+* and djgpp, respectively, when they are used as 32-bit compilers.
+* Users of these compilers *must* be sure to define PC in their
+* makefiles for this to work correctly.
+*/
+#ifdef PC
+# if (defined(__WATCOM__) || defined(_WIN32) || defined(__WIN32__) || \
+ defined(__GNUC__) || defined(__GNUG__))
+# ifndef PC32
+# define PC32
+# endif
+# endif
+#endif
+
+#ifdef PC
+#define ATOKEN_H "AToken.h"
+#define ATOKPTR_H "ATokPtr.h"
+#define ATOKPTR_C "ATokPtr.cpp"
+#define ATOKENBUFFER_H "ATokBuf.h"
+#define ATOKENBUFFER_C "ATokBuf.cpp"
+#define ATOKENSTREAM_H "ATokStr.h"
+#define APARSER_H "AParser.h"
+#define APARSER_C "AParser.cpp"
+#define ASTBASE_H "ASTBase.h"
+#define ASTBASE_C "ASTBase.cpp"
+#define PCCTSAST_C "PCCTSAST.cpp"
+#define LIST_C "List.cpp"
+#define DLEXERBASE_H "DLexBase.h"
+#define DLEXERBASE_C "DLexBase.cpp"
+#define DLEXER_C "DLexer.cpp"
+#define STREESUPPORT_C "STreeSup.C"
+#else
+#define ATOKEN_H "AToken.h"
+#define ATOKPTR_H "ATokPtr.h"
+#define ATOKPTR_C "ATokPtr.cpp"
+#define ATOKENBUFFER_H "ATokenBuffer.h"
+#define ATOKENBUFFER_C "ATokenBuffer.cpp"
+#define ATOKENSTREAM_H "ATokenStream.h"
+#define APARSER_H "AParser.h"
+#define APARSER_C "AParser.cpp"
+#define ASTBASE_H "ASTBase.h"
+#define ASTBASE_C "ASTBase.cpp"
+#define PCCTSAST_C "PCCTSAST.cpp"
+#define LIST_C "List.cpp"
+#define DLEXERBASE_H "DLexerBase.h"
+#define DLEXERBASE_C "DLexerBase.cpp"
+#define DLEXER_C "DLexer.cpp"
+#define STREESUPPORT_C "STreeSupport.cpp"
+#endif
+
+/* SORCERER Stuff */
+#ifdef PC
+#define STPARSER_H "STreePar.h"
+#define STPARSER_C "STreePar.C"
+#else
+#define STPARSER_H "STreeParser.h"
+#define STPARSER_C "STreeParser.cpp"
+#endif
+
+#ifdef MPW
+#define CPP_FILE_SUFFIX ".cp"
+#define CPP_FILE_SUFFIX_NO_DOT "cp"
+#define OBJ_FILE_SUFFIX ".o"
+#else
+#ifdef PC
+#define CPP_FILE_SUFFIX ".cpp"
+#define CPP_FILE_SUFFIX_NO_DOT "cpp"
+#define OBJ_FILE_SUFFIX ".obj"
+#else
+#define CPP_FILE_SUFFIX ".cpp"
+#define CPP_FILE_SUFFIX_NO_DOT "cpp"
+#define OBJ_FILE_SUFFIX ".o"
+#endif
+#endif
+
+/* User may redefine how line information looks */
+#define LineInfoFormatStr "# %d \"%s\"\n"
+
+#ifdef MPW /* Macintosh Programmer's Workshop */
+#define ErrHdr "File \"%s\"; Line %d #"
+#else
+#define ErrHdr "%s, line %d:"
+#endif
+
+
+/* must assume old K&R cpp here, can't use #if defined(..)... */
+
+#ifdef MPW
+#define TopDirectory ":"
+#define DirectorySymbol ":"
+#define OutputDirectoryOption "Directory where all output files should go (default=\":\")"
+#else
+#ifdef PC
+#define TopDirectory "."
+#define DirectorySymbol "\\"
+#define OutputDirectoryOption "Directory where all output files should go (default=\".\")"
+#else
+#define TopDirectory "."
+#define DirectorySymbol "/"
+#define OutputDirectoryOption "Directory where all output files should go (default=\".\")"
+#endif
+#endif
+
+#ifdef MPW
+
+/* Make sure we have prototypes for all functions under MPW */
+
+#include <string.h>
+#include <stdlib.h>
+#include <CursorCtl.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void fsetfileinfo (char *filename, unsigned long newcreator, unsigned long newtype);
+#ifdef __cplusplus
+}
+#endif
+
+/* File creators for various popular development environments */
+
+#define MAC_FILE_CREATOR 'MPS ' /* MPW Text files */
+#if 0
+#define MAC_FILE_CREATOR 'KAHL' /* THINK C/Symantec C++ Text files */
+#endif
+#if 0
+#define MAC_FILE_CREATOR 'MMCC' /* Metrowerks C/C++ Text files */
+#endif
+
+#endif
+
+#ifdef MPW
+#define DAWDLE SpinCursor(1)
+#else
+#define DAWDLE
+#endif
+
+
+/*
+ * useless definitions of special_inits() and special_fopen_actions()
+ * deleted -- GPW 1997/09/06
+ */
+
+/* Define usable bits for set.c stuff */
+#define BytesPerWord sizeof(unsigned)
+#define WORDSIZE (sizeof(unsigned)*8)
+#define LogWordSize (WORDSIZE==16?4:5)
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#ifdef VAXC
+#define PCCTS_EXIT_SUCCESS 1
+#define PCCTS_EXIT_FAILURE 0
+#define zzDIE return 0;
+#define zzDONE return 1;
+
+#else /* !VAXC */
+
+#define PCCTS_EXIT_SUCCESS 0
+#define PCCTS_EXIT_FAILURE 1
+#define zzDIE return 1;
+#define zzDONE return 0;
+
+#endif
+
+#ifdef USER_ZZMODE_STACK
+# ifndef ZZSTACK_MAX_MODE
+# define ZZSTACK_MAX_MODE 32
+# endif
+# define ZZMAXSTK (ZZSTACK_MAX_MODE * 2)
+#endif
+
+#endif
diff --git a/btparse/pccts/dlgauto.h b/btparse/pccts/dlgauto.h
new file mode 100644
index 0000000..0f29359
--- /dev/null
+++ b/btparse/pccts/dlgauto.h
@@ -0,0 +1,474 @@
+/* dlgauto.h automaton
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * ANTLR 1.33
+ * Will Cohen and Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+
+#ifndef ZZDEFAUTO_H
+#define ZZDEFAUTO_H
+
+zzchar_t *zzlextext; /* text of most recently matched token */
+zzchar_t *zzbegexpr; /* beginning of last reg expr recogn. */
+zzchar_t *zzendexpr; /* end (last character) of last reg expr recogn. */
+int zzbufsize; /* number of characters in zzlextext */
+int zzbegcol = 0; /* column that first character of token is in*/
+int zzendcol = 0; /* column that last character of token is in */
+int zzline = 1; /* line current token is on */
+int zzreal_line=1; /* line of 1st portion of token that is not skipped */
+int zzchar; /* character to determine next state */
+int zzbufovf; /* indicates that buffer too small for text */
+int zzcharfull = 0; /* set to 1 by zzadvance() after it reads a character into zzchar */
+static zzchar_t *zznextpos;/* points to next available position in zzlextext*/
+static int zzclass; /* ZZSHIFT(zzchar): the input class used to index the dfa row */
+
+#ifdef __USE_PROTOS
+void zzerrstd(const char *);
+void (*zzerr)(const char *)=zzerrstd;/* pointer to error reporting function */
+extern int zzerr_in(void);
+#else
+void zzerrstd();
+void (*zzerr)()=zzerrstd; /* pointer to error reporting function */
+extern int zzerr_in();
+#endif
+
+static FILE *zzstream_in=0;
+static int (*zzfunc_in)() = zzerr_in;
+static zzchar_t *zzstr_in=0;
+
+#ifdef USER_ZZMODE_STACK
+int zzauto = 0;
+#else
+static int zzauto = 0;
+#endif
+static int zzadd_erase;
+static char zzebuf[70];
+
+#ifdef ZZCOL
+#define ZZINC (++zzendcol)
+#else
+#define ZZINC
+#endif
+
+
+#define ZZGETC_STREAM {zzchar = getc(zzstream_in); zzclass = ZZSHIFT(zzchar);}
+#define ZZGETC_FUNC {zzchar = (*zzfunc_in)(); zzclass = ZZSHIFT(zzchar);}
+#define ZZGETC_STR { \
+ if (*zzstr_in){ \
+ zzchar = *zzstr_in; \
+ ++zzstr_in; \
+ }else{ \
+ zzchar = EOF; \
+ } \
+ zzclass = ZZSHIFT(zzchar); \
+}
+
+#define ZZNEWSTATE (newstate = dfa[state][zzclass])
+
+#ifndef ZZCOPY
+#define ZZCOPY \
+ /* Truncate matching buffer to size (not an error) */ \
+ if (zznextpos < lastpos){ \
+ *(zznextpos++) = zzchar; \
+ }else{ \
+ zzbufovf = 1; \
+ }
+#endif
+
+void /* make FILE f the scanner input; a NULL f leaves the current input untouched */
+#ifdef __USE_PROTOS
+zzrdstream( FILE *f )
+#else
+zzrdstream( f )
+FILE *f;
+#endif
+{
+ /* make sure that it is really set to something, otherwise just
+ leave it be.
+ */
+ if (f){
+ /* make sure that there is always someplace to get input
+ before closing zzstream_in
+ */
+#if 0 /* compiled out: the previous stream is never closed here */
+ if (zzstream_in && zzstream_in!=stdin) fclose( zzstream_in );
+#endif
+ zzline = 1; /* line counting restarts for the new input */
+ zzstream_in = f;
+ zzfunc_in = NULL; /* the other two input sources are switched off */
+ zzstr_in = 0;
+ zzcharfull = 0;
+ }
+}
+
+void /* make function f the scanner input (called once per character); NULL f is ignored */
+#ifdef __USE_PROTOS
+zzrdfunc( int (*f)() )
+#else
+zzrdfunc( f )
+int (*f)();
+#endif
+{
+ /* make sure that it is really set to something, otherwise just
+ leave it be.
+ */
+ if (f){
+ /* make sure that there is always someplace to get input
+ before closing zzstream_in
+ */
+#if 0 /* compiled out: the previous stream is never closed here */
+ if (zzstream_in && zzstream_in!=stdin) fclose( zzstream_in );
+#endif
+ zzline = 1; /* line counting restarts for the new input */
+ zzstream_in = NULL; /* the other two input sources are switched off */
+ zzfunc_in = f;
+ zzstr_in = 0;
+ zzcharfull = 0;
+ }
+}
+
+
+void /* make the NUL-terminated string s the scanner input; NULL s is ignored */
+#ifdef __USE_PROTOS
+zzrdstr( zzchar_t *s )
+#else
+zzrdstr( s )
+zzchar_t *s;
+#endif
+{
+ /* make sure that it is really set to something, otherwise just
+ leave it be.
+ */
+ if (s){
+ /* make sure that there is always someplace to get input
+ before closing zzstream_in
+ */
+#if 0 /* compiled out: the previous stream is never closed here */
+ if (zzstream_in && zzstream_in!=stdin) fclose( zzstream_in );
+#endif
+ zzline = 1; /* line counting restarts for the new input */
+ zzstream_in = NULL; /* the other two input sources are switched off */
+ zzfunc_in = 0;
+ zzstr_in = s; /* the pointer is kept, not copied (ZZGETC_STR reads through it) */
+ zzcharfull = 0;
+ }
+}
+
+
+void /* currently a no-op: the whole body is compiled out with #if 0 */
+zzclose_stream()
+{
+#if 0
+ fclose( zzstream_in );
+ zzstream_in = NULL;
+ zzfunc_in = NULL;
+#endif
+}
+
+/* saves dlg state, but not what feeds dlg (such as file position) */
+void
+#ifdef __USE_PROTOS
+zzsave_dlg_state(struct zzdlg_state *state)
+#else
+zzsave_dlg_state(state)
+struct zzdlg_state *state;
+#endif
+{
+ state->stream = zzstream_in; /* the three input sources */
+ state->func_ptr = zzfunc_in;
+ state->str = zzstr_in;
+ state->auto_num = zzauto;
+ state->add_erase = zzadd_erase;
+ state->lookc = zzchar;
+ state->char_full = zzcharfull;
+ state->begcol = zzbegcol;
+ state->endcol = zzendcol;
+ state->line = zzline; /* note: zzreal_line is not among the saved fields */
+ state->lextext = zzlextext; /* pointers only; the buffer contents are not copied */
+ state->begexpr = zzbegexpr;
+ state->endexpr = zzendexpr;
+ state->bufsize = zzbufsize;
+ state->bufovf = zzbufovf;
+ state->nextpos = zznextpos;
+ state->class_num = zzclass;
+}
+
+void /* inverse of zzsave_dlg_state(): copies every saved field back into the scanner globals */
+#ifdef __USE_PROTOS
+zzrestore_dlg_state(struct zzdlg_state *state)
+#else
+zzrestore_dlg_state(state)
+struct zzdlg_state *state;
+#endif
+{
+ zzstream_in = state->stream; /* the three input sources */
+ zzfunc_in = state->func_ptr;
+ zzstr_in = state->str;
+ zzauto = state->auto_num;
+ zzadd_erase = state->add_erase;
+ zzchar = state->lookc;
+ zzcharfull = state->char_full;
+ zzbegcol = state->begcol;
+ zzendcol = state->endcol;
+ zzline = state->line; /* note: zzreal_line is not restored */
+ zzlextext = state->lextext; /* pointers only; the buffer contents are not copied */
+ zzbegexpr = state->begexpr;
+ zzendexpr = state->endexpr;
+ zzbufsize = state->bufsize;
+ zzbufovf = state->bufovf;
+ zznextpos = state->nextpos;
+ zzclass = state->class_num;
+}
+
+void /* select automaton (lexical mode) m; an out-of-range m is reported through zzerr */
+#ifdef __USE_PROTOS
+zzmode( int m )
+#else
+zzmode( m )
+int m;
+#endif
+{
+ /* zzauto indexes dfa_base[] in zzgettok(): accept only 0 <= m < MAX_MODE */
+ if (m>=0 && m<MAX_MODE){
+ zzauto = m;
+ /* have to redo class since using different compression */
+ zzclass = ZZSHIFT(zzchar);
+ }else{
+ snprintf(zzebuf, sizeof(zzebuf), "Invalid automaton mode = %d ",m); /* easier to track bugs */
+ zzerr(zzebuf);
+ }
+}
+
+/* erase what is currently in the buffer, and get a new reg. expr */
+void
+zzskip()
+{
+ zzadd_erase = 1; /* zzgettok() tests this after the action and jumps to its "skip" label */
+}
+
+/* don't erase what is in the zzlextext buffer, add on to it */
+void
+zzmore()
+{
+ zzadd_erase = 2; /* zzgettok() tests this after the action and jumps to its "more" label */
+}
+
+/* substitute c for the reg. expr last matched and is in the buffer */
+#ifdef __USE_PROTOS
+void
+zzreplchar(zzchar_t c)
+#else
+void
+zzreplchar(c)
+zzchar_t c;
+#endif
+{
+ /* can't allow overwriting null at end of string */
+ if (zzbegexpr < &zzlextext[zzbufsize-1]){
+ *zzbegexpr = c;
+ *(zzbegexpr+1) = '\0';
+ }
+ zzendexpr = zzbegexpr; /* NOTE(review): both pointers are moved even when the guard above failed */
+ zznextpos = zzbegexpr + 1;
+}
+
+/* replace the reg. expr last matched (in the buffer) with string s; a NULL s replaces it with "" */
+void
+#ifdef __USE_PROTOS
+zzreplstr(register zzchar_t *s)
+#else
+zzreplstr(s)
+register zzchar_t *s;
+#endif
+{
+ register zzchar_t *l= &zzlextext[zzbufsize -1]; /* address of the last byte of zzlextext */
+
+ zznextpos = zzbegexpr;
+ if (s){
+ while ((zznextpos <= l) && (*(zznextpos++) = *(s++))!=0){
+ /* empty */
+ }
+ /* correct for NULL at end of string */
+ zznextpos--;
+ }
+ if ((zznextpos <= l) && (!s || *(--s) == 0)){ /* never step back from / read through a NULL s */
+ zzbufovf = 0; /* the terminator was copied (or there was nothing to copy) */
+ }else{
+ zzbufovf = 1; /* the copy stopped at the end of the buffer: text truncated */
+ }
+ *(zznextpos) = '\0';
+ zzendexpr = zznextpos - 1;
+}
+
+void /* run the current automaton over the input, then call the action of the state reached */
+zzgettok()
+{
+ register int state, newstate;
+ /* last space reserved for the null char */
+ zzchar_t *lastpos; /* GPW 1997/09/05 (removed 'register') */
+
+skip: /* re-entered when the action called zzskip(): the buffer is restarted from zzlextext */
+ zzreal_line = zzline;
+ zzbufovf = 0;
+ lastpos = &zzlextext[zzbufsize-1];
+ zznextpos = zzlextext;
+ zzbegcol = zzendcol+1;
+more: /* re-entered when the action called zzmore(): new text is appended at zznextpos */
+ zzbegexpr = zznextpos;
+#ifdef ZZINTERACTIVE
+ /* interactive version of automaton */
+ /* if there is something in zzchar, process it */
+ state = newstate = dfa_base[zzauto];
+ if (zzcharfull){
+ ZZINC;
+ ZZCOPY;
+ ZZNEWSTATE;
+ }
+ if (zzstr_in)
+ while (zzalternatives[newstate]){
+ state = newstate;
+ ZZGETC_STR;
+ ZZINC;
+ ZZCOPY;
+ ZZNEWSTATE;
+ }
+ else if (zzstream_in)
+ while (zzalternatives[newstate]){
+ state = newstate;
+ ZZGETC_STREAM;
+ ZZINC;
+ ZZCOPY;
+ ZZNEWSTATE;
+ }
+ else if (zzfunc_in)
+ while (zzalternatives[newstate]){
+ state = newstate;
+ ZZGETC_FUNC;
+ ZZINC;
+ ZZCOPY;
+ ZZNEWSTATE;
+ }
+ /* figure out if last character really part of token */
+ if ((state != dfa_base[zzauto]) && (newstate == DfaStates)){
+ zzcharfull = 1;
+ --zznextpos; /* un-copy the character that caused the failing transition */
+ }else{
+ zzcharfull = 0;
+ state = newstate;
+ }
+ *(zznextpos) = '\0';
+ /* Able to transition out of start state to some non err state?*/
+ if ( state == dfa_base[zzauto] ){
+ /* make sure doesn't get stuck */
+ zzadvance();
+ }
+#else
+ /* non-interactive version of automaton */
+ if (!zzcharfull)
+ zzadvance();
+ else
+ ZZINC;
+ state = dfa_base[zzauto];
+ if (zzstr_in)
+ while (ZZNEWSTATE != DfaStates){
+ state = newstate;
+ ZZCOPY;
+ ZZGETC_STR;
+ ZZINC;
+ }
+ else if (zzstream_in)
+ while (ZZNEWSTATE != DfaStates){
+ state = newstate;
+ ZZCOPY;
+ ZZGETC_STREAM;
+ ZZINC;
+ }
+ else if (zzfunc_in)
+ while (ZZNEWSTATE != DfaStates){
+ state = newstate;
+ ZZCOPY;
+ ZZGETC_FUNC;
+ ZZINC;
+ }
+ zzcharfull = 1; /* zzchar holds the character that ended the loop */
+ if ( state == dfa_base[zzauto] ){
+ if (zznextpos < lastpos){
+ *(zznextpos++) = zzchar;
+ }else{
+ zzbufovf = 1;
+ }
+ *zznextpos = '\0';
+ /* make sure doesn't get stuck */
+ zzadvance();
+ }else{
+ *zznextpos = '\0';
+ }
+#endif
+#ifdef ZZCOL
+ zzendcol -= zzcharfull;
+#endif
+ zzendexpr = zznextpos -1;
+ zzadd_erase = 0; /* the action may set this through zzskip() / zzmore() */
+ (*actions[accepts[state]])();
+ switch (zzadd_erase) {
+ case 1: goto skip;
+ case 2: goto more;
+ }
+}
+
+void /* read one character from the active input source into zzchar / zzclass */
+zzadvance()
+{
+ if (zzstream_in) { ZZGETC_STREAM; zzcharfull = 1; ZZINC;}
+ if (zzfunc_in) { ZZGETC_FUNC; zzcharfull = 1; ZZINC;}
+ if (zzstr_in) { ZZGETC_STR; zzcharfull = 1; ZZINC;}
+ if (!(zzstream_in || zzfunc_in || zzstr_in)){
+ zzerr_in(); /* no source configured: only the message is printed, the result is discarded */
+ }
+}
+
+void /* initial value of zzerr: print s (or "Lexical error" if s is NULL), zzline and zzlextext to stderr */
+#ifdef __USE_PROTOS
+zzerrstd(const char *s)
+#else
+zzerrstd(s)
+char *s;
+#endif
+{
+ fprintf(stderr,
+ "%s near line %d (text was '%s')\n",
+ ((s == NULL) ? "Lexical error" : s),
+ zzline,zzlextext);
+}
+
+int /* initial value of zzfunc_in: complain on stderr and report end of input */
+zzerr_in()
+{
+ fprintf(stderr,"No input stream, function, or string\n");
+ /* return eof to get out gracefully */
+ return EOF;
+}
+
+#endif
diff --git a/btparse/pccts/dlgdef.h b/btparse/pccts/dlgdef.h
new file mode 100644
index 0000000..c6c413e
--- /dev/null
+++ b/btparse/pccts/dlgdef.h
@@ -0,0 +1,124 @@
+/* dlgdef.h
+ * Things in scanner produced by dlg that should be visible to the outside
+ * world
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * ANTLR 1.33
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+
+#ifndef ZZDLGDEF_H
+#define ZZDLGDEF_H
+
+#include "config.h"
+
+#ifndef zzchar_t
+#ifdef ZZWCHAR_T
+#define zzchar_t unsigned wchar_t
+#else
+#define zzchar_t unsigned char
+#endif
+#endif
+
+/* Scanner state record; this is the type taken by zzsave_dlg_state() and
+ * zzrestore_dlg_state() declared later in this header.
+ * NOTE(review): the fields look like one-to-one copies of the zz* scanner
+ * globals (input source, automaton number, columns, line, text buffer
+ * pointers) -- confirm against the save/restore implementations.
+ */
+struct zzdlg_state {
+ FILE *stream;
+ int (*func_ptr)();
+ zzchar_t *str;
+ int auto_num;
+ int add_erase;
+ int lookc;
+ int char_full;
+ int begcol, endcol;
+ int line;
+ zzchar_t *lextext, *begexpr, *endexpr;
+ int bufsize;
+ int bufovf;
+ zzchar_t *nextpos;
+ int class_num;
+};
+
+extern zzchar_t *zzlextext; /* text of most recently matched token */
+extern zzchar_t *zzbegexpr; /* beginning of last reg expr recogn. */
+extern zzchar_t *zzendexpr; /* end of last reg expr recogn. */
+extern int zzbufsize; /* how long zzlextext is */
+extern int zzbegcol; /* column that first character of token is in*/
+extern int zzendcol; /* column that last character of token is in */
+extern int zzline; /* line current token is on */
+extern int zzreal_line; /* line of 1st portion of token that is not skipped */
+extern int zzchar; /* character to determine next state */
+extern int zzbufovf; /* indicates that buffer too small for text */
+#ifdef __USE_PROTOS
+extern void (*zzerr)(const char *);/* pointer to error reporting function */
+#else
+extern void (*zzerr)();
+#endif
+
+#ifdef USER_ZZMODE_STACK
+extern int zzauto;
+#endif
+
+#ifdef __USE_PROTOS
+extern void zzadvance(void);
+extern void zzskip(void); /* erase zzlextext, look for another token */
+extern void zzmore(void); /* keep zzlextext, look for another token */
+extern void zzmode(int k); /* switch to automaton 'k' */
+extern void zzrdstream(FILE *);/* what stream to read from */
+extern void zzclose_stream(void);/* close the current input stream */
+extern void zzrdfunc(int (*)());/* what function to get char from */
+extern void zzrdstr( zzchar_t * );
+extern void zzgettok(void); /* get next token */
+extern void zzreplchar(zzchar_t c);/* replace last recognized reg. expr. with
+ a character */
+extern void zzreplstr(zzchar_t *s);/* replace last recognized reg. expr. with
+ a string */
+extern void zzsave_dlg_state(struct zzdlg_state *);
+extern void zzrestore_dlg_state(struct zzdlg_state *);
+extern int zzerr_in(void);
+extern void zzerrstd(const char *);
+extern void zzerraction();
+
+#else
+
+extern void zzadvance();
+extern void zzskip(); /* erase zzlextext, look for another token */
+extern void zzmore(); /* keep zzlextext, look for another token */
+extern void zzmode(/*k*/); /* switch to automaton 'k' */
+extern void zzrdstream(); /* what stream to read from */
+extern void zzclose_stream();/* close the current input stream */
+extern void zzrdfunc(); /* what function to get char from */
+extern void zzrdstr();
+extern void zzgettok(); /* get next token */
+extern void zzreplchar(); /* replace last recognized reg. expr. with
+ a character */
+extern void zzreplstr(); /* replace last recognized reg. expr. with
+ a string */
+extern void zzsave_dlg_state();
+extern void zzrestore_dlg_state();
+extern int zzerr_in();
+extern void zzerrstd();
+extern void zzerraction();
+#endif
+
+#endif
diff --git a/btparse/pccts/err.h b/btparse/pccts/err.h
new file mode 100644
index 0000000..b30a36a
--- /dev/null
+++ b/btparse/pccts/err.h
@@ -0,0 +1,872 @@
+/*
+ * err.h
+ *
+ * Standard error handling mechanism
+ *
+ * SOFTWARE RIGHTS
+ *
+ * We reserve no LEGAL rights to the Purdue Compiler Construction Tool
+ * Set (PCCTS) -- PCCTS is in the public domain. An individual or
+ * company may do whatever they wish with source code distributed with
+ * PCCTS or the code generated by PCCTS, including the incorporation of
+ * PCCTS, or its output, into commerical software.
+ *
+ * We encourage users to develop software with PCCTS. However, we do ask
+ * that credit is given to us for developing PCCTS. By "credit",
+ * we mean that if you incorporate our source code into one of your
+ * programs (commercial product, research project, or otherwise) that you
+ * acknowledge this fact somewhere in the documentation, research report,
+ * etc... If you like PCCTS and have developed a nice tool with the
+ * output, please mention that you developed it using PCCTS. In
+ * addition, we ask that this header remain intact in our source code.
+ * As long as these guidelines are kept, we expect to continue enhancing
+ * this system and expect to make other tools available as they are
+ * completed.
+ *
+ * Has grown to hold all kinds of stuff (err.h is increasingly misnamed)
+ *
+ * ANTLR 1.33
+ * Terence Parr
+ * Parr Research Corporation
+ * with Purdue University and AHPCRC, University of Minnesota
+ * 1989-1995
+ */
+
+#ifndef ERR_H
+#define ERR_H
+
+#include "config.h"
+
+#include <string.h>
+#ifdef __STDC__
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+#ifdef DUM
+/* Define usable bits per unsigned int word (used for set stuff) */
+#ifdef PC
+#define BSETWORDSIZE 16
+#define BSETLOGWORDSIZE 4
+#else
+#define BSETWORDSIZE 32
+#define BSETLOGWORDSIZE 5
+#endif
+#endif
+
+#define BSETWORDSIZE 8
+#define BSETLOGWORDSIZE 3 /* SetWordType is 8bits */
+
+#define BSETMODWORD(x) ((x) & (BSETWORDSIZE-1)) /* x % BSETWORDSIZE */
+#define BSETDIVWORD(x) ((x) >> BSETLOGWORDSIZE) /* x / BSETWORDSIZE */
+
+/* This is not put into the global pccts_parser structure because it is
+ * hidden and does not need to be saved during a "save state" operation
+ */
+/* maximum of 32 bits/unsigned int and must be 8 bits/byte */
+/* bitmask[i] has only bit i set (i = 0..7); indexed with BSETMODWORD() to
+ * test a single bit inside one SetWordType word of a token set.
+ */
+static SetWordType bitmask[] = {
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+ 0x00000010, 0x00000020, 0x00000040, 0x00000080
+};
+
+/* Resynchronise the parser after a syntax error.
+ *
+ * wd   - table indexed by token number; a token t belongs to the resynch
+ *        set when (wd[t] & mask) is non-zero.
+ * mask - bit selecting the resynch set inside each wd[] entry.
+ *
+ * A static flag remembers whether the previous call consumed any input.
+ * If it did not, this call consumes exactly one token so that repeated
+ * calls always make progress.  Otherwise tokens are consumed until the
+ * current token is in the resynch set or is zzEOF_TOKEN.
+ */
+void
+#ifdef __USE_PROTOS
+zzresynch(SetWordType *wd,SetWordType mask)
+#else
+zzresynch(wd,mask)
+SetWordType *wd, mask;
+#endif
+{
+    static int consumed = 1;
+
+    /* if you enter here without having consumed a token from last resynch
+     * force a token consumption.  The flag must be set again here:
+     * otherwise it stays 0 forever and every later call would eat one
+     * token and return without ever scanning for the resynch set.
+     */
+    if ( !consumed ) {zzCONSUME; consumed=1; return;}
+
+    /* if current token is in resynch set, we've got what we wanted */
+    if ( wd[LA(1)]&mask || LA(1) == zzEOF_TOKEN ) {consumed=0; return;}
+
+    /* scan until we find something in the resynch set */
+    while ( !(wd[LA(1)]&mask) && LA(1) != zzEOF_TOKEN ) {zzCONSUME;}
+    consumed=1;
+}
+
+/* Consume tokens until the current token LA(1) is a member of set st
+ * (membership is tested with zzset_el()).
+ * NOTE(review): there is no explicit end-of-input test here; presumably
+ * callers always include the EOF token in st -- confirm.
+ */
+void
+#ifdef __USE_PROTOS
+zzconsumeUntil(SetWordType *st)
+#else
+zzconsumeUntil(st)
+SetWordType *st;
+#endif
+{
+ while ( !zzset_el(LA(1), st) ) { zzCONSUME; }
+}
+
+/* Consume tokens until the current token LA(1) equals t.
+ * NOTE(review): no explicit end-of-input test; confirm callers guarantee
+ * that t eventually appears.
+ */
+void
+#ifdef __USE_PROTOS
+zzconsumeUntilToken(int t)
+#else
+zzconsumeUntilToken(t)
+int t;
+#endif
+{
+ while ( LA(1)!=t ) { zzCONSUME; }
+}
+
+/* input looks like:
+ * zzFAIL(k, e1, e2, ...,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText)
+ * where the zzMiss stuff is set here to the token that did not match
+ * (and which set wasn't it a member of).
+ */
+/* Work out which lookahead token caused a k-token prediction failure.
+ *
+ * Variable arguments, in order: k lookahead sets (SetWordType *), then
+ * the output pointers &miss_set, &miss_text, &bad_tok, &bad_text, &err_k.
+ *
+ * The text of lookahead tokens 1..i is concatenated (space separated) into
+ * a static buffer, stopping at the first token i that is not a member of
+ * its set.  If every token is individually valid, the outputs describe
+ * token 1 with a NULL missing set; otherwise they describe token i and the
+ * set it failed to match.  The returned miss_text may point at the static
+ * buffer, so it is only valid until the next call.
+ */
+void
+#ifdef __USE_PROTOS
+zzFAIL(int k, ...)
+#else
+zzFAIL(va_alist)
+va_dcl
+#endif
+{
+#ifdef LL_K
+    static char text[LL_K*ZZLEXBUFSIZE+1+1];
+    SetWordType *f[LL_K];
+#else
+    static char text[ZZLEXBUFSIZE+1+1];
+    SetWordType *f[1];
+#endif
+    SetWordType **miss_set;
+    char **miss_text;
+    int *bad_tok;
+    char **bad_text;
+    int *err_k;
+    int i;
+    va_list ap;
+#ifndef __USE_PROTOS
+    int k;
+#endif
+#ifdef __USE_PROTOS
+    va_start(ap, k);
+#else
+    va_start(ap);
+    k = va_arg(ap, int); /* how many lookahead sets? */
+#endif
+    text[0] = '\0';
+    for (i=1; i<=k; i++) /* collect all lookahead sets */
+    {
+        f[i-1] = va_arg(ap, SetWordType *);
+    }
+    for (i=1; i<=k; i++) /* look for offending token */
+    {
+        /* Characters that still fit in text[] before the terminating NUL.
+         * This is recomputed AFTER the separator is appended; measuring it
+         * before the separator allowed a one-byte overrun of text[].
+         */
+        size_t used = strlen(text);
+        size_t room = sizeof(text) - 1 - used;
+
+        if ( i>1 && room>0 )
+        {
+            text[used++] = ' ';
+            text[used] = '\0';
+            room--;
+        }
+        /* strncat() appends at most `room` characters plus a NUL, so the
+         * token text is silently truncated when the buffer is full; this
+         * is only diagnostic output, so truncation is acceptable.
+         */
+        strncat(text, (char*)LATEXT(i), room);
+        if ( !zzset_el((unsigned)LA(i), f[i-1]) ) break;
+    }
+    miss_set = va_arg(ap, SetWordType **);
+    miss_text = va_arg(ap, char **);
+    bad_tok = va_arg(ap, int *);
+    bad_text = va_arg(ap, char **);
+    err_k = va_arg(ap, int *);
+    va_end(ap); /* every va_start must be paired with va_end */
+    if ( i>k )
+    {
+        /* bad; lookahead is permutation that cannot be matched,
+         * but, the ith token of lookahead is valid at the ith position
+         * (The old LL sub 1 (k) versus LL(k) parsing technique)
+         */
+        *miss_set = NULL;
+        *miss_text = (char*)zzlextext; /* cast hides a signedness warning */
+        *bad_tok = LA(1);
+        *bad_text = (char*)LATEXT(1);
+        *err_k = k;
+        return;
+    }
+/* fprintf(stderr, "%s not in %dth set\n", zztokens[LA(i)], i);*/
+    *miss_set = f[i-1];
+    *miss_text = text;
+    *bad_tok = LA(i);
+    *bad_text = (char*) LATEXT(i);
+    if ( i==1 ) *err_k = 1;
+    else *err_k = k;
+}
+
+/* Copy the parser's global state into *buf.
+ * Which globals are saved depends on the build configuration (ZZCAN_GUESS,
+ * GENAST, ZZINF_LOOK, DEMAND_LOOK, LL_K).  Token text is copied with
+ * strcpy(), so the text fields of *buf must be large enough for it.
+ */
+void
+#ifdef __USE_PROTOS
+zzsave_antlr_state(zzantlr_state *buf)
+#else
+zzsave_antlr_state(buf)
+zzantlr_state *buf;
+#endif
+{
+#ifdef LL_K
+ int i;
+#endif
+
+#ifdef ZZCAN_GUESS
+ buf->guess_start = zzguess_start;
+ buf->guessing = zzguessing;
+#endif
+ buf->asp = zzasp;
+#ifdef GENAST
+ buf->ast_sp = zzast_sp;
+#endif
+#ifdef ZZINF_LOOK
+ buf->inf_labase = zzinf_labase;
+ buf->inf_last = zzinf_last;
+#endif
+#ifdef DEMAND_LOOK
+ buf->dirty = zzdirty;
+#endif
+#ifdef LL_K
+ for (i=0; i<LL_K; i++) buf->tokenLA[i] = zztokenLA[i];
+ for (i=0; i<LL_K; i++) strcpy(buf->textLA[i], zztextLA[i]);
+ buf->lap = zzlap;
+ buf->labase = zzlabase;
+#else
+ buf->token = zztoken;
+ strcpy(buf->text, (char*) zzlextext); // hide warning! [ambs]
+#endif
+}
+
+/* Inverse of zzsave_antlr_state(): copy the state held in *buf back into
+ * the parser's globals.  The same configuration macros select which
+ * fields are restored; token text is copied back with strcpy().
+ */
+void
+#ifdef __USE_PROTOS
+zzrestore_antlr_state(zzantlr_state *buf)
+#else
+zzrestore_antlr_state(buf)
+zzantlr_state *buf;
+#endif
+{
+#ifdef LL_K
+ int i;
+#endif
+
+#ifdef ZZCAN_GUESS
+ zzguess_start = buf->guess_start;
+ zzguessing = buf->guessing;
+#endif
+ zzasp = buf->asp;
+#ifdef GENAST
+ zzast_sp = buf->ast_sp;
+#endif
+#ifdef ZZINF_LOOK
+ zzinf_labase = buf->inf_labase;
+ zzinf_last = buf->inf_last;
+#endif
+#ifdef DEMAND_LOOK
+ zzdirty = buf->dirty;
+#endif
+#ifdef LL_K
+ for (i=0; i<LL_K; i++) zztokenLA[i] = buf->tokenLA[i];
+ for (i=0; i<LL_K; i++) strcpy(zztextLA[i], buf->textLA[i]);
+ zzlap = buf->lap;
+ zzlabase = buf->labase;
+#else
+ zztoken = buf->token;
+ strcpy((char*) zzlextext, buf->text); // Hide warning [ambs]
+#endif
+}
+
+/* Print to stderr the names (from zztokens[]) of all tokens present in
+ * set a.  The list is wrapped in " {" ... " }" when the set holds more
+ * than one element.  The set is scanned word by word (zzSET_SIZE words)
+ * and bit by bit; e counts the token number of the bit being tested.
+ */
+void
+#ifdef __USE_PROTOS
+zzedecode(SetWordType *a)
+#else
+zzedecode(a)
+SetWordType *a;
+#endif
+{
+ register SetWordType *p = a;
+ register SetWordType *endp = &(p[zzSET_SIZE]);
+ register unsigned e = 0;
+
+ if ( zzset_deg(a)>1 ) fprintf(stderr, " {");
+ do {
+ register SetWordType t = *p;
+ register SetWordType *b = &(bitmask[0]);
+ do {
+ if ( t & *b ) fprintf(stderr, " %s", zztokens[e]);
+ e++;
+ } while (++b < &(bitmask[sizeof(SetWordType)*8]));
+ } while (++p < endp);
+ if ( zzset_deg(a)>1 ) fprintf(stderr, " }");
+}
+
+#ifndef USER_ZZSYN
+/* standard error reporting function */
+/* Default syntax error reporter (compiled unless USER_ZZSYN is defined).
+ *
+ * text     - not used by this implementation
+ * tok      - offending token; "EOF" is printed when it is zzEOF_TOKEN
+ * egroup   - name of the rule/group, appended as " in <egroup>" if non-empty
+ * eset     - set of expected tokens (may be NULL)
+ * etok     - single expected token, used when eset has no elements
+ * k        - 1 prints "missing ..."; otherwise "<bad_text> not [in] ..."
+ * bad_text - text of the offending token
+ *
+ * When neither etok nor eset is given only the location line is printed.
+ */
+void
+#ifdef __USE_PROTOS
+zzsyn(char *text, int tok, char *egroup, SetWordType *eset, int etok, int k, char *bad_text)
+#else
+zzsyn(text, tok, egroup, eset, etok, k, bad_text)
+char *text, *egroup, *bad_text;
+int tok;
+int etok;
+int k;
+SetWordType *eset;
+#endif
+{
+
+ fprintf(stderr, "line %d: syntax error at \"%s\"", zzline, (tok==zzEOF_TOKEN)?"EOF":bad_text);
+ if ( !etok && !eset ) {fprintf(stderr, "\n"); return;}
+ if ( k==1 ) fprintf(stderr, " missing");
+ else
+ {
+ fprintf(stderr, "; \"%s\" not", bad_text);
+ if ( zzset_deg(eset)>1 ) fprintf(stderr, " in");
+ }
+ if ( zzset_deg(eset)>0 ) zzedecode(eset);
+ else fprintf(stderr, " %s", zztokens[etok]);
+ if ( strlen(egroup) > 0 ) fprintf(stderr, " in %s", egroup);
+ fprintf(stderr, "\n");
+}
+#endif
+
+/* is b an element of set p? */
+/* Returns non-zero when bit number b is set in the bit set p.
+ * The word index is b / BSETWORDSIZE and the bit inside that word is
+ * b % BSETWORDSIZE.  No bounds check is made on b.
+ */
+int
+#ifdef __USE_PROTOS
+zzset_el(unsigned b, SetWordType *p)
+#else
+zzset_el(b,p)
+unsigned b;
+SetWordType *p;
+#endif
+{
+ return( p[BSETDIVWORD(b)] & bitmask[BSETMODWORD(b)] );
+}
+
+/* Compute the degree of set a, i.e. the number of bits set in its
+ * zzSET_SIZE words.  Returns 0 when a is NULL.
+ */
+int
+#ifdef __USE_PROTOS
+zzset_deg(SetWordType *a)
+#else
+zzset_deg(a)
+SetWordType *a;
+#endif
+{
+    /* Fast compute degree of a set... the number
+       of elements present in the set. Assumes
+       that all word bits are used in the set
+    */
+    register SetWordType *p;
+    register SetWordType *endp;
+    register int degree = 0;
+
+    /* Test for NULL before forming &a[zzSET_SIZE]: pointer arithmetic on
+     * a null pointer is undefined behaviour.
+     */
+    if ( a == NULL ) return 0;
+    p = a;
+    endp = &(a[zzSET_SIZE]);
+    while ( p < endp )
+    {
+        register SetWordType t = *p;
+        register SetWordType *b = &(bitmask[0]);
+        do {
+            if (t & *b) ++degree;
+        } while (++b < &(bitmask[sizeof(SetWordType)*8]));
+        p++;
+    }
+
+    return(degree);
+}
+
+#ifdef DEMAND_LOOK
+
+#ifdef LL_K
+/* Token match, DEMAND_LOOK + LL_K variant.
+ * If the lookahead buffer is fully dirty (zzdirty==LL_K) a token is
+ * consumed first.  On mismatch the error outputs are filled in (bad and
+ * missing text both point at LATEXT(1), missing token is _t, missing set
+ * is NULL) and 0 is returned.  On success zzMakeAttr runs, zzdirty and
+ * zzlabase are incremented, and 1 is returned.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch(int _t, char **zzBadText, char **zzMissText,
+ int *zzMissTok, int *zzBadTok,
+ SetWordType **zzMissSet)
+#else
+_zzmatch(_t, zzBadText, zzMissText, zzMissTok, zzBadTok, zzMissSet)
+int _t;
+char **zzBadText;
+char **zzMissText;
+int *zzMissTok, *zzBadTok;
+SetWordType **zzMissSet;
+#endif
+{
+ if ( zzdirty==LL_K ) {
+ zzCONSUME;
+ }
+ if ( LA(1)!=_t ) {
+ *zzBadText = *zzMissText=LATEXT(1);
+ *zzMissTok= _t; *zzBadTok=LA(1);
+ *zzMissSet=NULL;
+ return 0;
+ }
+ zzMakeAttr
+ zzdirty++;
+ zzlabase++;
+ return 1;
+}
+
+/* Token match without error outputs, DEMAND_LOOK + LL_K variant.
+ * Same steps as the _zzmatch() defined just above, but a mismatch simply
+ * returns 0.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch_wsig(int _t)
+#else
+_zzmatch_wsig(_t)
+int _t;
+#endif
+{
+ if ( zzdirty==LL_K ) {
+ zzCONSUME;
+ }
+ if ( LA(1)!=_t ) {
+ return 0;
+ }
+ zzMakeAttr
+ zzdirty++;
+ zzlabase++;
+ return 1;
+}
+
+#else
+
+/* Token match, DEMAND_LOOK variant without LL_K (single lookahead token).
+ * A pending dirty token is consumed first.  On mismatch the error outputs
+ * are filled in and 0 is returned; on success zzdirty is set to 1,
+ * zzMakeAttr runs and 1 is returned.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch(int _t, char **zzBadText, char **zzMissText,
+ int *zzMissTok, int *zzBadTok, SetWordType **zzMissSet)
+#else
+_zzmatch(_t, zzBadText, zzMissText, zzMissTok, zzBadTok, zzMissSet)
+int _t;
+char **zzBadText;
+char **zzMissText;
+int *zzMissTok, *zzBadTok;
+SetWordType **zzMissSet;
+#endif
+{
+ if ( zzdirty ) {zzCONSUME;}
+ if ( LA(1)!=_t ) {
+ *zzBadText = *zzMissText=LATEXT(1);
+ *zzMissTok= _t; *zzBadTok=LA(1);
+ *zzMissSet=NULL;
+ return 0;
+ }
+ zzdirty = 1;
+ zzMakeAttr
+ return 1;
+}
+
+/* Token match without error outputs, DEMAND_LOOK variant without LL_K.
+ * Consumes a pending dirty token, returns 0 on mismatch, otherwise marks
+ * the lookahead dirty, runs zzMakeAttr and returns 1.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch_wsig(int _t)
+#else
+_zzmatch_wsig(_t)
+int _t;
+#endif
+{
+ if ( zzdirty ) {zzCONSUME;}
+ if ( LA(1)!=_t ) {
+ return 0;
+ }
+ zzdirty = 1;
+ zzMakeAttr
+ return 1;
+}
+
+#endif /*LL_K*/
+
+#else
+
+/* Token match, variant used when DEMAND_LOOK is not defined.
+ * Returns 1 (after running zzMakeAttr) when LA(1) equals _t.  Otherwise
+ * fills the error outputs -- bad and missing text both point at LATEXT(1),
+ * missing token is _t, bad token is LA(1), missing set is NULL -- and
+ * returns 0.  No token is consumed here.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch(int _t, char **zzBadText, char **zzMissText,
+ int *zzMissTok, int *zzBadTok,
+ SetWordType **zzMissSet)
+#else
+_zzmatch(_t, zzBadText, zzMissText, zzMissTok, zzBadTok, zzMissSet)
+int _t;
+char **zzBadText;
+char **zzMissText;
+int *zzMissTok, *zzBadTok;
+SetWordType **zzMissSet;
+#endif
+{
+ if ( LA(1)!=_t ) {
+ *zzBadText = *zzMissText= (char*) LATEXT(1); // hide warning! [ambs]
+ *zzMissTok= _t; *zzBadTok=LA(1);
+ *zzMissSet=NULL;
+ return 0;
+ }
+ zzMakeAttr
+ return 1;
+}
+
+/* Token match without error outputs, variant used when DEMAND_LOOK is not
+ * defined: returns 0 on mismatch, otherwise runs zzMakeAttr and returns 1.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch_wsig(int _t)
+#else
+_zzmatch_wsig(_t)
+int _t;
+#endif
+{
+ if ( LA(1)!=_t ) return 0;
+ zzMakeAttr
+ return 1;
+}
+
+#endif /*DEMAND_LOOK*/
+
+#ifdef ZZINF_LOOK
+/* Deliver the next token from the pre-read (infinite lookahead) buffers.
+ * When the read position zzinf_labase has reached zzinf_last, the EOF
+ * token with empty text is delivered.  Otherwise the token, its line
+ * number and its text are copied from zzinf_tokens/zzinf_line/zzinf_text
+ * and the read position advances.
+ */
+void
+#ifdef __USE_PROTOS
+_inf_zzgettok(void)
+#else
+_inf_zzgettok()
+#endif
+{
+ if ( zzinf_labase >= zzinf_last )
+ {NLA = zzEOF_TOKEN; strcpy(NLATEXT, "");}
+ else {
+ NLA = zzinf_tokens[zzinf_labase];
+ zzline = zzinf_line[zzinf_labase]; /* wrong in 1.21 */
+ strcpy(NLATEXT, zzinf_text[zzinf_labase]);
+ zzinf_labase++;
+ }
+}
+#endif
+
+#ifdef ZZINF_LOOK
+/* allocate default size text,token and line arrays;
+ * then, read all of the input reallocing the arrays as needed.
+ * Once the number of total tokens is known, the LATEXT(i) array (zzinf_text)
+ * is allocated and its pointers are set to the tokens in zzinf_text_buffer.
+ */
+/* Read the whole input up front for infinite lookahead.
+ * Fills zzinf_tokens[], zzinf_line[] and zzinf_text_buffer (token texts
+ * stored back to back, NUL separated), growing each buffer in chunks as
+ * needed, then builds zzinf_text[] so that entry i points at the text of
+ * token i.  Any allocation failure prints a message and exits with
+ * PCCTS_EXIT_FAILURE.
+ */
+void
+#ifdef __USE_PROTOS
+zzfill_inf_look(void)
+#else
+zzfill_inf_look()
+#endif
+{
+ int tok, line;
+ int zzinf_token_buffer_size = ZZINF_DEF_TOKEN_BUFFER_SIZE;
+ int zzinf_text_buffer_size = ZZINF_DEF_TEXT_BUFFER_SIZE;
+ int zzinf_text_buffer_index = 0;
+ int zzinf_lap = 0;
+
+ /* allocate text/token buffers */
+ zzinf_text_buffer = (char *) malloc(zzinf_text_buffer_size);
+ if ( zzinf_text_buffer == NULL )
+ {
+ fprintf(stderr, "cannot allocate lookahead text buffer (%d bytes)\n",
+ zzinf_text_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+ zzinf_tokens = (int *) calloc(zzinf_token_buffer_size,sizeof(int));
+ if ( zzinf_tokens == NULL )
+ {
+ fprintf(stderr, "cannot allocate token buffer (%d tokens)\n",
+ zzinf_token_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+ zzinf_line = (int *) calloc(zzinf_token_buffer_size,sizeof(int));
+ if ( zzinf_line == NULL )
+ {
+ fprintf(stderr, "cannot allocate line buffer (%d ints)\n",
+ zzinf_token_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+
+ /* get tokens, copying text to text buffer */
+ zzinf_text_buffer_index = 0;
+ do {
+ zzgettok();
+ line = zzreal_line;
+ /* grow the token and line arrays together so they stay the same size */
+ while ( zzinf_lap>=zzinf_token_buffer_size )
+ {
+ zzinf_token_buffer_size += ZZINF_BUFFER_TOKEN_CHUNK_SIZE;
+ zzinf_tokens = (int *) realloc(zzinf_tokens,
+ zzinf_token_buffer_size*sizeof(int));
+ if ( zzinf_tokens == NULL )
+ {
+ fprintf(stderr, "cannot allocate lookahead token buffer (%d tokens)\n",
+ zzinf_token_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+ zzinf_line = (int *) realloc(zzinf_line,
+ zzinf_token_buffer_size*sizeof(int));
+ if ( zzinf_line == NULL )
+ {
+ fprintf(stderr, "cannot allocate lookahead line buffer (%d ints)\n",
+ zzinf_token_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+
+ }
+ /* grow the text buffer until the new token text plus its NUL fits */
+ while ( (zzinf_text_buffer_index+strlen(NLATEXT)+1) >= zzinf_text_buffer_size )
+ {
+ zzinf_text_buffer_size += ZZINF_BUFFER_TEXT_CHUNK_SIZE;
+ zzinf_text_buffer = (char *) realloc(zzinf_text_buffer,
+ zzinf_text_buffer_size);
+ if ( zzinf_text_buffer == NULL )
+ {
+ fprintf(stderr, "cannot allocate lookahead text buffer (%d bytes)\n",
+ zzinf_text_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+ }
+ /* record token and text and line of input symbol */
+ tok = zzinf_tokens[zzinf_lap] = NLA;
+ strcpy(&zzinf_text_buffer[zzinf_text_buffer_index], NLATEXT);
+ zzinf_text_buffer_index += strlen(NLATEXT)+1;
+ zzinf_line[zzinf_lap] = line;
+ zzinf_lap++;
+ } while (tok!=zzEOF_TOKEN);
+ zzinf_labase = 0;
+ zzinf_last = zzinf_lap-1;
+
+ /* allocate ptrs to text of ith token */
+ zzinf_text = (char **) calloc(zzinf_last+1,sizeof(char *));
+ if ( zzinf_text == NULL )
+ {
+ /* NOTE(review): the value printed below is the text buffer size, not the
+ * number of pointers (zzinf_last+1) that failed to allocate.
+ */
+ fprintf(stderr, "cannot allocate lookahead text buffer (%d)\n",
+ zzinf_text_buffer_size);
+ exit(PCCTS_EXIT_FAILURE);
+ }
+ zzinf_text_buffer_index = 0;
+ zzinf_lap = 0;
+ /* set ptrs so that zzinf_text[i] is the text of the ith token found on input */
+ while (zzinf_lap<=zzinf_last)
+ {
+ zzinf_text[zzinf_lap++] = &zzinf_text_buffer[zzinf_text_buffer_index];
+ zzinf_text_buffer_index += strlen(&zzinf_text_buffer[zzinf_text_buffer_index])+1;
+ }
+}
+#endif
+
+/* Match the current token against token set e.
+ * Under DEMAND_LOOK a pending token is consumed first.  If LA(1) is not
+ * in e, the error outputs are filled in (bad text/token from LA(1),
+ * missing text NULL, missing token 0, missing set e) and 0 is returned.
+ * Otherwise the dirty state is updated (DEMAND_LOOK only), zzMakeAttr
+ * runs and 1 is returned.
+ */
+int
+#ifdef __USE_PROTOS
+_zzsetmatch(SetWordType *e, char **zzBadText, char **zzMissText,
+ int *zzMissTok, int *zzBadTok,
+ SetWordType **zzMissSet)
+#else
+_zzsetmatch(e, zzBadText, zzMissText, zzMissTok, zzBadTok, zzMissSet)
+SetWordType *e;
+char **zzBadText;
+char **zzMissText;
+int *zzMissTok, *zzBadTok;
+SetWordType **zzMissSet;
+#endif
+{
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ if ( zzdirty==LL_K ) {zzCONSUME;}
+#else
+ if ( zzdirty ) {zzCONSUME;}
+#endif
+#endif
+ if ( !zzset_el((unsigned)LA(1), e) ) {
+ *zzBadText = (char*)LATEXT(1); // hide warning [ambs]
+ *zzMissText=NULL;
+ *zzMissTok= 0; *zzBadTok=LA(1);
+ *zzMissSet=e;
+ return 0;
+ }
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ zzdirty++;
+#else
+ zzdirty = 1;
+#endif
+#endif
+ zzMakeAttr
+ return 1;
+}
+
+/* Token match with built-in default error handling.
+ * If LA(1) is not tokenWanted, a "syntax error ... missing <token>" line
+ * is printed to stderr, input is consumed until a token in whatFollows is
+ * reached (zzconsumeUntil), and 0 is returned.  On a match zzMakeAttr
+ * runs, the DEMAND_LOOK bookkeeping is updated when enabled, and 1 is
+ * returned.
+ */
+int
+#ifdef __USE_PROTOS
+_zzmatch_wdfltsig(int tokenWanted, SetWordType *whatFollows)
+#else
+_zzmatch_wdfltsig(tokenWanted, whatFollows)
+int tokenWanted;
+SetWordType *whatFollows;
+#endif
+{
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ if ( zzdirty==LL_K ) {
+ zzCONSUME;
+ }
+#else
+ if ( zzdirty ) {zzCONSUME;}
+#endif
+#endif
+
+ if ( LA(1)!=tokenWanted )
+ {
+ fprintf(stderr,
+ "line %d: syntax error at \"%s\" missing %s\n",
+ zzline,
+ (LA(1)==zzEOF_TOKEN)?"<eof>":(char*)LATEXT(1),
+ zztokens[tokenWanted]);
+ zzconsumeUntil( whatFollows );
+ return 0;
+ }
+ else {
+ zzMakeAttr
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ zzdirty++;
+ zzlabase++;
+#else
+ zzdirty = 1;
+#endif
+#else
+/* zzCONSUME; consume if not demand lookahead */
+#endif
+ return 1;
+ }
+}
+
+/* Set match with built-in default error handling.
+ * Like _zzmatch_wdfltsig(), but the current token must be a member of
+ * tokensWanted; tokenTypeOfSet is only used to name the expected set in
+ * the error message (via zztokens[]).  On failure input is consumed up to
+ * a token in whatFollows and 0 is returned; on success 1 is returned.
+ */
+int
+#ifdef __USE_PROTOS
+_zzsetmatch_wdfltsig(SetWordType *tokensWanted,
+ int tokenTypeOfSet,
+ SetWordType *whatFollows)
+#else
+_zzsetmatch_wdfltsig(tokensWanted, tokenTypeOfSet, whatFollows)
+SetWordType *tokensWanted;
+int tokenTypeOfSet;
+SetWordType *whatFollows;
+#endif
+{
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ if ( zzdirty==LL_K ) {zzCONSUME;}
+#else
+ if ( zzdirty ) {zzCONSUME;}
+#endif
+#endif
+ if ( !zzset_el((unsigned)LA(1), tokensWanted) )
+ {
+ fprintf(stderr,
+ "line %d: syntax error at \"%s\" missing %s\n",
+ zzline,
+ (LA(1)==zzEOF_TOKEN)?"<eof>":(char*)LATEXT(1),
+ zztokens[tokenTypeOfSet]);
+ zzconsumeUntil( whatFollows );
+ return 0;
+ }
+ else {
+ zzMakeAttr
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ zzdirty++;
+ zzlabase++;
+#else
+ zzdirty = 1;
+#endif
+#else
+/* zzCONSUME; consume if not demand lookahead */
+#endif
+ return 1;
+ }
+}
+
+/* Set match without error outputs: returns 0 when LA(1) is not a member
+ * of e, otherwise updates the DEMAND_LOOK dirty state (when enabled),
+ * runs zzMakeAttr and returns 1.
+ */
+int
+#ifdef __USE_PROTOS
+_zzsetmatch_wsig(SetWordType *e)
+#else
+_zzsetmatch_wsig(e)
+SetWordType *e;
+#endif
+{
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ if ( zzdirty==LL_K ) {zzCONSUME;}
+#else
+ if ( zzdirty ) {zzCONSUME;}
+#endif
+#endif
+ if ( !zzset_el((unsigned)LA(1), e) ) return 0;
+#ifdef DEMAND_LOOK
+#ifdef LL_K
+ zzdirty++;
+#else
+ zzdirty = 1;
+#endif
+#endif
+ zzMakeAttr
+ return 1;
+}
+
+#ifdef USER_ZZMODE_STACK
+static int zzmstk[ZZMAXSTK] = { -1 };
+static int zzmdep = 0;
+static char zzmbuf[70];
+
+/* Push the current lexer mode (zzauto) on the mode stack and switch to
+ * mode m.  When the stack depth has reached ZZMAXSTK - 1 nothing is
+ * pushed; "Mode stack overflow " is reported through zzerr() instead.
+ */
+void
+#ifdef __USE_PROTOS
+zzmpush( int m )
+#else
+zzmpush( m )
+int m;
+#endif
+{
+ if(zzmdep == ZZMAXSTK - 1) {
+ snprintf(zzmbuf, 69, "Mode stack overflow "); // slower but easier to track pointer overflows
+ zzerr(zzmbuf);
+ } else {
+ zzmstk[zzmdep++] = zzauto;
+ zzmode(m);
+ }
+}
+
+/* Pop the most recently pushed lexer mode and switch back to it with
+ * zzmode().  With an empty stack, "Mode stack underflow " is reported
+ * through zzerr() and the mode is left unchanged.
+ */
+void
+#ifdef __USE_PROTOS
+zzmpop( void )
+#else
+zzmpop( )
+#endif
+{
+ if(zzmdep == 0) {
+ snprintf(zzmbuf, 69, "Mode stack underflow ");
+ zzerr(zzmbuf);
+ }
+ else
+ { zzmdep--;
+ zzmode(zzmstk[zzmdep]);
+ }
+}
+
+/* Copy the whole lexer mode stack into modeStack and its depth into
+ * *modeLevel, then reset the live stack depth to 0.
+ * modeStack must have room for ZZMAXSTK ints, because sizeof(zzmstk)
+ * bytes are copied regardless of the current depth.
+ */
+void
+#ifdef __USE_PROTOS
+zzsave_mode_stack( int modeStack[], int *modeLevel )
+#else
+zzsave_mode_stack( modeStack, modeLevel )
+int modeStack[];
+int *modeLevel;
+#endif
+{
+    memcpy(modeStack, zzmstk, sizeof(zzmstk));
+    *modeLevel = zzmdep;
+    zzmdep = 0;
+}
+
+/* Restore the lexer mode stack from modeStack and its depth from
+ * *modeLevel (the counterpart of zzsave_mode_stack()).
+ * modeStack must hold ZZMAXSTK ints, because sizeof(zzmstk) bytes are
+ * copied regardless of *modeLevel.
+ */
+void
+#ifdef __USE_PROTOS
+zzrestore_mode_stack( int modeStack[], int *modeLevel )
+#else
+zzrestore_mode_stack( modeStack, modeLevel )
+int modeStack[];
+int *modeLevel;
+#endif
+{
+    memcpy(zzmstk, modeStack, sizeof(zzmstk));
+    zzmdep = *modeLevel;
+}
+#endif /* USER_ZZMODE_STACK */
+
+#endif /* ERR_H */
diff --git a/btparse/progs/args.c b/btparse/progs/args.c
new file mode 100644
index 0000000..266ce0b
--- /dev/null
+++ b/btparse/progs/args.c
@@ -0,0 +1,108 @@
+/* ------------------------------------------------------------------------
+@NAME : args.c
+@DESCRIPTION: Data related to the command-line arguments, and code to
+ parse them.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/09-10, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse distribution (but not part
+ of the library itself). This is free software; you can
+ redistribute it and/or modify it under the terms of the GNU
+ General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your
+ option) any later version.
+-------------------------------------------------------------------------- */
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "getopt.h"
+#include "args.h"
+
+
+/* default options -- "full processing" */
+#if 0
+parser_options options =
+{
+ {
+ 0, /* check_only */
+ 1, /* delete_quotes */
+ 1, /* expand_macros */
+ 1, /* paste_strings */
+ 1 /* collapse_whitespace */
+ },
+ 0, /* dump_ast */
+ 0 /* whole_file */
+};
+#endif
+
+static boolean quote_strings = FALSE;
+static boolean convert_numbers = TRUE;
+static boolean expand_macros = TRUE;
+static boolean paste_strings = TRUE;
+static boolean collapse_whitespace = TRUE;
+
+static boolean check_only = FALSE;
+static boolean dump_ast = FALSE;
+static boolean whole_file = FALSE;
+
+/* Long-option table for getopt_long_only().  Every entry takes no
+ * argument (has_arg == 0) and stores the value in the last column into
+ * the flag variable named in the third column; the table ends with an
+ * all-zero entry.
+ * NOTE(review): struct option's flag member is an int *, while the flag
+ * variables here are declared `boolean` -- confirm that boolean is
+ * int-sized in btparse.h.
+ */
+struct option option_table[] =
+{
+ { "check", 0, &check_only, 1 },
+ { "noquotes", 0, &quote_strings, 0 },
+ { "quote", 0, &quote_strings, 1 },
+ { "convert", 0, &convert_numbers, 1 },
+ { "noconvert", 0, &convert_numbers, 0 },
+ { "expand", 0, &expand_macros, 1 },
+ { "noexpand", 0, &expand_macros, 0 },
+ { "paste", 0, &paste_strings, 1 },
+ { "nopaste", 0, &paste_strings, 0 },
+ { "collapse", 0, &collapse_whitespace, 1 },
+ { "nocollapse", 0, &collapse_whitespace, 0 },
+ { "dump", 0, &dump_ast, 1 },
+ { "nodump", 0, &dump_ast, 0 },
+ { "wholefile", 0, &whole_file, 1 },
+ { NULL, 0, 0, 0 }
+};
+
+/* Parse the command line and return a newly allocated parser_options.
+ *
+ * Options are processed with getopt_long_only() against option_table,
+ * which stores each flag directly in the file-level static variables.
+ * An unrecognised option or a missing argument prints an error and exits
+ * with status 1, as does an allocation failure.
+ *
+ * Returns a malloc()ed structure owned by the caller; never NULL.
+ */
+parser_options *parse_args (int argc, char **argv)
+{
+    int c;
+    parser_options *options;
+
+    while (1)
+    {
+        c = getopt_long_only (argc, argv, "", option_table, NULL);
+        if (c == -1) break; /* last option? */
+
+        switch (c)
+        {
+            case ':':
+            case '?':
+                fprintf (stderr, "%s: error in command-line\n", argv[0]);
+                exit (1);
+                break;
+        }
+    }
+
+    options = (parser_options *) malloc (sizeof (parser_options));
+    if (options == NULL)
+    {
+        /* the fields are written immediately below, so a NULL result
+         * must not be allowed through */
+        fprintf (stderr, "%s: out of memory\n", argv[0]);
+        exit (1);
+    }
+
+    /* fold the individual string-processing flags into one bitmask */
+    options->string_opts = 0;
+    options->string_opts |= (convert_numbers ? BTO_CONVERT : 0);
+    options->string_opts |= (expand_macros ? BTO_EXPAND : 0);
+    options->string_opts |= (paste_strings ? BTO_PASTE : 0);
+    options->string_opts |= (collapse_whitespace ? BTO_COLLAPSE : 0);
+
+    options->other_opts = 0; /* do store macro text */
+
+    options->quote_strings = quote_strings;
+    options->check_only = check_only;
+    options->dump_ast = dump_ast;
+    options->whole_file = whole_file;
+
+    return options;
+
+} /* parse_args () */
diff --git a/btparse/progs/args.h b/btparse/progs/args.h
new file mode 100644
index 0000000..617498c
--- /dev/null
+++ b/btparse/progs/args.h
@@ -0,0 +1,34 @@
+/* ------------------------------------------------------------------------
+@NAME : args.h
+@DESCRIPTION: Typedef and prototype needed for command-line processing
+ by the bibparse program.
+@CREATED : January 1997, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse distribution (but not part
+ of the library itself). This is free software; you can
+ redistribute it and/or modify it under the terms of the GNU
+ General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your
+ option) any later version.
+-------------------------------------------------------------------------- */
+#ifndef ARGS_H
+#define ARGS_H
+
+#include <btparse.h>
+
+/* Options produced by parse_args(). */
+typedef struct
+{
+ btshort string_opts; /* OR of BTO_CONVERT, BTO_EXPAND, BTO_PASTE, BTO_COLLAPSE */
+ btshort other_opts; /* parse_args() always sets this to 0 */
+ boolean check_only; /* -check */
+ boolean quote_strings; /* -quote / -noquotes */
+ boolean dump_ast; /* -dump / -nodump */
+ boolean whole_file; /* -wholefile */
+} parser_options;
+
+parser_options *parse_args (int argc, char **argv);
+
+#endif /* ARGS_H */
diff --git a/btparse/progs/biblex.c b/btparse/progs/biblex.c
new file mode 100644
index 0000000..cd61acf
--- /dev/null
+++ b/btparse/progs/biblex.c
@@ -0,0 +1,87 @@
+/* ------------------------------------------------------------------------
+@NAME : biblex.c
+@INPUT : a single BibTeX file
+@OUTPUT : dumps the token stream to stdout
+@RETURNS :
+@DESCRIPTION: Evil, naughty, badly-behaved example program for the btparse
+ library. This goes poking rudely about in the internals of
+ both the library and its lexical scanner to perform only
+ lexical analysis on a BibTeX file. It uses this to dump the
+ token stream to stdout.
+
+ This could actually be useful for quickly (ie. without a full
+ parse) constructing an index of a BibTeX file. Eventually,
+ I'd like to put this sort of functionality into the library
+ itself, so this program would be reduced to just calling some
+ mythical bt_next_token() in a loop, or maybe a single call to
+ bt_token_stream() (also mythical).
+@CREATED : Winter 1997, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse distribution (but not part
+ of the library itself). This is free software; you can
+ redistribute it and/or modify it under the terms of the GNU
+ General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your
+ option) any later version.
+-------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <btparse.h>
+#include "stdpccts.h" /* poke about btparse's private bits */
+
+/* Dump the token stream of one BibTeX file (or stdin when the argument
+ * is "-") to stdout, one token per line: line number, begin/end column,
+ * token number and name, and token text.
+ *
+ * Exit status: 0 on success, 1 on a usage error or when the input file
+ * cannot be opened.
+ */
+int main (int argc, char *argv[])
+{
+    extern char * InputFilename; /* from input.c in the library */
+    char * filename;
+    FILE * infile;
+
+/*
+ static char zztoktext[ZZLEXBUFSIZE];
+
+ zzlextext = zztoktext;
+*/
+    zzbufsize = ZZLEXBUFSIZE;
+    alloc_lex_buffer (zzbufsize);
+
+    if (argc != 2)
+    {
+        fprintf (stderr, "usage: biblex file.bib\n");
+        exit (1);
+    }
+
+    filename = argv[1];
+    if (filename != NULL && strcmp (filename, "-") != 0)
+    {
+        infile = fopen (filename, "r");
+        if (infile == NULL)
+        {
+            perror (filename);
+            free_lex_buffer ();
+            return 1; /* report the failure to the caller, like the usage error does */
+        }
+    }
+    else
+    {
+        filename = "(stdin)";
+        infile = stdin;
+    }
+
+    InputFilename = filename;
+    zzrdstream (infile);
+    while (!feof (infile))
+    {
+        zzgettok ();
+        printf ("%3d %4d-%4d %2d=%-10s >%s<\n",
+                zzline, zzbegcol, zzendcol,
+                zztoken, zztokens[zztoken], zzlextext);
+        if (zzbufovf)
+        {
+            printf ("OH NO!! buffer overflowed!\n");
+        }
+    }
+
+    /* only close what we opened ourselves */
+    if (infile != stdin)
+        fclose (infile);
+    free_lex_buffer ();
+    return 0;
+}
diff --git a/btparse/progs/bibparse.c b/btparse/progs/bibparse.c
new file mode 100644
index 0000000..cc46bac
--- /dev/null
+++ b/btparse/progs/bibparse.c
@@ -0,0 +1,311 @@
+/* ------------------------------------------------------------------------
+@NAME : bibparse.c
+@DESCRIPTION: Parses a series of BibTeX files, with command-line options
+ to control the string post-processing behaviour of the
+ library. Prints the parsed entries out in a slightly
+ different form that should be dead easy to parse in any
+ language (most punctuation and whitespace gone, format is
+ fixed and strictly line-based).
+@GLOBALS :
+@CREATED : May 1996, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse distribution (but not part
+ of the library itself). This is free software; you can
+ redistribute it and/or modify it under the terms of the GNU
+ General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your
+ option) any later version.
+-------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <btparse.h>
+
+#include "getopt.h" /* for optind */
+#include "args.h"
+
+#define DEBUG 0
+
+extern void dump_ast (char *msg, AST *root); /* stolen from btparse */
+
+/* One-line usage summary and the full option help printed by main() when
+   no filenames are given. */
+char * Usage = "usage: bibparse [options] file [...]\n";
+char * Help =
+"\n"
+"Options:\n"
+" -check check syntax only (ie. don't print entries out)\n"
+" -noquote don't quote strings [default]\n"
+" -quote put quotes around strings (warning: not bulletproof)\n"
+" -convert convert numeric values to strings\n"
+" -noconvert don't\n"
+" -expand expand macros [default]\n"
+" -noexpand don't\n"
+" -paste paste strings together (ie. obey # operator) [default]\n"
+" -nopaste don't\n"
+" -collapse collapse whitespace within strings [default]\n"
+" -nocollapse don't\n"
+"\n"
+"Default behaviour is \"fully processed\":\n"
+" -noquote -convert -expand -paste -collapse\n"
+"\n";
+
+/* Debug-print helper.  The outer "#if 0" compiles this whole section out,
+   so neither variant of dprintf() below is currently built.  If it were
+   enabled, DEBUG would select between a printf-style function writing to
+   stdout and an empty stub. */
+#if 0
+#if DEBUG
+void dprintf (char *format, ...)
+{
+ va_list arglist;
+
+ va_start (arglist, format);
+ vfprintf (stdout, format, arglist);
+ va_end (arglist);
+}
+#else
+void dprintf (char *format, ...) {}
+#endif
+#endif
+
+/* ------------------------------------------------------------------------
+@NAME       : print_assigned_entry()
+@INPUT      : stream        - stream the entry is written to
+              top           - AST of the entry to print
+              quote_strings - if true, string values are wrapped in {}
+@OUTPUT     :
+@RETURNS    :
+@DESCRIPTION: Writes a "field-assignment" entry (a macro definition or a
+              regular entry): an "@type [key]" line, then one
+              "name=value" line per field (the simple values of a field
+              are joined with '#'), then a blank line.
+@GLOBALS    :
+@CALLS      : btparse traversal functions
+@CALLERS    : print_entry()
+@CREATED    : 1997/08/12, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+print_assigned_entry (FILE *stream, AST *top, boolean quote_strings)
+{
+   char *  entry_type;
+   char *  entry_key;
+   char *  name;
+   AST *   cur_field;
+
+   entry_type = bt_entry_type (top);
+   entry_key = bt_entry_key (top);
+
+   /* Header line: "@type" optionally followed by the key. */
+   fprintf (stream, "@%s", entry_type);
+   if (entry_key != NULL)
+      fprintf (stream, " %s", entry_key);
+   fputc ('\n', stream);
+
+   for (cur_field = bt_next_field (top, NULL, &name);
+        cur_field != NULL;
+        cur_field = bt_next_field (top, cur_field, &name))
+   {
+      bt_nodetype  kind;
+      char *       text;
+      AST *        piece;
+      boolean      need_separator = FALSE;
+
+      fprintf (stream, "%s=", name);
+
+      for (piece = bt_next_value (cur_field, NULL, &kind, &text);
+           piece != NULL;
+           piece = bt_next_value (cur_field, piece, &kind, &text))
+      {
+         /* Every simple value after the first is preceded by '#'. */
+         if (need_separator)
+            fputc ('#', stream);
+         need_separator = TRUE;
+
+         if (text == NULL)
+            continue;
+
+         if (quote_strings && kind == BTAST_STRING)
+            fprintf (stream, "{%s}", text);
+         else
+            fputs (text, stream);
+      }
+
+      fputc ('\n', stream);             /* one field per line */
+   }
+
+   fputc ('\n', stream);                /* blank line terminates the entry */
+} /* print_assigned_entry() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : print_value_entry()
+@INPUT      : stream - stream the entry is written to
+              top    - AST of the entry to print
+@OUTPUT     :
+@RETURNS    :
+@DESCRIPTION: Writes a "value-only" entry (comment or preamble): an
+              "@type" line, then each non-NULL value text on a line of
+              its own, then a blank line.
+@GLOBALS    :
+@CALLS      : btparse traversal functions
+@CALLERS    : print_entry()
+@CREATED    : 1997/08/13, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+print_value_entry (FILE *stream, AST *top)
+{
+   char *  text;
+   AST *   piece;
+
+   fprintf (stream, "@%s\n", bt_entry_type (top));
+
+   for (piece = bt_next_value (top, NULL, NULL, &text);
+        piece != NULL;
+        piece = bt_next_value (top, piece, NULL, &text))
+   {
+      if (text != NULL)
+         fprintf (stream, "%s\n", text);
+   }
+
+   fputc ('\n', stream);
+
+} /* print_value_entry() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : print_entry()
+@INPUT      : stream        - stream the entry is written to
+              top           - AST of the entry to print
+              quote_strings - passed through to print_assigned_entry()
+@OUTPUT     :
+@RETURNS    :
+@DESCRIPTION: Dispatches on the entry's metatype: macro definitions and
+              regular entries go to print_assigned_entry(), comments and
+              preambles go to print_value_entry().  Any other metatype
+              produces a warning on stderr and nothing on `stream'.
+              When DEBUG is non-zero the AST is dumped before and after.
+@GLOBALS    :
+@CALLS      : print_assigned_entry(), print_value_entry()
+@CREATED    : 1997/01/22, GPW
+@MODIFIED   : 1997/08/13, GPW: changed to differentiate between the two,
+              ahem, meta-meta-types of entries
+-------------------------------------------------------------------------- */
+static void
+print_entry (FILE *stream, AST *top, boolean quote_strings)
+{
+   int metatype;
+
+#if DEBUG
+   dump_ast ("print_entry: AST before traversing =\n", top);
+#endif
+
+   metatype = (int) bt_entry_metatype (top);
+
+   if (metatype == BTE_MACRODEF || metatype == BTE_REGULAR)
+   {
+      print_assigned_entry (stream, top, quote_strings);
+   }
+   else if (metatype == BTE_COMMENT || metatype == BTE_PREAMBLE)
+   {
+      print_value_entry (stream, top);
+   }
+   else
+   {
+      fprintf (stderr, "warning: unknown entry type \"%s\"\n",
+               bt_entry_type (top));
+   }
+
+#if DEBUG
+   dump_ast ("print_entry: AST after traversing =\n", top);
+#endif
+} /* print_entry() [2nd version] */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : process_file
+@INPUT      : filename - file to parse; NULL or "-" means read stdin
+              options  - parser/printing options from parse_args()
+@OUTPUT     :
+@RETURNS    : true if every bt_parse_entry() call reported success
+              false if the file could not be opened or any call to
+                    bt_parse_entry() reported failure
+@DESCRIPTION: Parses an entire BibTeX file one entry at a time.  Each
+              entry is separately read, parsed, and printed back out
+              to minimize memory use.
+@GLOBALS    :
+@CALLS      : bt_set_stringopts(), bt_parse_entry(), print_entry(),
+              dump_ast(), bt_free_ast()
+@CREATED    : Jan 1997, GPW
+@MODIFIED   :
+@COMMENTS   : this *might* eventually wind up in the library, with
+              a function pointer argument to specify what to do
+              to each entry
+-------------------------------------------------------------------------- */
+static int
+process_file (char *filename, parser_options *options)
+{
+   FILE *   infile;
+   AST *    cur_entry;
+   boolean  status, overall_status;
+
+   /*
+    * If a string was given, and it's *not* "-", then open that filename.
+    * Otherwise just use stdin.
+    */
+
+   if (filename != NULL && strcmp (filename, "-") != 0)
+   {
+      infile = fopen (filename, "r");
+      if (infile == NULL)
+      {
+         perror (filename);
+         return 0;
+      }
+   }
+   else
+   {
+      filename = "(stdin)";
+      infile = stdin;
+   }
+
+   /* The same string-processing options apply to every entry metatype. */
+   bt_set_stringopts (BTE_MACRODEF, options->string_opts);
+   bt_set_stringopts (BTE_REGULAR, options->string_opts);
+   bt_set_stringopts (BTE_COMMENT, options->string_opts);
+   bt_set_stringopts (BTE_PREAMBLE, options->string_opts);
+
+   overall_status = 1;                  /* assume success */
+   while (1)
+   {
+      cur_entry = bt_parse_entry (infile, filename,
+                                  options->other_opts,
+                                  &status);
+      overall_status &= status;
+      if (!cur_entry) break;            /* no more entries */
+      if (!options->check_only)
+         print_entry (stdout, cur_entry, options->quote_strings);
+      if (options->dump_ast)
+         dump_ast ("AST for whole entry:\n", cur_entry);
+      bt_free_ast (cur_entry);
+   }
+
+   /*
+    * Close only streams opened here.  stdin must stay open: main() calls
+    * this once per argument, so "-" may be given again later.
+    */
+   if (infile != stdin)
+      fclose (infile);
+
+   return overall_status;
+
+} /* process_file() */
+
+
+/*
+ * Entry point for bibparse: parses the command-line options, then runs
+ * process_file() on every remaining argument.
+ *
+ * Exit status is bt_error_status(NULL) when that is non-zero.  If it is
+ * zero but some process_file() call returned false (for example a file
+ * that could not be opened), the exit status is 1 so the failure is
+ * still visible to the caller.  With no filenames, the usage and help
+ * text go to stderr and the exit status is 1.
+ */
+int main (int argc, char *argv[])
+{
+   parser_options * options;
+   int              i;
+   int              all_ok = 1;
+   int              exit_status;
+
+   options = parse_args (argc, argv);
+   bt_initialize ();
+
+   if (optind >= argc)                  /* no leftover arguments (filenames) */
+   {
+      fprintf (stderr, "%s", Usage);
+      fprintf (stderr, "%s", Help);
+      fprintf (stderr, "Not enough arguments\n");
+      exit (1);
+   }
+
+   for (i = optind; i < argc; i++)
+   {
+      if (!process_file (argv[i], options))
+         all_ok = 0;
+   }
+
+   bt_cleanup ();
+   free (options);
+
+   exit_status = bt_error_status (NULL);
+   if (exit_status == 0 && !all_ok)
+      exit_status = 1;
+   exit (exit_status);
+}
diff --git a/btparse/progs/dumpnames.c b/btparse/progs/dumpnames.c
new file mode 100644
index 0000000..831f1b5
--- /dev/null
+++ b/btparse/progs/dumpnames.c
@@ -0,0 +1,154 @@
+/* ------------------------------------------------------------------------
+@NAME : dumpnames.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Parses a BibTeX file, splitting `author' and `editor'
+ fields into lists of names, and then splitting the
+ individual names, and dumping everything.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/09/29, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+-------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "btparse.h"
+
+char *Usage = "usage: dumpnames file\n";
+
+
+/* prototypes */
+void dump_component (char * comp, bt_name * name, int part, char * tail);
+void dump_names (AST * entry);
+boolean process_file (char *filename);
+
+
+/*
+ * Prints one part of a split name as "comp: tok1/tok2/...tokN" followed
+ * by `tail' (nothing if `tail' is NULL).  Prints nothing at all when the
+ * selected part has no tokens.
+ *
+ *   comp - label printed before the tokens
+ *   name - split name whose parts[] / part_len[] arrays are read
+ *   part - index of the part to print
+ *   tail - text appended after the last token, or NULL
+ */
+void dump_component (char * comp, bt_name * name, int part, char * tail)
+{
+   char **  words = name->parts[part];
+   int      count = name->part_len[part];
+   int      idx;
+
+   if (count <= 0)
+      return;
+
+   printf ("%s: ", comp);
+   for (idx = 0; idx < count; idx++)
+   {
+      if (idx + 1 < count)
+         printf ("%s/", words[idx]);             /* more tokens follow */
+      else
+         printf ("%s%s", words[idx], tail ? tail : "");
+   }
+
+} /* dump_component () */
+
+
+/*
+ * For a regular entry, prints its key and type, then for each `author'
+ * or `editor' field prints the field text, the list of names it splits
+ * into (on "and"), and the first/von/last/jr parts of every name.
+ * Entries of any other metatype are reported as skipped.
+ *
+ * Every bt_name and bt_stringlist created here is released before
+ * returning.
+ */
+void dump_names (AST * entry)
+{
+   AST *            field;
+   char *           fname;
+   char *           value;
+   int              i;
+   bt_stringlist *  namelist;
+   bt_name *        name;
+
+   if (bt_entry_metatype (entry) != BTE_REGULAR)
+   {
+      printf ("skipping %s entry\n", bt_entry_type (entry));
+      return;
+   }
+
+   printf ("%s: %s\n", bt_entry_key (entry), bt_entry_type (entry));
+
+   field = NULL;
+   while ((field = bt_next_field (entry, field, &fname)))
+   {
+      if (strcmp (fname, "author") != 0 &&
+          strcmp (fname, "editor") != 0)
+         continue;                      /* only name-bearing fields */
+
+      /* NOTE(review): ownership of the string returned by bt_get_text()
+         is not visible here; confirm whether it must be freed. */
+      value = bt_get_text (field);
+
+      printf ("field: %s:\n", fname);
+      printf (" %s\n", value);
+
+      namelist = bt_split_list (value, "and", NULL, 0, "name");
+      if (namelist == NULL)
+         continue;
+
+      printf (" splits into %d names:\n", namelist->num_items);
+      for (i = 0; i < namelist->num_items; i++)
+      {
+         /* NOTE(review): bt_split_list() presumably leaves a NULL item
+            for an empty substring; guard so "%s" never sees NULL. */
+         if (namelist->items[i] == NULL)
+         {
+            printf (" (empty name)\n");
+            continue;
+         }
+
+         printf (" %s\n", namelist->items[i]);
+
+         name = bt_split_name (namelist->items[i], NULL, 0, i);
+         printf (" ");
+         dump_component ("first", name, BTN_FIRST, "; ");
+         dump_component ("von", name, BTN_VON, "; ");
+         dump_component ("last", name, BTN_LAST, "; ");
+         dump_component ("jr", name, BTN_JR, NULL);
+         printf ("\n");
+
+         bt_free_name (name);           /* done with this split name */
+      }
+
+      bt_free_list (namelist);          /* done with this field's list */
+   }
+
+} /* dump_names () */
+
+
+/*
+ * Opens `filename', parses it one entry at a time with bt_parse_entry(),
+ * and passes each entry to dump_names() before freeing it.
+ *
+ * Returns FALSE if the file cannot be opened or if any bt_parse_entry()
+ * call reported failure; TRUE otherwise.
+ */
+boolean process_file (char *filename)
+{
+   FILE *   infile;
+   AST *    entry;
+   boolean  status,
+            overall_status;
+
+   infile = fopen (filename, "r");
+   if (infile == NULL)
+   {
+      perror (filename);
+      return FALSE;
+   }
+
+   overall_status = TRUE;               /* assume success */
+   while (1)
+   {
+      entry = bt_parse_entry (infile, filename, 0, &status);
+      overall_status &= status;
+      if (!entry) break;                /* no more entries */
+      dump_names (entry);
+      bt_free_ast (entry);
+   }
+
+   fclose (infile);                     /* release the stream opened above */
+   return overall_status;
+
+} /* process_file () */
+
+
+/*
+ * Entry point for dumpnames: expects exactly one filename argument,
+ * processes it between bt_initialize() and bt_cleanup(), and exits with
+ * 0 if process_file() succeeded and 1 otherwise (including on a wrong
+ * argument count).
+ */
+int main (int argc, char **argv)
+{
+   boolean  ok;
+
+   if (argc == 2)
+   {
+      bt_initialize ();
+      ok = process_file (argv[1]);
+      bt_cleanup ();
+      exit (ok ? 0 : 1);
+   }
+
+   fprintf (stderr, "%s", Usage);
+   fprintf (stderr, "Wrong number of arguments\n");
+   exit (1);
+}
diff --git a/btparse/progs/getopt.c b/btparse/progs/getopt.c
new file mode 100644
index 0000000..0b270df
--- /dev/null
+++ b/btparse/progs/getopt.c
@@ -0,0 +1,813 @@
+/* Getopt for GNU.
+ NOTE: getopt is now part of the C library, so if you don't know what
+ "Keep this file name-space clean" means, talk to roland@gnu.ai.mit.edu
+ before changing it!
+
+ Copyright (C) 1987, 88, 89, 90, 91, 92, 93, 94
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License
+ as published by the Free Software Foundation; either version 2, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
+ Ditto for AIX 3.2 and <stdlib.h>. */
+#ifndef _NO_PROTO
+#define _NO_PROTO
+#endif
+
+#ifdef HAVE_CONFIG_H
+#if defined (emacs) || defined (CONFIG_BROKETS)
+/* We use <config.h> instead of "config.h" so that a compilation
+ using -I. -I$srcdir will use ./config.h rather than $srcdir/config.h
+ (which it would do because it found this file in $srcdir). */
+#include <config.h>
+#else
+#include "config.h"
+#endif
+#endif
+
+#ifndef __STDC__
+/* This is a separate conditional since some stdc systems
+ reject `defined (const)'. */
+#ifndef const
+#define const
+#endif
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+#if defined (_LIBC) || !defined (__GNU_LIBRARY__)
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+/* Don't include stdlib.h for non-GNU C libraries because some of them
+ contain conflicting prototypes for getopt. */
+#include <stdlib.h>
+#endif /* GNU C library. */
+
+/* This version of `getopt' appears to the caller like standard Unix `getopt'
+ but it behaves differently for the user, since it allows the user
+ to intersperse the options with the other arguments.
+
+ As `getopt' works, it permutes the elements of ARGV so that,
+ when it is done, all the options precede everything else. Thus
+ all application programs are extended to handle flexible argument order.
+
+ Setting the environment variable POSIXLY_CORRECT disables permutation.
+ Then the behavior is completely standard.
+
+ GNU application programs can use a third alternative mode in which
+ they can distinguish the relative order of options and other arguments. */
+
+#include "getopt.h"
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+char *optarg = NULL;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+/* XXX 1003.2 says this must be 1 before any call. */
+int optind = 0;
+
+/* The next char to be scanned in the option-element
+ in which the last option character we returned was found.
+ This allows us to pick up the scan where we left off.
+
+ If this is zero, or a null string, it means resume the scan
+ by advancing to the next ARGV-element. */
+
+static char *nextchar;
+
+/* Callers store zero here to inhibit the error message
+ for unrecognized options. */
+
+int opterr = 1;
+
+/* Set to an option character which was unrecognized.
+ This must be initialized on some systems to avoid linking in the
+ system's own getopt implementation. */
+
+int optopt = '?';
+
+/* Describe how to deal with options that follow non-option ARGV-elements.
+
+ If the caller did not specify anything,
+ the default is REQUIRE_ORDER if the environment variable
+ POSIXLY_CORRECT is defined, PERMUTE otherwise.
+
+ REQUIRE_ORDER means don't recognize them as options;
+ stop option processing when the first non-option is seen.
+ This is what Unix does.
+ This mode of operation is selected by either setting the environment
+ variable POSIXLY_CORRECT, or using `+' as the first character
+ of the list of option characters.
+
+ PERMUTE is the default. We permute the contents of ARGV as we scan,
+ so that eventually all the non-options are at the end. This allows options
+ to be given in any order, even with programs that were not written to
+ expect this.
+
+ RETURN_IN_ORDER is an option available to programs that were written
+ to expect options and other ARGV-elements in any order and that care about
+ the ordering of the two. We describe each non-option ARGV-element
+ as if it were the argument of an option with character code 1.
+ Using `-' as the first character of the list of option characters
+ selects this mode of operation.
+
+ The special argument `--' forces an end of option-scanning regardless
+ of the value of `ordering'. In the case of RETURN_IN_ORDER, only
+ `--' can cause `getopt' to return EOF with `optind' != ARGC. */
+
+static enum
+{
+ REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
+} ordering;
+
+#ifdef __GNU_LIBRARY__
+/* We want to avoid inclusion of string.h with non-GNU libraries
+ because there are many ways it can cause trouble.
+ On some systems, it contains special magic macros that don't work
+ in GCC. */
+#include <string.h>
+#define my_index strchr
+#else
+
+/* Avoid depending on library functions or files
+ whose names are inconsistent. */
+
+char *getenv ();
+
+/* Local stand-in for strchr: returns a pointer to the first occurrence
+   of CHR in STR, or 0 if there is none.  The terminating null is never
+   matched, so searching for '\0' returns 0. */
+static char *
+my_index (str, chr)
+     const char *str;
+     int chr;
+{
+  const char *scan;
+
+  for (scan = str; *scan != '\0'; scan++)
+    if (*scan == chr)
+      return (char *) scan;
+
+  return 0;
+}
+
+/* If using GCC, we can safely declare strlen this way.
+ If not using GCC, it is ok not to declare it. */
+#ifdef __GNUC__
+/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
+ That was relevant to code that was here before. */
+#ifndef __STDC__
+/* gcc with -traditional declares the built-in strlen to return int,
+ and has done so at least since version 2.4.5. -- rms. */
+extern int strlen (const char *);
+#endif /* not __STDC__ */
+#endif /* __GNUC__ */
+
+#endif /* not __GNU_LIBRARY__ */
+
+/* Handle permutation of arguments. */
+
+/* Describe the part of ARGV that contains non-options that have
+ been skipped. `first_nonopt' is the index in ARGV of the first of them;
+ `last_nonopt' is the index after the last of them. */
+
+static int first_nonopt;
+static int last_nonopt;
+
+/* Exchange two adjacent subsequences of ARGV.
+ One subsequence is elements [first_nonopt,last_nonopt)
+ which contains all the non-options that have been skipped so far.
+ The other is elements [last_nonopt,optind), which contains all
+ the options processed since those non-options were skipped.
+
+ `first_nonopt' and `last_nonopt' are relocated so that they describe
+ the new indices of the non-options in ARGV after they are moved. */
+
+/* Swap the skipped non-options ARGV[first_nonopt, last_nonopt) with the
+   options ARGV[last_nonopt, optind) that follow them, so the options end
+   up first.  Afterwards `first_nonopt' and `last_nonopt' describe where
+   the non-options now sit. */
+static void
+exchange (argv)
+     char **argv;
+{
+  int lo = first_nonopt;
+  int mid = last_nonopt;
+  int hi = optind;
+
+  /* Each pass moves the shorter of the two segments into its final
+     position by swapping it with the far end of the longer one, then
+     shrinks the range still to be rearranged. */
+
+  while (hi > mid && mid > lo)
+    {
+      int nonopt_len = mid - lo;
+      int opt_len = hi - mid;
+      char **left = argv + lo;
+      int k;
+
+      if (opt_len > nonopt_len)
+        {
+          /* Non-option segment is shorter: swap it with the tail of
+             the option segment. */
+          char **right = argv + (hi - nonopt_len);
+
+          for (k = 0; k < nonopt_len; k++)
+            {
+              char *held = left[k];
+              left[k] = right[k];
+              right[k] = held;
+            }
+          /* The moved non-options are now in place at the top. */
+          hi -= nonopt_len;
+        }
+      else
+        {
+          /* Option segment is shorter (or equal): swap it with the
+             head of the non-option segment. */
+          char **right = argv + mid;
+
+          for (k = 0; k < opt_len; k++)
+            {
+              char *held = left[k];
+              left[k] = right[k];
+              right[k] = held;
+            }
+          /* The moved options are now in place at the bottom. */
+          lo += opt_len;
+        }
+    }
+
+  /* Record the slots the non-options now occupy. */
+
+  first_nonopt += (optind - last_nonopt);
+  last_nonopt = optind;
+}
+
+/* Initialize the internal data when the first call is made. */
+
+/* First-call setup: start scanning at ARGV-element 1 with an empty range
+   of skipped non-options, clear `nextchar', and choose `ordering'.
+   A leading '-' in OPTSTRING selects RETURN_IN_ORDER and a leading '+'
+   selects REQUIRE_ORDER; otherwise REQUIRE_ORDER is used if the
+   environment variable POSIXLY_CORRECT is set and PERMUTE if not.
+   Returns OPTSTRING with any leading '-' or '+' skipped. */
+static const char *
+_getopt_initialize (optstring)
+     const char *optstring;
+{
+  optind = 1;
+  first_nonopt = 1;
+  last_nonopt = 1;
+  nextchar = NULL;
+
+  switch (optstring[0])
+    {
+    case '-':
+      ordering = RETURN_IN_ORDER;
+      return optstring + 1;
+
+    case '+':
+      ordering = REQUIRE_ORDER;
+      return optstring + 1;
+
+    default:
+      break;
+    }
+
+  /* No explicit mode character: fall back to the environment. */
+  if (getenv ("POSIXLY_CORRECT") != NULL)
+    ordering = REQUIRE_ORDER;
+  else
+    ordering = PERMUTE;
+
+  return optstring;
+}
+
+/* Scan elements of ARGV (whose length is ARGC) for option characters
+ given in OPTSTRING.
+
+ If an element of ARGV starts with '-', and is not exactly "-" or "--",
+ then it is an option element. The characters of this element
+ (aside from the initial '-') are option characters. If `getopt'
+ is called repeatedly, it returns successively each of the option characters
+ from each of the option elements.
+
+ If `getopt' finds another option character, it returns that character,
+ updating `optind' and `nextchar' so that the next call to `getopt' can
+ resume the scan with the following option character or ARGV-element.
+
+ If there are no more option characters, `getopt' returns `EOF'.
+ Then `optind' is the index in ARGV of the first ARGV-element
+ that is not an option. (The ARGV-elements have been permuted
+ so that those that are not options now come last.)
+
+ OPTSTRING is a string containing the legitimate option characters.
+ If an option character is seen that is not listed in OPTSTRING,
+ return '?' after printing an error message. If you set `opterr' to
+ zero, the error message is suppressed but we still return '?'.
+
+ If a char in OPTSTRING is followed by a colon, that means it wants an arg,
+ so the following text in the same ARGV-element, or the text of the following
+ ARGV-element, is returned in `optarg'. Two colons mean an option that
+ wants an optional arg; if there is text in the current ARGV-element,
+ it is returned in `optarg', otherwise `optarg' is set to zero.
+
+ If OPTSTRING starts with `-' or `+', it requests different methods of
+ handling the non-option ARGV-elements.
+ See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
+
+ Long-named options begin with `--' instead of `-'.
+ Their names may be abbreviated as long as the abbreviation is unique
+ or is an exact match for some defined option. If they have an
+ argument, it follows the option name in the same ARGV-element, separated
+ from the option name by a `=', or else in the next ARGV-element.
+ When `getopt' finds a long-named option, it returns 0 if that option's
+ `flag' field is nonzero, the value of the option's `val' field
+ if the `flag' field is zero.
+
+ The elements of ARGV aren't really const, because we permute them.
+ But we pretend they're const in the prototype to be compatible
+ with other systems.
+
+ LONGOPTS is a vector of `struct option' terminated by an
+ element containing a name which is zero.
+
+ LONGIND returns the index in LONGOPT of the long-named option found.
+ It is only valid when a long-named option has been found by the most
+ recent call.
+
+ If LONG_ONLY is nonzero, '-' as well as '--' can introduce
+ long-named options. */
+
+#if NLS
+#include "nl_types.h"
+#endif
+
+/* Core scanner behind getopt().  Summary of the values returned below:
+     EOF       - no more options (`optind' then indexes the first
+                 non-option, after any permutation);
+     1         - a non-option element, returned in `optarg', when
+                 `ordering' is neither PERMUTE nor REQUIRE_ORDER;
+     '?'       - ambiguous or unrecognized option, or an argument given
+                 to a long option that does not take one;
+     ':'/'?'   - missing required argument (':' only when OPTSTRING
+                 starts with ':');
+     0         - long option whose `flag' is non-null (*flag is set to
+                 `val');
+     otherwise - the long option's `val', or the short option character.  */
+int
+_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
+ int argc;
+ char *const *argv;
+ const char *optstring;
+ const struct option *longopts;
+ int *longind;
+ int long_only;
+{
+ /* Start every call with no argument; set below only if one is found. */
+ optarg = NULL;
+
+#if NLS
+ libc_nls_init();
+#endif
+
+ /* optind == 0 marks the very first call: initialize the scanner state. */
+ if (optind == 0)
+ optstring = _getopt_initialize (optstring);
+
+ if (nextchar == NULL || *nextchar == '\0')
+ {
+ /* Advance to the next ARGV-element. */
+
+ if (ordering == PERMUTE)
+ {
+ /* If we have just processed some options following some non-options,
+ exchange them so that the options come first. */
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (last_nonopt != optind)
+ first_nonopt = optind;
+
+ /* Skip any additional non-options
+ and extend the range of non-options previously skipped. */
+
+ while (optind < argc
+ && (argv[optind][0] != '-' || argv[optind][1] == '\0'))
+ optind++;
+ last_nonopt = optind;
+ }
+
+ /* The special ARGV-element `--' means premature end of options.
+ Skip it like a null option,
+ then exchange with previous non-options as if it were an option,
+ then skip everything else like a non-option. */
+
+ if (optind != argc && !strcmp (argv[optind], "--"))
+ {
+ optind++;
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (first_nonopt == last_nonopt)
+ first_nonopt = optind;
+ last_nonopt = argc;
+
+ optind = argc;
+ }
+
+ /* If we have done all the ARGV-elements, stop the scan
+ and back over any non-options that we skipped and permuted. */
+
+ if (optind == argc)
+ {
+ /* Set the next-arg-index to point at the non-options
+ that we previously skipped, so the caller will digest them. */
+ if (first_nonopt != last_nonopt)
+ optind = first_nonopt;
+ return EOF;
+ }
+
+ /* If we have come to a non-option and did not permute it,
+ either stop the scan or describe it to the caller and pass it by. */
+
+ if ((argv[optind][0] != '-' || argv[optind][1] == '\0'))
+ {
+ if (ordering == REQUIRE_ORDER)
+ return EOF;
+ optarg = argv[optind++];
+ return 1;
+ }
+
+ /* We have found another option-ARGV-element.
+ Skip the initial punctuation. */
+
+ nextchar = (argv[optind] + 1
+ + (longopts != NULL && argv[optind][1] == '-'));
+ }
+
+ /* Decode the current option-ARGV-element. */
+
+ /* Check whether the ARGV-element is a long option.
+
+ If long_only and the ARGV-element has the form "-f", where f is
+ a valid short option, don't consider it an abbreviated form of
+ a long option that starts with f. Otherwise there would be no
+ way to give the -f short option.
+
+ On the other hand, if there's a long option "fubar" and
+ the ARGV-element is "-fu", do consider that an abbreviation of
+ the long option, just like "--fu", and not "-f" with arg "u".
+
+ This distinction seems to be the most useful approach. */
+
+ if (longopts != NULL
+ && (argv[optind][1] == '-'
+ || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
+ {
+ char *nameend;
+ const struct option *p;
+ const struct option *pfound = NULL;
+ int exact = 0;
+ int ambig = 0;
+ /* indfound is assigned wherever pfound is set in the loop below, and
+ is read only inside the `pfound != NULL' branch. */
+ int indfound;
+ int option_index;
+
+ /* The option name ends at '=' (an attached argument) or at the end
+ of the element. */
+ for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
+ /* Do nothing. */ ;
+
+ /* Test all long options for either exact match
+ or abbreviated matches. */
+ for (p = longopts, option_index = 0; p->name; p++, option_index++)
+ if (!strncmp (p->name, nextchar, nameend - nextchar))
+ {
+ if (nameend - nextchar == strlen (p->name))
+ {
+ /* Exact match found. */
+ pfound = p;
+ indfound = option_index;
+ exact = 1;
+ break;
+ }
+ else if (pfound == NULL)
+ {
+ /* First nonexact match found. */
+ pfound = p;
+ indfound = option_index;
+ }
+ else
+ /* Second or later nonexact match found. */
+ ambig = 1;
+ }
+
+ if (ambig && !exact)
+ {
+ if (opterr)
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptAmbiguous,
+ "%s: option `%s' is ambiguous\n"),
+ argv[0], argv[optind]);
+#else
+ fprintf (stderr, "%s: option `%s' is ambiguous\n",
+ argv[0], argv[optind]);
+#endif
+ nextchar += strlen (nextchar);
+ optind++;
+ return '?';
+ }
+
+ if (pfound != NULL)
+ {
+ option_index = indfound;
+ optind++;
+ if (*nameend)
+ {
+ /* Don't test has_arg with >, because some C compilers don't
+ allow it to be used on enums. */
+ if (pfound->has_arg)
+ optarg = nameend + 1;
+ else
+ {
+ if (opterr)
+ {
+ if (argv[optind - 1][1] == '-')
+ /* --option */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptNoArgumentsAllowed1,
+ "%s: option `--%s' doesn't allow an argument\n"),
+ argv[0], pfound->name);
+#else
+ fprintf (stderr,
+ "%s: option `--%s' doesn't allow an argument\n",
+ argv[0], pfound->name);
+#endif
+ else
+ /* +option or -option */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptNoArgumentsAllowed2,
+ "%s: option `%c%s' doesn't allow an argument\n"),
+ argv[0], argv[optind - 1][0], pfound->name);
+#else
+ fprintf (stderr,
+ "%s: option `%c%s' doesn't allow an argument\n",
+ argv[0], argv[optind - 1][0], pfound->name);
+#endif
+ }
+ nextchar += strlen (nextchar);
+ return '?';
+ }
+ }
+ else if (pfound->has_arg == 1)
+ {
+ if (optind < argc)
+ optarg = argv[optind++];
+ else
+ {
+ if (opterr)
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptRequiresArgument1,
+ "%s: option `%s' requires an argument\n"),
+ argv[0], argv[optind - 1]);
+#else
+ fprintf (stderr, "%s: option `%s' requires an argument\n",
+ argv[0], argv[optind - 1]);
+#endif
+ nextchar += strlen (nextchar);
+ /* A leading ':' in OPTSTRING selects ':' instead of '?' for a
+ missing argument. */
+ return optstring[0] == ':' ? ':' : '?';
+ }
+ }
+ nextchar += strlen (nextchar);
+ if (longind != NULL)
+ *longind = option_index;
+ if (pfound->flag)
+ {
+ *(pfound->flag) = pfound->val;
+ return 0;
+ }
+ return pfound->val;
+ }
+
+ /* Can't find it as a long option. If this is not getopt_long_only,
+ or the option starts with '--' or is not a valid short
+ option, then it's an error.
+ Otherwise interpret it as a short option. */
+ if (!long_only || argv[optind][1] == '-'
+ || my_index (optstring, *nextchar) == NULL)
+ {
+ if (opterr)
+ {
+ if (argv[optind][1] == '-')
+ /* --option */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptUnrecognized1,
+ "%s: unrecognized option `--%s'\n"),
+ argv[0], nextchar);
+#else
+ fprintf (stderr, "%s: unrecognized option `--%s'\n",
+ argv[0], nextchar);
+#endif
+ else
+ /* +option or -option */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptUnrecognized2,
+ "%s: unrecognized option `%c%s'\n"),
+ argv[0], argv[optind][0], nextchar);
+#else
+ fprintf (stderr, "%s: unrecognized option `%c%s'\n",
+ argv[0], argv[optind][0], nextchar);
+#endif
+ }
+ nextchar = (char *) "";
+ optind++;
+ return '?';
+ }
+ }
+
+ /* Look at and handle the next short option-character. */
+
+ {
+ /* Consume one option character and look it up in OPTSTRING. */
+ char c = *nextchar++;
+ char *temp = my_index (optstring, c);
+
+ /* Increment `optind' when we start to process its last character. */
+ if (*nextchar == '\0')
+ ++optind;
+
+ if (temp == NULL || c == ':')
+ {
+ if (opterr)
+ {
+ /* 1003.2 specifies the format of this message. */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet, GetoptIllegal,
+ "%s: illegal option -- %c\n"),
+ argv[0], c);
+#else
+ fprintf (stderr, "%s: illegal option -- %c\n", argv[0], c);
+#endif
+ }
+ optopt = c;
+ return '?';
+ }
+ if (temp[1] == ':')
+ {
+ if (temp[2] == ':')
+ {
+ /* This is an option that accepts an argument optionally. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ optind++;
+ }
+ else
+ optarg = NULL;
+ nextchar = NULL;
+ }
+ else
+ {
+ /* This is an option that requires an argument. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ /* If we end this ARGV-element by taking the rest as an arg,
+ we must advance to the next element now. */
+ optind++;
+ }
+ else if (optind == argc)
+ {
+ if (opterr)
+ {
+ /* 1003.2 specifies the format of this message. */
+#if NLS
+ fprintf (stderr,
+ catgets(_libc_cat, GetoptSet,
+ GetoptRequiresArgument2,
+ "%s: option requires an argument -- %c\n"),
+ argv[0], c);
+#else
+ fprintf (stderr, "%s: option requires an argument -- %c\n",
+ argv[0], c);
+#endif
+ }
+ optopt = c;
+ if (optstring[0] == ':')
+ c = ':';
+ else
+ c = '?';
+ }
+ else
+ /* We already incremented `optind' once;
+ increment it again when taking next ARGV-elt as argument. */
+ optarg = argv[optind++];
+ nextchar = NULL;
+ }
+ }
+ return c;
+ }
+}
+
+/* Short-option-only front end: forwards to _getopt_internal with no
+   long-option table, no long-index output, and long_only off. */
+int
+getopt (argc, argv, optstring)
+     int argc;
+     char *const *argv;
+     const char *optstring;
+{
+  const struct option *no_longopts = 0;
+  int *no_longind = 0;
+  int long_only = 0;
+
+  return _getopt_internal (argc, argv, optstring,
+                           no_longopts, no_longind, long_only);
+}
+
+#endif /* _LIBC or not __GNU_LIBRARY__. */
+
+#ifdef TEST
+
+/* Compile with -DTEST to make an executable for use in testing
+ the above definition of `getopt'. */
+
+/* Test driver (built only with -DTEST): runs getopt over ARGV with the
+   option string "abc:d:0123456789", reports each option it returns, then
+   lists any remaining non-option ARGV-elements. */
+int
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  int digit_optind = 0;
+  int c;
+
+  for (;;)
+    {
+      /* Remember which ARGV-element this option came from. */
+      int this_option_optind = optind ? optind : 1;
+
+      c = getopt (argc, argv, "abc:d:0123456789");
+      if (c == EOF)
+        break;
+
+      if (c >= '0' && c <= '9')
+        {
+          if (digit_optind != 0 && digit_optind != this_option_optind)
+            printf ("digits occur in two different argv-elements.\n");
+          digit_optind = this_option_optind;
+          printf ("option %c\n", c);
+        }
+      else if (c == 'a')
+        printf ("option a\n");
+      else if (c == 'b')
+        printf ("option b\n");
+      else if (c == 'c')
+        printf ("option c with value `%s'\n", optarg);
+      else if (c == '?')
+        ;                               /* nothing is printed here for '?' */
+      else
+        printf ("?? getopt returned character code 0%o ??\n", c);
+    }
+
+  if (optind < argc)
+    {
+      printf ("non-option ARGV-elements: ");
+      for (; optind < argc; optind++)
+        printf ("%s ", argv[optind]);
+      printf ("\n");
+    }
+
+  exit (0);
+}
+
+#endif /* TEST */
diff --git a/btparse/progs/getopt.h b/btparse/progs/getopt.h
new file mode 100644
index 0000000..d751c6b
--- /dev/null
+++ b/btparse/progs/getopt.h
@@ -0,0 +1,132 @@
+/* Declarations for getopt.
+ Copyright (C) 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc.
+
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB. If
+not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA. */
+
+#ifndef _GETOPT_H
+#define _GETOPT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+ for unrecognized options. */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized. */
+
+extern int optopt;
+
+/* Describe the long-named options requested by the application.
+ The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+ of `struct option' terminated by an element containing a name which is
+ zero.
+
+ The field `has_arg' is:
+ no_argument (or 0) if the option does not take an argument,
+ required_argument (or 1) if the option requires an argument,
+ optional_argument (or 2) if the option takes an optional argument.
+
+ If the field `flag' is not NULL, it points to a variable that is set
+ to the value given in the field `val' when the option is found, but
+ left unchanged if the option is not found.
+
+ To have a long-named option do something other than set an `int' to
+ a compiled-in constant, such as set a value from `optarg', set the
+ option's `flag' field to zero and its `val' field to a nonzero
+ value (the equivalent single-letter option character, if there is
+ one). For long options that have a zero `flag' field, `getopt'
+ returns the contents of the `val' field. */
+
+struct option
+{
+ /* Long option name; an element whose name is zero terminates the
+ vector passed to getopt_long / getopt_long_only. */
+#if __STDC__
+ const char *name;
+#else
+ char *name;
+#endif
+ /* has_arg can't be an enum because some compilers complain about
+ type mismatches in all the code that assumes it is an int. */
+ /* One of no_argument, required_argument or optional_argument. */
+ int has_arg;
+ /* If not NULL, points to a variable that is set to `val' when the
+ option is found and left unchanged otherwise. */
+ int *flag;
+ /* Value stored through `flag', or returned by `getopt' when `flag'
+ is zero. */
+ int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'. */
+
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
+
+#if __STDC__
+#if defined(__GNU_LIBRARY__)
+/* Many other libraries have conflicting prototypes for getopt, with
+ differences in the consts, in stdlib.h. To avoid compilation
+ errors, only prototype getopt for the GNU C library. */
+extern int getopt (int argc, char *const *argv, const char *shortopts);
+#else /* not __GNU_LIBRARY__ */
+extern int getopt ();
+#endif /* not __GNU_LIBRARY__ */
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
+ const struct option *longopts, int *longind);
+extern int getopt_long_only (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind);
+
+/* Internal only. Users should not call this directly. */
+extern int _getopt_internal (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind,
+ int long_only);
+#else /* not __STDC__ */
+extern int getopt ();
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+#endif /* not __STDC__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GETOPT_H */
diff --git a/btparse/progs/getopt1.c b/btparse/progs/getopt1.c
new file mode 100644
index 0000000..90dc12a
--- /dev/null
+++ b/btparse/progs/getopt1.c
@@ -0,0 +1,187 @@
+/* getopt_long and getopt_long_only entry points for GNU getopt.
+ Copyright (C) 1987, 88, 89, 90, 91, 92, 1993
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License
+ as published by the Free Software Foundation; either version 2, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#ifdef HAVE_CONFIG_H
+#if defined (emacs) || defined (CONFIG_BROKETS)
+/* We use <config.h> instead of "config.h" so that a compilation
+ using -I. -I$srcdir will use ./config.h rather than $srcdir/config.h
+ (which it would do because it found this file in $srcdir). */
+#include <config.h>
+#else
+#include "config.h"
+#endif
+#endif
+
+#include "getopt.h"
+
+#ifndef __STDC__
+/* This is a separate conditional since some stdc systems
+ reject `defined (const)'. */
+#ifndef const
+#define const
+#endif
+#endif
+
+#include <stdio.h>
+
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+#if defined (_LIBC) || !defined (__GNU_LIBRARY__)
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+#include <stdlib.h>
+#else
+char *getenv ();
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+/* Long-option entry point.  Forwards all five arguments unchanged to
+   _getopt_internal with a final (long_only) argument of 0, and returns
+   its result.  */
+int
+getopt_long (argc, argv, options, long_options, opt_index)
+ int argc;
+ char *const *argv;
+ const char *options;
+ const struct option *long_options;
+ int *opt_index;
+{
+ return _getopt_internal (argc, argv, options, long_options, opt_index, 0);
+}
+
+/* Like getopt_long, but '-' as well as '--' can indicate a long option.
+ If an option that starts with '-' (not '--') doesn't match a long option,
+ but does match a short option, it is parsed as a short option
+ instead.
+
+ Implemented by forwarding all five arguments to _getopt_internal with
+ a final (long_only) argument of 1; the result is returned unchanged. */
+
+int
+getopt_long_only (argc, argv, options, long_options, opt_index)
+ int argc;
+ char *const *argv;
+ const char *options;
+ const struct option *long_options;
+ int *opt_index;
+{
+ return _getopt_internal (argc, argv, options, long_options, opt_index, 1);
+}
+
+
+#endif /* _LIBC or not __GNU_LIBRARY__. */
+
+#ifdef TEST
+
+#include <stdio.h>
+
+/* Test driver for `getopt_long' (built only with -DTEST).
+
+   Parses ARGV with the short option string "abc:d:0123456789" and the
+   long options add, append, delete, verbose, create and file, printing
+   one line per option found, followed by any remaining non-option
+   ARGV-elements.  Always returns 0.  */
+int
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  int c;
+  /* ARGV index at which a digit option was last seen (0 = none yet).  */
+  int digit_optind = 0;
+
+  while (1)
+    {
+      /* Remember where this option starts; optind is 0 before the
+         first call, in which case scanning begins at element 1.  */
+      int this_option_optind = optind ? optind : 1;
+      int option_index = 0;
+      /* Every entry has flag == 0 and val == 0, so getopt_long returns
+         0 for each long option and option_index identifies which.  */
+      static struct option long_options[] =
+      {
+        {"add", 1, 0, 0},
+        {"append", 0, 0, 0},
+        {"delete", 1, 0, 0},
+        {"verbose", 0, 0, 0},
+        {"create", 0, 0, 0},
+        {"file", 1, 0, 0},
+        {0, 0, 0, 0}
+      };
+
+      c = getopt_long (argc, argv, "abc:d:0123456789",
+                       long_options, &option_index);
+      if (c == EOF)
+        break;
+
+      switch (c)
+        {
+        case 0:
+          printf ("option %s", long_options[option_index].name);
+          if (optarg)
+            printf (" with arg %s", optarg);
+          printf ("\n");
+          break;
+
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          if (digit_optind != 0 && digit_optind != this_option_optind)
+            printf ("digits occur in two different argv-elements.\n");
+          digit_optind = this_option_optind;
+          printf ("option %c\n", c);
+          break;
+
+        case 'a':
+          printf ("option a\n");
+          break;
+
+        case 'b':
+          printf ("option b\n");
+          break;
+
+        case 'c':
+          printf ("option c with value `%s'\n", optarg);
+          break;
+
+        case 'd':
+          printf ("option d with value `%s'\n", optarg);
+          break;
+
+        case '?':
+          /* getopt_long has already printed a diagnostic.  */
+          break;
+
+        default:
+          printf ("?? getopt returned character code 0%o ??\n", c);
+        }
+    }
+
+  if (optind < argc)
+    {
+      printf ("non-option ARGV-elements: ");
+      while (optind < argc)
+        printf ("%s ", argv[optind++]);
+      printf ("\n");
+    }
+
+  /* Return instead of calling exit: this file includes <stdlib.h> only
+     inside the `_LIBC or not __GNU_LIBRARY__' section (and there only
+     when __GNU_LIBRARY__ is defined), so `exit' has no declaration in
+     an ordinary -DTEST build.  */
+  return 0;
+}
+
+#endif /* TEST */
diff --git a/btparse/src/attrib.h b/btparse/src/attrib.h
new file mode 100644
index 0000000..4f4db87
--- /dev/null
+++ b/btparse/src/attrib.h
@@ -0,0 +1,35 @@
+/* ------------------------------------------------------------------------
+@NAME : attrib.h
+@DESCRIPTION: Definition of the Attrib type needed by the PCCTS-
+ generated parser.
+@CREATED : Summer 1996, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef ATTRIB_H
+#define ATTRIB_H
+
+/*
+ * Defining Attrib this way (as opposed to making it a pointer to a struct)
+ * avoids the expense of allocating/deallocating a structure for each token;
+ * this way, PCCTS statically allocates the whole stack once and that's
+ * it. (Of course, the stack is four times bigger than it would have been
+ * otherwise.)
+ *
+ * NOTE(review): the field notes below are inferred from the field names
+ * only; confirm against the code that fills in an Attrib.
+ */
+
+typedef struct {
+ int line; /* presumably the input line of the token -- TODO confirm */
+ int offset; /* presumably the token's position within the line -- TODO confirm */
+ int token; /* presumably the token type code -- TODO confirm */
+ char *text; /* presumably the token's text -- TODO confirm */
+} Attrib;
+
+#endif /* ATTRIB_H */
diff --git a/btparse/src/bibtex.c b/btparse/src/bibtex.c
new file mode 100644
index 0000000..9ca2ee4
--- /dev/null
+++ b/btparse/src/bibtex.c
@@ -0,0 +1,351 @@
+/*
+ * A n t l r T r a n s l a t i o n H e a d e r
+ *
+ * Terence Parr, Will Cohen, and Hank Dietz: 1989-1994
+ * Purdue University Electrical Engineering
+ * With AHPCRC, University of Minnesota
+ * ANTLR Version 1.33
+ */
+#include <stdio.h>
+#define ANTLR_VERSION 133
+
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+#include "parse_auxiliary.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+#define GENAST
+
+#include "../pccts/ast.h"
+
+#define zzSET_SIZE 4
+#include "../pccts/antlr.h"
+#include "tokens.h"
+#include "../pccts/dlgdef.h"
+#include "mode.h"
+#ifndef PURIFY
+#define PURIFY(r,s)
+#endif
+#include "../pccts/ast.c"
+zzASTgvars
+
+ANTLR_INFO
+
+/*
+ * bibfile: parse a sequence of entries and chain them into a forest.
+ *
+ * While the lookahead token is AT, calls entry() and appends the AST it
+ * produced to a list linked through the `right' pointers; *_root is set
+ * to the first entry's AST, or left NULL if there were no entries.
+ *
+ * `last' (the most recently appended entry) is initialised to NULL so
+ * that it never holds an indeterminate value; it is only dereferenced
+ * once *_root is non-NULL, i.e. after it has been assigned.
+ *
+ * NOTE: this file is ANTLR output; the same declaration appears in the
+ * `bibfile' action in bibtex.g, which must carry the same change for it
+ * to survive regeneration.
+ */
+void
+#ifdef __STDC__
+bibfile(AST**_root)
+#else
+bibfile(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ AST *last = NULL; (*_root) = NULL;
+ {
+ zzBLOCK(zztasp2);
+ zzMake0;
+ {
+ while ( LA(1)==AT ) {
+ _ast = NULL; entry(&_ast);
+ /* a little creative forestry... */
+ if ((*_root) == NULL)
+ (*_root) = zzastArg(1);
+ else
+ last->right = zzastArg(1);
+ last = zzastArg(1);
+ zzLOOP(zztasp2);
+ }
+ zzEXIT(zztasp2);
+ }
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ /* Error exit: report the syntax error, then resynchronise. */
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd1, 0x1);
+ }
+}
+
+/*
+ * entry: parse a single entry.
+ *
+ * Matches AT, then NAME (the entry type), which is installed via
+ * zzsubroot.  The metatype returned by entry_metatype() is recorded on
+ * that node together with nodetype BTAST_ENTRY, and the remainder of the
+ * entry is parsed by body(), which receives the metatype.
+ *
+ * The `fail' label is the error exit: it reports through zzsyn and
+ * resynchronises with zzresynch.  (Presumably the zzmatch macros jump
+ * there on a mismatch -- confirm in pccts/antlr.h.)
+ */
+void
+#ifdef __STDC__
+entry(AST**_root)
+#else
+entry(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ bt_metatype metatype;
+ zzmatch(AT); zzCONSUME;
+ zzmatch(NAME); zzsubroot(_root, &_sibling, &_tail);
+
+ metatype = entry_metatype();
+ zzastArg(1)->nodetype = BTAST_ENTRY;
+ zzastArg(1)->metatype = metatype;
+ zzCONSUME;
+
+ body(zzSTR, metatype ); zzlink(_root, &_sibling, &_tail);
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd1, 0x2);
+ }
+}
+
+/*
+ * body: parse what follows the entry type.
+ *
+ * Two alternatives, chosen on one token of lookahead:
+ *   - STRING: accepted only when metatype == BTE_COMMENT (otherwise
+ *     zzfailed_pred is invoked); the node is tagged BTAST_STRING.
+ *   - ENTRY_OPEN contents ENTRY_CLOSE: contents() receives the metatype.
+ * Any other lookahead token goes through zzFAIL to the `fail' error exit.
+ */
+void
+#ifdef __STDC__
+body(AST**_root, bt_metatype metatype )
+#else
+body(_root,metatype)
+AST **_root;
+ bt_metatype metatype ;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ if ( LA(1)==STRING) {
+ if (!(metatype == BTE_COMMENT )) {zzfailed_pred(" metatype == BTE_COMMENT ");}
+ zzmatch(STRING); zzsubchild(_root, &_sibling, &_tail);
+ zzastArg(1)->nodetype = BTAST_STRING;
+ zzCONSUME;
+
+ }
+ else {
+ if ( LA(1)==ENTRY_OPEN) {
+ zzmatch(ENTRY_OPEN); zzCONSUME;
+ contents(zzSTR, metatype ); zzlink(_root, &_sibling, &_tail);
+ zzmatch(ENTRY_CLOSE); zzCONSUME;
+ }
+ else {zzFAIL(1,zzerr1,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText,&zzErrk); goto fail;}
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd1, 0x4);
+ }
+}
+
+/*
+ * contents: parse the inside of a delimited entry, according to metatype.
+ *
+ * Each alternative is guarded by a lookahead-set test on setwd1 and by a
+ * predicate on metatype:
+ *   - BTE_REGULAR:  a key (NAME or NUMBER, tagged BTAST_KEY), then COMMA,
+ *                   then fields().
+ *   - BTE_MACRODEF: fields() only.
+ *   - BTE_PREAMBLE: value() only.
+ * If no alternative applies, zzFAIL is invoked and control goes to the
+ * `fail' error exit.
+ */
+void
+#ifdef __STDC__
+contents(AST**_root, bt_metatype metatype )
+#else
+contents(_root,metatype)
+AST **_root;
+ bt_metatype metatype ;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ if ( (setwd1[LA(1)]&0x8)&&(metatype == BTE_REGULAR /* || metatype == BTE_MODIFY */ ) ) {
+ if (!(metatype == BTE_REGULAR /* || metatype == BTE_MODIFY */ )) {zzfailed_pred(" metatype == BTE_REGULAR /* || metatype == BTE_MODIFY */ ");}
+ {
+ zzBLOCK(zztasp2);
+ zzMake0;
+ {
+ /* The key may be a NUMBER as well as a NAME. */
+ if ( LA(1)==NAME ) {
+ zzmatch(NAME); zzsubchild(_root, &_sibling, &_tail); zzCONSUME;
+ }
+ else {
+ if ( LA(1)==NUMBER) {
+ zzmatch(NUMBER); zzsubchild(_root, &_sibling, &_tail); zzCONSUME;
+ }
+ else {zzFAIL(1,zzerr2,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText,&zzErrk); goto fail;}
+ }
+ zzEXIT(zztasp2);
+ }
+ }
+ zzastArg(1)->nodetype = BTAST_KEY;
+ zzmatch(COMMA); zzCONSUME;
+ fields(zzSTR); zzlink(_root, &_sibling, &_tail);
+ }
+ else {
+ if ( (setwd1[LA(1)]&0x10)&&(metatype == BTE_MACRODEF ) ) {
+ if (!(metatype == BTE_MACRODEF )) {zzfailed_pred(" metatype == BTE_MACRODEF ");}
+ fields(zzSTR); zzlink(_root, &_sibling, &_tail);
+ }
+ else {
+ if ( (setwd1[LA(1)]&0x20)&&(metatype == BTE_PREAMBLE ) ) {
+ if (!(metatype == BTE_PREAMBLE )) {zzfailed_pred(" metatype == BTE_PREAMBLE ");}
+ value(zzSTR); zzlink(_root, &_sibling, &_tail);
+ }
+ else {zzFAIL(1,zzerr3,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText,&zzErrk); goto fail;}
+ }
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd1, 0x40);
+ }
+}
+
+/*
+ * fields: parse a comma-separated list of fields.
+ *
+ *   - Lookahead NAME: parse one field(); if a COMMA follows, consume it
+ *     and recurse into fields() for the rest of the list.
+ *   - Lookahead ENTRY_CLOSE: the empty alternative; nothing is consumed.
+ *     Because the recursion happens after a COMMA, this also accepts a
+ *     trailing comma before the closing delimiter.
+ * Any other lookahead token goes through zzFAIL to the `fail' error exit.
+ */
+void
+#ifdef __STDC__
+fields(AST**_root)
+#else
+fields(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ if ( LA(1)==NAME) {
+ field(zzSTR); zzlink(_root, &_sibling, &_tail);
+ {
+ zzBLOCK(zztasp2);
+ zzMake0;
+ {
+ if ( LA(1)==COMMA) {
+ zzmatch(COMMA); zzCONSUME;
+ fields(zzSTR); zzlink(_root, &_sibling, &_tail);
+ }
+ zzEXIT(zztasp2);
+ }
+ }
+ }
+ else {
+ if ( LA(1)==ENTRY_CLOSE) {
+ }
+ else {zzFAIL(1,zzerr4,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText,&zzErrk); goto fail;}
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd1, 0x80);
+ }
+}
+
+/*
+ * field: parse one `NAME EQUALS value' assignment.
+ *
+ * The NAME is installed via zzsubroot, tagged BTAST_FIELD and passed to
+ * check_field_name(); EQUALS is then matched and the right-hand side is
+ * parsed by value().  When DEBUG > 1 the field name and the first value
+ * are printed.
+ */
+void
+#ifdef __STDC__
+field(AST**_root)
+#else
+field(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ zzmatch(NAME); zzsubroot(_root, &_sibling, &_tail);
+ zzastArg(1)->nodetype = BTAST_FIELD; check_field_name (zzastArg(1));
+ zzCONSUME;
+
+ zzmatch(EQUALS); zzCONSUME;
+ value(zzSTR); zzlink(_root, &_sibling, &_tail);
+
+#if DEBUG > 1
+ printf ("field: fieldname = %p (%s)\n"
+ " first val = %p (%s)\n",
+ zzastArg(1)->text, zzastArg(1)->text, zzastArg(2)->text, zzastArg(2)->text);
+#endif
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd2, 0x1);
+ }
+}
+
+/*
+ * value: parse a compound value, `simple_value (HASH simple_value)*'.
+ *
+ * Parses one simple_value(), then, for as long as the lookahead token is
+ * HASH, consumes the HASH and parses another simple_value().  Each
+ * simple value is attached with zzlink.
+ */
+void
+#ifdef __STDC__
+value(AST**_root)
+#else
+value(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ simple_value(zzSTR); zzlink(_root, &_sibling, &_tail);
+ {
+ zzBLOCK(zztasp2);
+ zzMake0;
+ {
+ while ( LA(1)==HASH) {
+ zzmatch(HASH); zzCONSUME;
+ simple_value(zzSTR); zzlink(_root, &_sibling, &_tail);
+ zzLOOP(zztasp2);
+ }
+ zzEXIT(zztasp2);
+ }
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd2, 0x2);
+ }
+}
+
+/*
+ * simple_value: parse one operand of a value.
+ *
+ * Accepts exactly one token, selected on the lookahead, and tags the
+ * resulting node accordingly:
+ *   STRING -> BTAST_STRING
+ *   NUMBER -> BTAST_NUMBER
+ *   NAME   -> BTAST_MACRO
+ * Any other lookahead token goes through zzFAIL to the `fail' error exit.
+ */
+void
+#ifdef __STDC__
+simple_value(AST**_root)
+#else
+simple_value(_root)
+AST **_root;
+#endif
+{
+ zzRULE;
+ zzBLOCK(zztasp1);
+ zzMake0;
+ {
+ if ( LA(1)==STRING) {
+ zzmatch(STRING); zzsubchild(_root, &_sibling, &_tail);
+ zzastArg(1)->nodetype = BTAST_STRING;
+ zzCONSUME;
+ }
+ else {
+ if ( LA(1)==NUMBER) {
+ zzmatch(NUMBER); zzsubchild(_root, &_sibling, &_tail);
+ zzastArg(1)->nodetype = BTAST_NUMBER;
+ zzCONSUME;
+ }
+ else {
+ if ( LA(1)==NAME) {
+ zzmatch(NAME); zzsubchild(_root, &_sibling, &_tail);
+ zzastArg(1)->nodetype = BTAST_MACRO;
+ zzCONSUME;
+ }
+ else {zzFAIL(1,zzerr5,&zzMissSet,&zzMissText,&zzBadTok,&zzBadText,&zzErrk); goto fail;}
+ }
+ }
+ zzEXIT(zztasp1);
+ return;
+fail:
+ zzEXIT(zztasp1);
+ zzsyn(zzMissText, zzBadTok, (ANTLRChar *)"", zzMissSet, zzMissTok, zzErrk, zzBadText);
+ zzresynch(setwd2, 0x4);
+ }
+}
diff --git a/btparse/src/bibtex.g b/btparse/src/bibtex.g
new file mode 100644
index 0000000..1dec34f
--- /dev/null
+++ b/btparse/src/bibtex.g
@@ -0,0 +1,413 @@
+/* ------------------------------------------------------------------------
+@NAME : bibtex.g
+@DESCRIPTION: PCCTS-based lexer and parser for BibTeX files. (Or rather,
+ for the BibTeX data description language. This parser
+ enforces nothing about the structure and contents of
+ entries; that's up to higher-level processors. Thus, there's
+ nothing either particularly bibliographic or TeXish about
+ the language accepted by this parser, apart from the affinity
+ for curly braces.)
+
+ There are a few minor differences from the language accepted
+ by BibTeX itself, but these are generally improvements over
+ BibTeX's behaviour. See the comments in the grammar, at least
+ until I write a decent description of the language.
+
+ I have used Gerd Neugebauer's BibTool (yet another BibTeX
+ parser, along with a prettyprinter and specialized language
+ for a common set of bibhacks) as another check of correctness
+ -- there are a few screwball things that BibTeX accepts and
+ BibTool doesn't, so I felt justified in rejecting them as
+ well. In general, this parser is a little stricter than
+ BibTeX, but a little looser than BibTool. YMMV.
+
+ Another source of inspiration is Nelson Beebe's bibclean, or
+ rather Beebe's article describing bibclean (from TUGboat
+ vol. 14 no. 4; also included with the bibclean distribution).
+
+ The product of the parser is an abstract syntax tree that can
+ be traversed to be printed in a simple form (see
+ print_entry() in bibparse.c) or perhaps transformed to a
+ format more convenient for higher-level languages (see my
+ Text::BibTeX Perl module for an example).
+
+ Whole files may be parsed by entering the parser at `bibfile';
+ in this case, the parser really returns a forest (list of
+ ASTs, one per entry). Alternately, you can enter the parser
+ at `entry', which reads and parses a single entry.
+@GLOBALS : the usual DLG and ANTLR cruft
+@CALLS :
+@CREATED : first attempt: May 1996, Greg Ward
+ second attempt (complete rewrite): July 25-28 1996, Greg Ward
+@MODIFIED : Sep 1996, GPW: changed to generate an AST rather than print
+ out each entry as it's encountered
+ Jan 1997, GPW: redid the above, because it was lost when
+ my !%&$#!@ computer was stolen
+ Jun 1997, GPW: greatly simplified the lexer, and added handling
+ of %-comments, @comment and @preamble entries,
+ and proper scanning of between-entry junk
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#header
+<<
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+>>
+
+/*
+ * The lexer has three modes -- START (between entries, hence it's what
+ * we're in initially), LEX_ENTRY (entered once we see an '@' at
+ * top-level), and LEX_STRING (for scanning quoted strings). Note that all
+ * the functions called from lexer actions can be found in lex_auxiliary.c.
+ *
+ * The START mode just looks for '@', discards comments and whitespace,
+ * counts lines, and keeps track of any other junk. The "keeping track"
+ * just consists of counting the number of junk characters, which is then
+ * reported at the next '@' sign. This will hopefully let users clean up
+ * "old style" implicit comments, and possibly catch some legitimate errors
+ * in their files (eg. a complete entry that's missing an '@').
+ */
+
+#token AT "\@" << at_sign (); >>
+#token "\n" << newline (); >>
+#token COMMENT "\%~[\n]*\n" << comment (); >>
+#token "[\ \r\t]+" << zzskip (); >>
+#token "~[\@\n\ \r\t]+"<< toplevel_junk (); >>
+
+#lexclass LEX_ENTRY
+
+/*
+ * The LEX_ENTRY mode is where most of the interesting stuff is -- these
+ * tokens control most of the syntax of BibTeX. First, we duplicate most
+ * of the START lexer, in order to handle newlines, comments, and
+ * whitespace.
+ *
+ * Next comes a "number", which is trivial. This is needed because a
+ * BibTeX simple value may be an unquoted digit string; it has to precede
+ * the definition of "name" tokens, because otherwise a digit string would
+ * be a legitimate "name", which would cause an ambiguity inside entries
+ * ("is this a macro or a number?")
+ *
+ * Then comes the regexp for a BibTeX "name", which is used for entry
+ * types, entry keys, field names, and macro names. This is basically the
+ * same as BibTeX's definition of such "names", with two differences. The
+ * key, fundamental difference is that I have defined names by inclusion
+ * rather than exclusion: this regex lists all characters allowed in a
+ * type/key/field name/macro name, rather than listing those characters not
+ * allowed (as the BibTeX documentation does). The trivial difference is
+ * that I have disallowed a few extra characters: @ \ ~. Allowing @ could
+ * cause confusing BibTeX syntax, and allowing \ or ~ can cause bogus TeX
+ * code: try putting "\cite{foo\bar}" in your LaTeX document and see what
+ * happens! I'm also rather skeptical about some of the more exotic
+ * punctuation characters being allowed, but since people have been using
+ * BibTeX's definition of "names" for a decade or so now, I guess we're
+ * stuck with it. I could always amend name() to warn about any exotic
+ * punctuation that offends me, but that should be an option -- and I don't
+ * have a mechanism for user selectable warnings yet, so it'll have to
+ * wait.
+ *
+ * Also note that defining "number" ahead of "name" precludes a string of
+ * digits from being a name. This is usually a good thing; we don't want
+ * to accept digit strings as article types or field names (BibTeX
+ * doesn't). However -- dubious as it may seem -- digit strings are
+ * legitimate entry keys, so we should accept them there. This is handled
+ * by the grammar; see the `contents' rule below.
+ *
+ * Finally, it should be noted that BibTeX does not seem to apply the same
+ * lexical rules to entry types, entry keys, and field names -- so perhaps
+ * doing so here is not such a great idea. One immediate manifestation of
+ * this is that my grammar in its unassisted state would accept a field
+ * name with leading digits; BibTeX doesn't accept this. I correct this
+ * with the check_field_name() function, called from the `field' rule in
+ * the grammar and defined in parse_auxiliary.c.
+ */
+#token "\n" << newline (); >>
+#token COMMENT "\%~[\n]*\n" << comment (); >>
+#token "[\ \r\t]+" << zzskip (); >>
+#token NUMBER "[0-9]+"
+#token NAME "[a-z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|]+"
+ << name (); >>
+
+/*
+ * Now come the (apparently) easy tokens, i.e. punctuation. There are a
+ * number of tricky bits here, though. First, '{' can have two very
+ * different meanings: at top-level, it's an entry delimiter, and inside an
+ * entry it's a string delimiter. This is handled (in lbrace()) by keeping
+ * track of the "entry state" (top-level, after '@', after type, in
+ * comment, or in entry) and using that to determine what to do on a '{'.
+ * If we're in an entry, lbrace() will switch to the string lexer by
+ * calling start_string(); if we're immediately after an entry type token
+ * (which is just a name following a top-level '@'), then we force the
+ * current token to ENTRY_OPEN, so that '{' and '(' appear identical to the
+ * parser. (This works because the scanner generated by DLG just happens
+ * to assign the token number first, and then executes the action.)
+ * Anywhere else (i.e. at top level or immediately after an '@'), we print a
+ * warning and leave the token as LBRACE, which will cause a syntax error
+ * (because LBRACE is not used anywhere in the grammar).
+ *
+ * '(' has some similarities to '{', but it's different enough that it
+ * has its own function. In particular, it may be an entry opener just
+ * like '{', but in one particular case it may be a string opener. That
+ * particular case is where it follows '@' and 'comment'; in that case,
+ * lparen() will call start_string() to enter the string lexer.
+ *
+ * The other delimiter characters are easier, but still warrant an
+ * explanation. '}' should only occur inside an entry, and if found there
+ * the token is forced to ENTRY_CLOSE; anywhere else, a warning is printed
+ * and the parser should find a syntax error. ')' should only occur inside
+ * an entry, and likewise will trigger a warning if seen elsewhere.
+ * (String-closing '}' and ')' are handled by the string lexer, below.)
+ *
+ * The other punctuation characters are trivial. Note that a double quote
+ * can start a string anywhere (except at top-level!), but if it occurs in
+ * a weird place a syntax error will eventually occur.
+ */
+#token LBRACE "\{" << lbrace (); >>
+#token RBRACE "\}" << rbrace (); >>
+#token ENTRY_OPEN "\(" << lparen (); >>
+#token ENTRY_CLOSE "\)" << rparen (); >>
+#token EQUALS "="
+#token HASH "\#"
+#token COMMA ","
+#token "\"" << start_string ('"'); >>
+
+
+#lexclass LEX_STRING
+
+/*
+ * Here's a reasonably decent attempt at lexing BibTeX strings. There are
+ * a couple of sneaky tricks going on here that aren't strictly necessary,
+ * but can make the user's life a lot easier.
+ *
+ * First, here's what a simple and straightforward BibTeX string lexer
+ * would do:
+ * - keep track of brace-depth by incrementing/decrementing a counter
+ * whenever it sees `{' or `}'
+ * - if the string was started with a `{' and it sees a `}' which
+ * brings the brace-depth to 0, end the string
+ * - if the string was started with a `"' and it sees another `"' at
+ * brace-depth 0, end the string
+ * - any other characters are left untouched and become part of the
+ * string
+ *
+ * (Note that the simple act of counting braces makes this lexer
+ * non-regular -- there's a bit more going on here than you might
+ * think from reading the regexps. So sue me.)
+ *
+ * The first, most obvious refinement to this is to check for newlines
+ * and other whitespace -- we should convert either one to a single
+ * space (to simplify future processing), as well as increment zzline on
+ * newline. Note that we don't do any collapsing of whitespace yet --
+ * newlines surrounded by spaces make that rather tricky to handle
+ * properly in the lexer (because newlines are handled separately, in
+ * order to increment zzline), so I put it off to a later stage. (That
+ * also gives us the flexibility to collapse whitespace or not,
+ * according to the user's whim.)
+ *
+ * A PCCTS lexer to handle these requirements would look something like this:
+ *
+ * #token "\n" << newline_in_string (); >>
+ * #token "[\r\t]" << zzreplchar (' '); zzmore (); >>
+ * #token "\{" << open_brace(); >>
+ * #token "\}" << close_brace(); >>
+ * #token "\"" << quote_in_string (); >>
+ * #token "~[\n\{\}\"]+" << zzmore (); >>
+ *
+ * where the functions called are the same as currently in lex_auxiliary.c.
+ *
+ * However, I've added some trickery here that lets us heuristically detect
+ * runaway strings. The heuristic is as follows: anytime we have a newline
+ * in a string, that's reason to suspect a runaway. We follow up on this
+ * suspicion by slurping everything that could reasonably be part of the
+ * string and still be in the same line (i.e., a string of anything except
+ * newline, braces, parentheses, double-quote, and backslash), and then
+ * calling check_runaway_string(). This function then "backs up" to the
+ * beginning of the slurped string (the newline), and scans ahead looking
+ * for one of two patterns: "@name[{(]", or "name=" (with optional
+ * whitespace between the "tokens"). (Actually, it first makes a pass over
+ * the string to convert all whitespace characters -- including the sole
+ * newline -- to spaces. So, it's effectively looking for "\ *\@\ *NAME\
+ * *[\{\(]" (DLG regexp syntax) or "\ *NAME\ *=", where
+ * NAME="[a-z][a-z0-9+/:'.-]*" -- that is, something that looks like the
+ * start of an entry or a new field, but in a string (where they almost
+ * certainly shouldn't occur). Of course, there are no explicit regexps
+ * there -- it's all coded as a little hand-crafted automaton in C.
+ *
+ * At any rate, if either one of these patterns is matched,
+ * check_runaway_string() prints a warning and sets a flag so that we don't
+ * print that warning -- or indeed, even scan for the suspect patterns --
+ * more than once for the current string. (Because chances are if it
+ * occurs once, it'll occur again and again and again.)
+ *
+ * There is also some trickery going on to deal with '@comment' entries.
+ * Syntactically, these are just AT NAME STRING, where NAME must be
+ * 'comment'. This means that an '@comment' entry has no delimiters, it
+ * just has a string. To make them look a bit more like the other kinds of
+ * entries (which are delimited with '{' ... '}' or '(' ... ')'), the STRING
+ * here is special: it's delimited either by braces or parentheses, rather
+ * than by the usual braces or double-quotes. Thus, we treat parentheses
+ * much like braces in this lexer, to handle the '@comment(...)' case. And
+ * there's an explicit check for the erroneous '@comment"..."' case in
+ * start_string(), just to be complete.
+ *
+ * So that explains all the regexps in this lexer: the first one (starting
+ * with newline) triggers the check for a runaway string. Then, we have a
+ * pattern to convert any single whitespace char (apart from newline) to a
+ * space; note that any whitespace chars that are matched in the
+ * newline-regexp will be converted by check_runaway_string(), and won't be
+ * matched by the whitespace regexp here. Then, we check for braces;
+ * open_brace() and close_brace() take care of counting brace-depth and
+ * determining if we have hit the end of the string. lparen_in_string()
+ * and rparen_in_string() do the same for parentheses, to handle
+ * '@comment(...)'. Then, if a double quote is seen, we call
+ * quote_in_string(); this takes care of ending strings quoted by double
+ * quotes. Finally, the "fall-through" regexp handles most strings (except
+ * for stuff that comes after a newline).
+ */
+#token "\n~[\n\{\}\(\)\"\\]*" << check_runaway_string (); >>
+#token "[\r\t]" << zzreplchar (' '); zzmore (); >>
+#token "\{" << open_brace (); >>
+#token "\}" << close_brace (); >>
+#token "\(" << lparen_in_string (); >>
+#token "\)" << rparen_in_string (); >>
+#token STRING "\"" << quote_in_string (); >>
+#token "~[\n\{\}\(\)\"]+" << zzmore (); >>
+
+#lexclass START
+
+
+/* At last, the grammar! After that lexer, this is a snap. */
+
+/*
+ * `bibfile' is the rule to recognize an entire BibTeX file. Note that I
+ * don't actually use this as the start rule myself; I have a function
+ * bt_parse_entry() (in input.c), which takes care of setting up the lexer
+ * and parser state in such a way that the parser can be entered multiple
+ * times (at the `entry' rule) on the same input stream. Then, the user
+ * calls bt_parse_entry() until end of file is reached, at which point it
+ * cleans up its mess. The `bibfile' rule should work, but I never
+ * actually use it, so it hasn't been tested in quite a while.
+ */
+bibfile! : << AST *last; #0 = NULL; >>
+ ( entry
+ << /* a little creative forestry... */
+ /*
+ * Chain the entries into a sibling list: the first entry parsed
+ * becomes the result tree, and every later entry is hooked onto
+ * the `right' pointer of the entry before it. `last' is declared
+ * without an initial value, but it is only dereferenced in the
+ * else-branch, which cannot run until the first pass through
+ * this action has assigned it.
+ */
+ if (#0 == NULL)
+ #0 = #1;
+ else
+ last->right = #1;
+ last = #1;
+ >>
+ )* ;
+
+/*
+ * `entry' is the rule that I actually use to enter the parser -- it parses
+ * a single entry from the input stream (that is, the lexer scans past
+ * junk until an '@' is seen at top-level, and that '@' becomes the AT
+ * token which starts an entry).
+ *
+ * `entry_metatype()' returns the value of a global variable maintained by
+ * lex_auxiliary.c that tells us how to parse the entry. This is needed
+ * because, while the different things that look like BibTeX entries
+ * (string definition, preamble, actual entry, etc.) have a similar lexical
+ * makeup, the syntax is different. In `entry', we just use the entry
+ * metatype to determine the nodetype field of the AST node for the entry;
+ * below, in `body' and `contents', we'll actually use it (in the form of
+ * semantic predicates) to select amongst the various syntax options.
+ */
+entry : << bt_metatype metatype; >>
+ AT! NAME^
+ <<
+ metatype = entry_metatype();
+ #1->nodetype = BTAST_ENTRY;
+ #1->metatype = metatype;
+ >>
+ body[metatype]
+ ;
+
+/*
+ * `body' is what comes after AT NAME: either a single string, delimited by
+ * {} or () (where NAME == 'comment'), or the more usual case of the entry
+ * contents, delimited by an entry 'opener' and 'closer' (either
+ * parentheses or braces).
+ */
+body [bt_metatype metatype]
+ : << metatype == BTE_COMMENT >>?
+ STRING << #1->nodetype = BTAST_STRING; >>
+ | ENTRY_OPEN! contents[metatype] ENTRY_CLOSE!
+ ;
+
+/*
+ * `contents' is where we select and accept the syntax for the guts of the
+ * entry, based on the type of entry that we're parsing. We find this
+ * out from the entry's metatype (obtained by the `entry' rule from
+ * entry_metatype()), which is passed in as `metatype'. General entries
+ * (ie. any unrecognized entry type) and `modify' entries have a name
+ * (the key), a comma, and
+ * list of "field = value" assignments. Macro definitions ('@string') are
+ * similar, but without the key-comma pair. Preambles have just a single
+ * value, and aliases have a single "field = value" assignment. (Note that
+ * '@modify' and '@alias' are BibTeX 1.0 additions -- I'll have to check
+ * the compatibility of my syntax with BibTeX 1.0 when it is released.)
+ * '@comment' entries are handled differently, by the `body' rule above.
+ */
+contents [bt_metatype metatype]
+ : << metatype == BTE_REGULAR /* || metatype == BTE_MODIFY */ >>?
+ ( NAME | NUMBER ) << #1->nodetype = BTAST_KEY; >>
+ COMMA!
+ fields
+ | << metatype == BTE_MACRODEF >>?
+ fields
+ | << metatype == BTE_PREAMBLE >>?
+ value
+// | << metatype == BTE_ALIAS >>?
+// field
+ ;
+
+/*
+ * `fields' is a comma-separated list of fields. Note that BibTeX has a
+ * little wart in that it allows a single extra comma after the last field
+ * only. This is easy enough to handle, we just have to do it in the
+ * traditional BNFish way (loop by recursion) rather than use EBNF
+ * trickery.
+ */
+fields : field { COMMA! fields }
+ | /* epsilon */
+ ;
+
+/* `field' recognizes a single "field = value" assignment. */
+field : NAME^
+ << #1->nodetype = BTAST_FIELD; check_field_name (#1); >>
+ EQUALS! value
+ <<
+#if DEBUG > 1
+ printf ("field: fieldname = %p (%s)\n"
+ " first val = %p (%s)\n",
+ #1->text, #1->text, #2->text, #2->text);
+#endif
+ >>
+ ;
+
+/* `value' is a sequence of simple_values, joined by the '#' operator. */
+value : simple_value ( HASH! simple_value )* ;
+
+/* `simple_value' is a single string, number, or macro invocation. */
+simple_value : STRING << #1->nodetype = BTAST_STRING; >>
+ | NUMBER << #1->nodetype = BTAST_NUMBER; >>
+ | NAME << #1->nodetype = BTAST_MACRO; >>
+ ;
diff --git a/btparse/src/bibtex_ast.c b/btparse/src/bibtex_ast.c
new file mode 100644
index 0000000..932e828
--- /dev/null
+++ b/btparse/src/bibtex_ast.c
@@ -0,0 +1,63 @@
+/* ------------------------------------------------------------------------
+@NAME : bibtex_ast.c
+@DESCRIPTION: Data and functions for internal display/manipulation of AST
+ nodes. (Stuff for external consumption, and for processing
+ whole trees, is to be found in traversal.c.)
+@GLOBALS :
+@CREATED : 1997/08/12, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include "btparse.h"
+#include "prototypes.h"
+#include "my_dmalloc.h"
+
+
+/*
+ * Printable names for AST node types, used by dump() below. The entries
+ * are listed in the same order as the members of the bt_nodetype enum in
+ * btparse.h (BTAST_BOGUS through BTAST_MACRO), because dump() indexes
+ * this table directly with a node's `nodetype' field. Keep the two in
+ * step when adding a node type.
+ */
+char *nodetype_names[] =
+{
+ "bogus", "entry", "key", "field", "string", "number", "macro"
+};
+
+
+/*
+ * dump() -- print the (sub)tree starting at `root' to stdout for debugging.
+ *
+ * Each node is printed on its own line as "[nodetype]: (text)", indented
+ * two columns per level of `depth'. The node's children (its `down'
+ * list) are printed one level deeper, then its siblings (its `right'
+ * list) at the same level.
+ *
+ * root  - first node of the sibling list to print; if NULL, "[empty]" is
+ *         printed instead
+ * depth - nesting level of `root', used only for indentation
+ */
+static void dump (AST *root, int depth)
+{
+   AST *cur;
+   int  type;
+   int  num_names;
+
+   if (root == NULL)
+   {
+      printf ("[empty]\n");
+      return;
+   }
+
+   num_names = (int) (sizeof (nodetype_names) / sizeof (nodetype_names[0]));
+
+   for (cur = root; cur != NULL; cur = cur->right)
+   {
+      /*
+       * This routine exists to inspect trees that may be broken, so don't
+       * trust `nodetype' to be a valid index into nodetype_names.
+       */
+      type = (int) cur->nodetype;
+      printf ("%*s[%s]: ", 2*depth, "",
+              (type >= 0 && type < num_names) ? nodetype_names[type]
+                                              : "invalid");
+      if (cur->text != NULL)
+         printf ("(%s)\n", cur->text);
+      else
+         printf ("(null)\n");
+
+      /* only recurse on real children -- dump (NULL) would print "[empty]" */
+      if (cur->down != NULL)
+         dump (cur->down, depth+1);
+   }
+}
+
+
+/*
+ * dump_ast() -- debugging aid: write the optional heading `msg' (exactly
+ * as given, no newline added) to stdout, then the whole tree under `root'
+ * via dump(), then a blank line.
+ */
+void dump_ast (char *msg, AST *root)
+{
+   if (msg)
+      fputs (msg, stdout);
+
+   dump (root, 0);
+   putchar ('\n');
+}
diff --git a/btparse/src/bibtex_ast.h b/btparse/src/bibtex_ast.h
new file mode 100644
index 0000000..fc46079
--- /dev/null
+++ b/btparse/src/bibtex_ast.h
@@ -0,0 +1,21 @@
+#ifndef BIBTEX_AST_H
+#define BIBTEX_AST_H
+
+typedef enum
+{
+ AST_BOGUS, /* to detect uninitialized nodes */
+ AST_ECOMMENT, AST_EPREAMBLE, AST_EMACRODEF, AST_EALIAS, AST_EMODIFY,
+ AST_ENTRY, AST_KEY, AST_FIELD, AST_STRING, AST_NUMBER, AST_MACRO
+} nodetype_t;
+
+#define AST_FIELDS int line, offset; nodetype_t nodetype; char *text;
+#define zzcr_ast(ast,attr,tok,txt) \
+{ \
+ (ast)->line = (attr)->line; \
+ (ast)->offset = (attr)->offset; \
+ (ast)->text = strdup ((attr)->text); \
+}
+#define zzd_ast(ast) \
+ if ((ast)->text != NULL) free ((ast)->text);
+
+#endif /* BIBTEX_AST_H */
diff --git a/btparse/src/bt_config.h.in b/btparse/src/bt_config.h.in
new file mode 100644
index 0000000..8d5dcab
--- /dev/null
+++ b/btparse/src/bt_config.h.in
@@ -0,0 +1,115 @@
+/* src/bt_config.h.in. Generated from configure.ac by autoheader. */
+
+/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+ systems. This function is required for `alloca.c' support on those systems.
+ */
+#undef CRAY_STACKSEG_END
+
+/* Define to 1 if using `alloca.c'. */
+#undef C_ALLOCA
+
+/* Define to 1 if you have `alloca', as a function or macro. */
+#undef HAVE_ALLOCA
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+ */
+#[% ALLOCA_H %]
+
+#undef HAVE_STRLCAT
+/* Have strlcat? */
+#[% STRLCAT %]
+
+
+
+/* Define to 1 if the system has the type `boolean'. */
+#undef HAVE_BOOLEAN
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */
+#undef HAVE_DOPRNT
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define to 1 if you have the <limits.h> header file. */
+#undef HAVE_LIMITS_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the `strdup' function. */
+#undef HAVE_STRDUP
+
+/* set if strdup is declared in <string.h> */
+#undef HAVE_STRDUP_DECL
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the `strlwr' function. */
+#undef HAVE_STRLWR
+
+/* Define to 1 if you have the `strupr' function. */
+#undef HAVE_STRUPR
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to 1 if you have the `vprintf' function. */
+#undef HAVE_VPRINTF
+
+/* Define to 1 if you have the `vsnprintf' function. */
+#[% VSNPRINTF %]
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+ */
+#undef LT_OBJDIR
+
+/* Name of package */
+#define PACKAGE [% PACKAGE %]
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME [% PACKAGE %]
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING [% PACKAGE %]
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION [% FPACKAGE %]
+
+/* If using the C implementation of alloca, define if you know the
+ direction of stack growth for your system; otherwise it will be
+ automatically deduced at runtime.
+ STACK_DIRECTION > 0 => grows toward higher addresses
+ STACK_DIRECTION < 0 => grows toward lower addresses
+ STACK_DIRECTION = 0 => direction of growth unknown */
+#undef STACK_DIRECTION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Version number of package */
+#define VERSION [% VERSION %]
+
+/* Define if using the dmalloc debugging malloc package */
+#undef WITH_DMALLOC
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
diff --git a/btparse/src/bt_debug.h b/btparse/src/bt_debug.h
new file mode 100644
index 0000000..44156b2
--- /dev/null
+++ b/btparse/src/bt_debug.h
@@ -0,0 +1,38 @@
+/* ------------------------------------------------------------------------
+@NAME : bt_debug.h
+@DESCRIPTION: Defines various macros needed for compile-time selection
+ of debugging code.
+@GLOBALS :
+@CREATED :
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef BT_DEBUG_H
+#define BT_DEBUG_H
+
+/*
+ * DEBUG is the debug level -- an integer, defaults to 0
+ * DBG_ACTION is a macro to conditionally execute a bit of code --
+ * must have compiled with DEBUG true, and the debug level
+ * must be >= `level' (the macro argument)
+ */
+
+#ifndef DEBUG
+# define DEBUG 0
+#endif
+
+#if DEBUG
+# define DBG_ACTION(level,action) if (DEBUG >= level) { action; }
+#else
+# define DBG_ACTION(level,action)
+#endif
+
+#endif /* BT_DEBUG_H */
diff --git a/btparse/src/btparse.h b/btparse/src/btparse.h
new file mode 100644
index 0000000..e7ae3a9
--- /dev/null
+++ b/btparse/src/btparse.h
@@ -0,0 +1,341 @@
+/* src/btparse.h. Generated from btparse.h.in by configure. */
+/* ------------------------------------------------------------------------
+@NAME : btparse.h
+@DESCRIPTION: Declarations and types for users of the btparse library.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/19, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#ifndef BTPARSE_H
+#define BTPARSE_H
+
+#include "bt_config.h"
+#include <sys/types.h>
+#include <stdio.h>
+
+
+typedef unsigned short btshort;
+typedef int boolean;
+
+
+#ifndef TRUE
+# define TRUE 1
+# define FALSE 0
+#endif
+
+/* Parsing (and post-processing) options */
+
+#define BTO_CONVERT 1 /* convert numbers to strings? */
+#define BTO_EXPAND 2 /* expand macros? */
+#define BTO_PASTE 4 /* paste substrings together? */
+#define BTO_COLLAPSE 8 /* collapse whitespace? */
+
+#define BTO_NOSTORE 16
+
+#define BTO_FULL (BTO_CONVERT | BTO_EXPAND | BTO_PASTE | BTO_COLLAPSE)
+#define BTO_MACRO (BTO_CONVERT | BTO_EXPAND | BTO_PASTE)
+#define BTO_MINIMAL 0
+
+#define BTO_STRINGMASK (BTO_CONVERT | BTO_EXPAND | BTO_PASTE | BTO_COLLAPSE)
+
+#define BT_VALID_NAMEPARTS "fvlj"
+#define BT_MAX_NAMEPARTS 4
+
+typedef enum
+{
+ BTE_UNKNOWN,
+ BTE_REGULAR,
+ BTE_COMMENT,
+ BTE_PREAMBLE,
+ BTE_MACRODEF
+/*
+ BTE_ALIAS,
+ BTE_MODIFY
+*/
+} bt_metatype;
+
+#define NUM_METATYPES ((int) BTE_MACRODEF + 1)
+
+typedef enum
+{
+ BTAST_BOGUS, /* to detect uninitialized nodes */
+ BTAST_ENTRY,
+ BTAST_KEY,
+ BTAST_FIELD,
+ BTAST_STRING,
+ BTAST_NUMBER,
+ BTAST_MACRO
+} bt_nodetype;
+
+typedef enum
+{
+ BTN_FIRST, BTN_VON, BTN_LAST, BTN_JR, BTN_NONE
+} bt_namepart;
+
+typedef enum
+{
+ BTJ_MAYTIE, /* "discretionary" tie between words */
+ BTJ_SPACE, /* force a space between words */
+ BTJ_FORCETIE, /* force a tie (~ in TeX) */
+ BTJ_NOTHING /* nothing between words */
+} bt_joinmethod;
+
+
+#define USER_DEFINED_AST 1
+
+#define zzcr_ast(ast,attr,tok,txt) \
+{ \
+ (ast)->filename = InputFilename; \
+ (ast)->line = (attr)->line; \
+ (ast)->offset = (attr)->offset; \
+ (ast)->text = strdup ((attr)->text); \
+}
+
+#define zzd_ast(ast) \
+/* printf ("zzd_ast: free'ing ast node with string %p (%s)\n", \
+ (ast)->text, (ast)->text); */ \
+ if ((ast)->text != NULL) free ((ast)->text);
+
+
+#ifdef USER_DEFINED_AST
+typedef struct _ast
+{
+ struct _ast *right, *down;
+ char * filename;
+ int line;
+ int offset;
+ bt_nodetype nodetype;
+ bt_metatype metatype;
+ char * text;
+} AST;
+#endif /* USER_DEFINED_AST */
+
+
+typedef struct
+{
+ /*
+ * `string' is the string that has been split; items[0] ...
+ * items[num_items-1] are pointers into `string', or NULL for empty
+ * substrings. Note that `string' is actually a copy of the string
+ * passed in to bt_split_list() with NULs inserted between substrings.
+ */
+
+ char * string;
+ int num_items;
+ char ** items;
+} bt_stringlist;
+
+
+typedef struct
+{
+ bt_stringlist * tokens; /* flat list of all tokens in name */
+ char ** parts[BT_MAX_NAMEPARTS]; /* each elt. is list of pointers */
+ /* into `tokens->string' */
+ int part_len[BT_MAX_NAMEPARTS]; /* length in tokens */
+} bt_name;
+
+
+typedef struct tex_tree_s
+{
+ char * start;
+ int len;
+ struct tex_tree_s
+ * child,
+ * next;
+} bt_tex_tree;
+
+
+typedef struct
+{
+ /* These determine the order (and presence) of parts in the name. */
+ int num_parts;
+ bt_namepart parts[BT_MAX_NAMEPARTS];
+
+ /*
+ * These lists are always in the order of the bt_namepart enum -- *not*
+ * dependent on the particular order of parts the user specified! (This
+ * will make it a bit harder if I ever allow more than one occurrence of
+ * a part in a format; since I don't allow that, I'm not [yet] worried
+ * about it!)
+ */
+ char * pre_part[BT_MAX_NAMEPARTS];
+ char * post_part[BT_MAX_NAMEPARTS];
+ char * pre_token[BT_MAX_NAMEPARTS];
+ char * post_token[BT_MAX_NAMEPARTS];
+ boolean abbrev[BT_MAX_NAMEPARTS];
+ bt_joinmethod join_tokens[BT_MAX_NAMEPARTS];
+ bt_joinmethod join_part[BT_MAX_NAMEPARTS];
+} bt_name_format;
+
+
+typedef enum
+{
+ BTERR_NOTIFY, /* notification about next action */
+ BTERR_CONTENT, /* warning about the content of a record */
+ BTERR_LEXWARN, /* warning in lexical analysis */
+ BTERR_USAGEWARN, /* warning about library usage */
+ BTERR_LEXERR, /* error in lexical analysis */
+ BTERR_SYNTAX, /* error in parser */
+ BTERR_USAGEERR, /* fatal error in library usage */
+ BTERR_INTERNAL /* my fault */
+} bt_errclass;
+
+typedef enum
+{
+ BTACT_NONE, /* do nothing on error */
+ BTACT_CRASH, /* call exit(1) */
+ BTACT_ABORT /* call abort() */
+} bt_erraction;
+
+typedef struct
+{
+ bt_errclass class;
+ char * filename;
+ int line;
+ char * item_desc;
+ int item;
+ char * message;
+} bt_error;
+
+typedef void (*bt_err_handler) (bt_error *);
+
+
+#if defined(__cplusplus__) || defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/* Function prototypes */
+
+/*
+ * First, we might need a prototype for strdup() (because the zzcr_ast
+ * macro uses it, and that macro is used in pccts/ast.c -- which I don't
+ * want to modify if I can help it, because it's someone else's code).
+ * This is to accommodate AIX, where including <string.h> apparently doesn't
+ * declare strdup() (reported by Reiner Schlotte
+ * <schlotte@geo.palmod.uni-bremen.de>), and compiling bibtex.c (which
+ * includes pccts/ast.c) crashes because of this (yes, yes, I know it
+ * should just be a warning -- I don't know what's going on there!).
+ *
+ * Unfortunately, this duplicates code in bt_config.h -- I can't include
+ * bt_config.h here, because this header must be freestanding; I don't want
+ * to include bt_config.h in pccts/ast.c, because I don't want to touch the
+ * PCCTS code if I can help it; but I don't want every source file that
+ * uses strdup() to have to include btparse.h. Hence the duplication.
+ * Yuck.
+ */
+#define HAVE_STRDUP_DECL 1
+#if !HAVE_STRDUP_DECL
+extern char *strdup (const char *s);
+#endif
+
+
+/* init.c */
+void bt_initialize (void);
+void bt_free_ast (AST *ast);
+void bt_cleanup (void);
+
+/* input.c */
+void bt_set_stringopts (bt_metatype metatype, btshort options);
+AST * bt_parse_entry_s (char * entry_text,
+ char * filename,
+ int line,
+ btshort options,
+ boolean * status);
+AST * bt_parse_entry (FILE * infile,
+ char * filename,
+ btshort options,
+ boolean * status);
+AST * bt_parse_file (char * filename,
+ btshort options,
+ boolean * overall_status);
+
+/* post_parse.c */
+void bt_postprocess_string (char * s, btshort options);
+char * bt_postprocess_value (AST * value, btshort options, boolean replace);
+char * bt_postprocess_field (AST * field, btshort options, boolean replace);
+void bt_postprocess_entry (AST * entry, btshort options);
+
+/* error.c */
+void bt_reset_error_counts (void);
+int bt_get_error_count (bt_errclass errclass);
+int * bt_get_error_counts (int *counts);
+btshort bt_error_status (int *saved_counts);
+
+/* macros.c */
+void bt_add_macro_value (AST *assignment, btshort options);
+void bt_add_macro_text (char * macro, char * text, char * filename, int line);
+void bt_delete_macro (char * macro);
+void bt_delete_all_macros (void);
+int bt_macro_length (char *macro);
+char * bt_macro_text (char * macro, char * filename, int line);
+
+/* traversal.c */
+AST *bt_next_entry (AST *entry_list, AST *prev_entry);
+bt_metatype bt_entry_metatype (AST *entry);
+char *bt_entry_type (AST *entry);
+char *bt_entry_key (AST *entry);
+AST *bt_next_field (AST *entry, AST *prev, char **name);
+AST *bt_next_macro (AST *entry, AST *prev, char **name);
+AST *bt_next_value (AST *head,
+ AST *prev,
+ bt_nodetype *nodetype,
+ char **text);
+char *bt_get_text (AST *node);
+
+/* modify.c */
+void bt_set_text (AST * node, char * new_text);
+void bt_entry_set_key (AST * entry, char * new_key);
+
+/* names.c */
+bt_stringlist * bt_split_list (char * string,
+ char * delim,
+ char * filename,
+ int line,
+ char * description);
+void bt_free_list (bt_stringlist *list);
+bt_name * bt_split_name (char * name,
+ char * filename,
+ int line,
+ int name_num);
+void bt_free_name (bt_name * name);
+
+/* tex_tree.c */
+bt_tex_tree * bt_build_tex_tree (char * string);
+void bt_free_tex_tree (bt_tex_tree **top);
+void bt_dump_tex_tree (bt_tex_tree *node, int depth, FILE *stream);
+char * bt_flatten_tex_tree (bt_tex_tree *top);
+
+/* string_util.c */
+void bt_purify_string (char * string, btshort options);
+void bt_change_case (char transform, char * string, btshort options);
+
+/* format_name.c */
+bt_name_format * bt_create_name_format (char * parts, boolean abbrev_first);
+void bt_free_name_format (bt_name_format * format);
+void bt_set_format_text (bt_name_format * format,
+ bt_namepart part,
+ char * pre_part,
+ char * post_part,
+ char * pre_token,
+ char * post_token);
+void bt_set_format_options (bt_name_format * format,
+ bt_namepart part,
+ boolean abbrev,
+ bt_joinmethod join_tokens,
+ bt_joinmethod join_part);
+char * bt_format_name (bt_name * name, bt_name_format * format);
+
+#if defined(__cplusplus__) || defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* BTPARSE_H */
diff --git a/btparse/src/err.c b/btparse/src/err.c
new file mode 100644
index 0000000..2fcf302
--- /dev/null
+++ b/btparse/src/err.c
@@ -0,0 +1,75 @@
+/*
+ * A n t l r S e t s / E r r o r F i l e H e a d e r
+ *
+ * Generated from: bibtex.g
+ *
+ * Terence Parr, Russell Quong, Will Cohen, and Hank Dietz: 1989-1995
+ * Parr Research Corporation
+ * with Purdue University Electrical Engineering
+ * With AHPCRC, University of Minnesota
+ * ANTLR Version 1.33
+ */
+
+#include <stdio.h>
+#define ANTLR_VERSION 133
+
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+#define zzSET_SIZE 4
+#include "../pccts/antlr.h"
+#include "../pccts/ast.h"
+#include "tokens.h"
+#include "../pccts/dlgdef.h"
+#include "../pccts/err.h"
+
+ANTLRChar *zztokens[27]={
+ /* 00 */ "Invalid",
+ /* 01 */ "@",
+ /* 02 */ "AT",
+ /* 03 */ "\\n",
+ /* 04 */ "COMMENT",
+ /* 05 */ "[\\ \\r\\t]+",
+ /* 06 */ "~[\\@\\n\\ \\r\\t]+",
+ /* 07 */ "\\n",
+ /* 08 */ "[\\ \\r\\t]+",
+ /* 09 */ "NUMBER",
+ /* 10 */ "NAME",
+ /* 11 */ "LBRACE",
+ /* 12 */ "RBRACE",
+ /* 13 */ "ENTRY_OPEN",
+ /* 14 */ "ENTRY_CLOSE",
+ /* 15 */ "EQUALS",
+ /* 16 */ "HASH",
+ /* 17 */ "COMMA",
+ /* 18 */ "\"",
+ /* 19 */ "\\n~[\\n\\{\\}\\(\\)\"\\]*",
+ /* 20 */ "[\\r\\t]",
+ /* 21 */ "\\{",
+ /* 22 */ "\\}",
+ /* 23 */ "\\(",
+ /* 24 */ "\\)",
+ /* 25 */ "STRING",
+ /* 26 */ "~[\\n\\{\\}\\(\\)\"]+"
+};
+SetWordType zzerr1[4] = {0x0,0x20,0x0,0x2};
+SetWordType zzerr2[4] = {0x0,0x6,0x0,0x0};
+SetWordType zzerr3[4] = {0x0,0x46,0x0,0x2};
+SetWordType zzerr4[4] = {0x0,0x44,0x0,0x0};
+SetWordType setwd1[27] = {0x0,0x7,0x6,0x0,0x0,0x0,0x0,
+ 0x0,0x0,0x28,0x38,0x0,0x0,0x0,0xd0,
+ 0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,
+ 0x0,0x0,0x20,0x0};
+SetWordType zzerr5[4] = {0x0,0x6,0x0,0x2};
+SetWordType setwd2[27] = {0x0,0x0,0x0,0x0,0x0,0x0,0x0,
+ 0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7,
+ 0x0,0x4,0x7,0x0,0x0,0x0,0x0,0x0,
+ 0x0,0x0,0x0,0x0};
diff --git a/btparse/src/error.c b/btparse/src/error.c
new file mode 100644
index 0000000..6077e1f
--- /dev/null
+++ b/btparse/src/error.c
@@ -0,0 +1,350 @@
+/* ------------------------------------------------------------------------
+@NAME : error.c
+@DESCRIPTION: Anything relating to reporting or recording errors and
+ warnings.
+@GLOBALS : errclass_names
+ err_actions
+ err_handlers
+ errclass_counts
+ error_buf
+@CALLS :
+@CREATED : 1996/08/28, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include "btparse.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+
+#define NUM_ERRCLASSES ((int) BTERR_INTERNAL + 1)
+
+
+static char *errclass_names[NUM_ERRCLASSES] =
+{
+ NULL, /* BTERR_NOTIFY */
+ "warning", /* BTERR_CONTENT */
+ "warning", /* BTERR_LEXWARN */
+ "warning", /* BTERR_USAGEWARN */
+ "error", /* BTERR_LEXERR */
+ "syntax error", /* BTERR_SYNTAX */
+ "fatal error", /* BTERR_USAGEERR */
+ "internal error" /* BTERR_INTERNAL */
+};
+
+static bt_erraction err_actions[NUM_ERRCLASSES] =
+{
+ BTACT_NONE, /* BTERR_NOTIFY */
+ BTACT_NONE, /* BTERR_CONTENT */
+ BTACT_NONE, /* BTERR_LEXWARN */
+ BTACT_NONE, /* BTERR_USAGEWARN */
+ BTACT_NONE, /* BTERR_LEXERR */
+ BTACT_NONE, /* BTERR_SYNTAX */
+ BTACT_CRASH, /* BTERR_USAGEERR */
+ BTACT_ABORT /* BTERR_INTERNAL */
+};
+
+void print_error (bt_error *err);
+
+static bt_err_handler err_handlers[NUM_ERRCLASSES] =
+{
+ print_error,
+ print_error,
+ print_error,
+ print_error,
+ print_error,
+ print_error,
+ print_error,
+ print_error
+};
+
+static int errclass_counts[NUM_ERRCLASSES] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static char error_buf[MAX_ERROR+1];
+
+
+/* ----------------------------------------------------------------------
+ * Error-handling functions.
+ */
+
+/*
+ * print_error() -- the default handler installed for every error class.
+ *
+ * Writes one line to stderr of the form
+ *
+ *    filename, line N, item_desc N, class-name: message
+ *
+ * where each part before the colon is optional: the filename is omitted
+ * if NULL, the line number if not positive, the item if it has no
+ * description or a non-positive number, and the class name if the class
+ * has none in errclass_names. If every part is omitted, only the message
+ * is printed (with no leading ": ").
+ */
+void print_error (bt_error *err)
+{
+   boolean need_sep = FALSE;            /* has any prefix part been written? */
+   char *  class_name;
+
+   if (err->filename)
+   {
+      fputs (err->filename, stderr);
+      need_sep = TRUE;
+   }
+
+   if (err->line > 0)
+   {
+      if (need_sep) fputs (", ", stderr);
+      fprintf (stderr, "line %d", err->line);
+      need_sep = TRUE;
+   }
+
+   if (err->item_desc && err->item > 0)
+   {
+      if (need_sep) fputs (", ", stderr);
+      fprintf (stderr, "%s %d", err->item_desc, err->item);
+      need_sep = TRUE;
+   }
+
+   class_name = errclass_names[(int) err->class];
+   if (class_name)
+   {
+      if (need_sep) fputs (", ", stderr);
+      fputs (class_name, stderr);
+      need_sep = TRUE;
+   }
+
+   if (need_sep)
+      fputs (": ", stderr);
+
+   fprintf (stderr, "%s\n", err->message);
+   fflush (stderr);
+
+} /* print_error() */
+
+
+
+/* ----------------------------------------------------------------------
+ * Error-reporting functions: these are called anywhere in the library
+ * when we encounter an error.
+ */
+
+/*
+ * report_error() -- common back end for all the error-reporting functions.
+ *
+ * Bumps the counter for `class', formats the message described by
+ * `fmt'/`arglist' into the static buffer error_buf, packages everything
+ * into a bt_error, passes that to the handler registered for the class in
+ * err_handlers (if any), and finally carries out the action configured for
+ * the class in err_actions: return to the caller, exit (1), or abort ().
+ *
+ * class      - error class; used to index the counter, handler, and
+ *              action tables
+ * filename,
+ * line       - source location to report (NULL and -1 when unknown)
+ * item_desc,
+ * item       - description and number of the item within the entry
+ *              (NULL and -1 when not applicable)
+ * fmt,
+ * arglist    - printf-style format and its already-started va_list
+ *
+ * Note that the handler's err->message points into error_buf, which is
+ * overwritten by the next call.
+ */
+void
+report_error (bt_errclass class,
+ char * filename,
+ int line,
+ char * item_desc,
+ int item,
+ char * fmt,
+ va_list arglist)
+{
+ bt_error err;
+#if !HAVE_VSNPRINTF
+ int msg_len;
+#endif
+
+ err.class = class;
+ err.filename = filename;
+ err.line = line;
+ err.item_desc = item_desc;
+ err.item = item;
+
+ /* counted whether or not a handler is installed for this class */
+ errclass_counts[(int) class]++;
+
+
+ /*
+ * Blech -- we're writing to a static buffer because there's no easy
+ * way to know how long the error message is going to be. (Short of
+ * reimplementing printf(), or maybe printf()'ing to a dummy file
+ * and using the return value -- ugh!) The GNU C library conveniently
+ * supplies vsnprintf(), which neatly solves this problem by truncating
+ * the output string if it gets too long. (I could check for this
+ * truncation if I wanted to, but I don't think it's necessary given the
+ * ample size of the message buffer.) For non-GNU systems, though,
+ * we're stuck with using vsprintf()'s return value. This can't be
+ * trusted on all systems -- thus there's a check for it in configure.
+ * Also, this won't necessarily trigger the internal_error() if we
+ * do overflow; it's conceivable that vsprintf() itself would crash.
+ * At least doing it this way we avoid the possibility of vsprintf()
+ * silently corrupting some memory, and crashing unpredictably at some
+ * later point.
+ */
+
+#if HAVE_VSNPRINTF
+ vsnprintf (error_buf, MAX_ERROR, fmt, arglist);
+#else
+ msg_len = vsprintf (error_buf, fmt, arglist); // protected by cpp
+ if (msg_len > MAX_ERROR)
+ internal_error ("static error message buffer overflowed");
+#endif
+
+ err.message = error_buf;
+ /* a NULL entry in err_handlers means "report nothing" for this class */
+ if (err_handlers[class])
+ (*err_handlers[class]) (&err);
+
+ switch (err_actions[class])
+ {
+ case BTACT_NONE: return;
+ case BTACT_CRASH: exit (1);
+ case BTACT_ABORT: abort ();
+ /* only reachable if the err_actions table holds a bad value */
+ default: internal_error ("invalid error action %d for class %d (%s)",
+ (int) err_actions[class],
+ (int) class, errclass_names[class]);
+ }
+
+} /* report_error() */
+
+
+/*
+ * The reporting functions used throughout the library. Each GEN_ERRFUNC
+ * invocation (the macro is defined in error.h) expands to a `void'
+ * function with the given name and parameter list, whose body starts a
+ * va_list on `fmt' and hands it to report_error() along with the class,
+ * filename, line, item_desc and item expressions listed on the last line
+ * of the invocation.
+ *
+ * Functions with no source location or item to report pass NULL and -1
+ * for them; the default handler, print_error(), leaves such parts out of
+ * its output. With the default err_actions table, usage_error() ends in
+ * exit (1) and internal_error() in abort ().
+ */
+GEN_ERRFUNC (general_error,
+ (bt_errclass class,
+ char * filename,
+ int line,
+ char * item_desc,
+ int item,
+ char * fmt,
+ ...),
+ class, filename, line, item_desc, item, fmt)
+
+GEN_ERRFUNC (error,
+ (bt_errclass class,
+ char * filename,
+ int line,
+ char * fmt,
+ ...),
+ class, filename, line, NULL, -1, fmt)
+
+/* takes the filename and line number from the AST node itself */
+GEN_ERRFUNC (ast_error,
+ (bt_errclass class,
+ AST * ast,
+ char * fmt,
+ ...),
+ class, ast->filename, ast->line, NULL, -1, fmt)
+
+GEN_ERRFUNC (notify,
+ (char * fmt, ...),
+ BTERR_NOTIFY, NULL, -1, NULL, -1, fmt)
+
+GEN_ERRFUNC (usage_warning,
+ (char * fmt, ...),
+ BTERR_USAGEWARN, NULL, -1, NULL, -1, fmt)
+
+GEN_ERRFUNC (usage_error,
+ (char * fmt, ...),
+ BTERR_USAGEERR, NULL, -1, NULL, -1, fmt)
+
+GEN_ERRFUNC (internal_error,
+ (char * fmt, ...),
+ BTERR_INTERNAL, NULL, -1, NULL, -1, fmt)
+
+
+/* ======================================================================
+ * Functions to be used outside of the library
+ */
+
+/* ------------------------------------------------------------------------
+@NAME : bt_reset_error_counts()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Sets the error counter of every error class back to zero.
+@GLOBALS : errclass_counts
+@CALLS :
+@CREATED : 1997/01/08, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void bt_reset_error_counts (void)
+{
+   /* errclass_counts is a true array here, so sizeof covers all classes */
+   memset (errclass_counts, 0, sizeof (errclass_counts));
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_get_error_count()
+@INPUT : errclass - the error class whose count is wanted
+@OUTPUT :
+@RETURNS : number of errors counted for `errclass' since the counters
+ were last reset; 0 if `errclass' is not a valid bt_errclass
+ value
+@DESCRIPTION: Returns number of errors seen in the specified class.
+@GLOBALS : errclass_counts
+@CALLS :
+@CREATED :
+@MODIFIED :
+-------------------------------------------------------------------------- */
+int bt_get_error_count (bt_errclass errclass)
+{
+   int idx = (int) errclass;
+
+   /* callers are outside the library: don't index the table blindly */
+   if (idx < 0 || idx >= NUM_ERRCLASSES)
+      return 0;
+
+   return errclass_counts[idx];
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_get_error_counts()
+@INPUT : counts - pointer to an array big enough to hold all the counts
+ (one int per member of the bt_errclass enum); if NULL,
+ the array will be allocated for you (and you must
+ free() it when done with it)
+@OUTPUT : counts - filled in with the current error counts
+@RETURNS : counts - either the passed-in pointer, or the newly-
+ allocated array if you pass in NULL; NULL if that
+ allocation fails
+@DESCRIPTION: Copies the number of errors seen so far in each error class
+ into an array indexed by the members of the bt_errclass
+ enum.
+@GLOBALS : errclass_counts
+@CALLS :
+@CREATED : 1997/01/06, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+int *bt_get_error_counts (int *counts)
+{
+   if (counts == NULL)
+   {
+      counts = (int *) malloc (sizeof (int) * NUM_ERRCLASSES);
+      if (counts == NULL)               /* out of memory: nowhere to copy to */
+         return NULL;
+   }
+
+   memcpy (counts, errclass_counts, sizeof (int) * NUM_ERRCLASSES);
+
+   return counts;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_error_status
+@INPUT : saved_counts - an array of error counts as returned by
+ bt_get_error_counts, or NULL not to compare
+ to a previous checkpoint
+@OUTPUT :
+@RETURNS : bitmap with one bit per error class (bit number = value of
+ the class in the bt_errclass enum)
+@DESCRIPTION: Computes a bitmap where a bit is set for each error class
+ that has more errors now than it had in saved_counts (or, if
+ saved_counts is NULL, the bit is set if there have been any
+ errors at all in the corresponding error class).
+
+ Eg. "x & (1<<BTERR_SYNTAX)" (where x is returned by
+ bt_error_status) is true if there have been any syntax errors.
+@GLOBALS : errclass_counts
+@CALLS :
+@CREATED :
+@MODIFIED :
+-------------------------------------------------------------------------- */
+btshort bt_error_status (int *saved_counts)
+{
+   btshort status = 0;
+   int     errclass;
+   int     baseline;
+
+   for (errclass = 0; errclass < NUM_ERRCLASSES; errclass++)
+   {
+      /* with no checkpoint, any error at all counts as "new" */
+      baseline = (saved_counts != NULL) ? saved_counts[errclass] : 0;
+
+      if (errclass_counts[errclass] > baseline)
+         status |= (btshort) (1 << errclass);
+   }
+
+   return status;
+} /* bt_error_status () */
diff --git a/btparse/src/error.h b/btparse/src/error.h
new file mode 100644
index 0000000..7f47121
--- /dev/null
+++ b/btparse/src/error.h
@@ -0,0 +1,65 @@
+/* ------------------------------------------------------------------------
+@NAME : error.h
+@DESCRIPTION: Prototypes for the error-generating functions (i.e. functions
+ defined in error.c, and meant only for use elsewhere in the
+ library).
+@CREATED : Summer 1996, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef ERROR_H
+#define ERROR_H
+
+#include <stdarg.h>
+#include "btparse.h" /* for AST typedef */
+
+/* NOTE(review): presumably the size of the message buffer used when
+ * formatting errors in error.c -- not used in this header; confirm. */
+#define MAX_ERROR 1024
+
+/*
+ * Body shared by all the variadic error functions: gathers the arguments
+ * that follow `format' into a va_list and hands everything to
+ * report_error().  `format' must name the last fixed parameter of the
+ * enclosing function, since it is what va_start() is given.
+ */
+#define ERRFUNC_BODY(class,filename,line,item_desc,item,format) \
+{ \
+ va_list arglist; \
+ \
+ va_start (arglist, format); \
+ report_error (class, filename, line, item_desc, item, format, arglist); \
+ va_end (arglist); \
+}
+
+/*
+ * Expands to a complete definition of a void function called `name' with
+ * parameter list `params' (parentheses included by the caller) whose body
+ * is ERRFUNC_BODY applied to the remaining arguments.
+ */
+#define GEN_ERRFUNC(name,params,class,filename,line,item_desc,item,format) \
+void name params \
+ERRFUNC_BODY (class, filename, line, item_desc, item, format)
+
+/* Same as GEN_ERRFUNC, but the generated function has internal linkage. */
+#define GEN_PRIVATE_ERRFUNC(name,params, \
+ class,filename,line,item_desc,item,format) \
+static GEN_ERRFUNC(name,params,class,filename,line,item_desc,item,format)
+
+/*
+ * Prototypes for functions exported by error.c but only used within
+ * the library -- functions that can be called by outsiders are declared
+ * in btparse.h.
+ */
+
+void print_error (bt_error *err);
+/* va_list workhorse that the ERRFUNC_BODY-generated wrappers call */
+void report_error (bt_errclass class,
+ char * filename, int line, char * item_desc, int item,
+ char * format, va_list arglist);
+
+/* printf-style front ends; each takes a format string plus arguments */
+void general_error (bt_errclass class,
+ char * filename, int line, char * item_desc, int item,
+ char * format, ...);
+void error (bt_errclass class, char * filename, int line, char * format, ...);
+void ast_error (bt_errclass class, AST * ast, char * format, ...);
+
+void notify (char *format,...);
+void usage_warning (char * format, ...);
+void usage_error (char * format, ...);
+void internal_error (char * format, ...);
+
+#endif
diff --git a/btparse/src/file_header.c b/btparse/src/file_header.c
new file mode 100644
index 0000000..bc1d9cb
--- /dev/null
+++ b/btparse/src/file_header.c
@@ -0,0 +1,20 @@
+/* ------------------------------------------------------------------------
+@NAME :
+@DESCRIPTION:
+@GLOBALS :
+@CREATED :
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+
+
+#include "my_dmalloc.h"
diff --git a/btparse/src/format_name.c b/btparse/src/format_name.c
new file mode 100644
index 0000000..862f223
--- /dev/null
+++ b/btparse/src/format_name.c
@@ -0,0 +1,974 @@
+/* ------------------------------------------------------------------------
+@NAME : format_name.c
+@DESCRIPTION: bt_format_name() and support functions: everything needed
+ to turn a bt_name structure (as returned by bt_split_name())
+ back into a string according to a highly customizable format.
+@GLOBALS :
+@CREATED :
+@MODIFIED :
+@VERSION : $Id: format_name.c 9577 2011-02-15 21:34:08Z ambs $
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "btparse.h"
+#include "prototypes.h"
+#include "error.h"
+#include "my_dmalloc.h"
+#include "bt_debug.h"
+
+
+static char EmptyString[] = "";
+
+
+#if DEBUG
+/* prototypes to shut "gcc -Wmissing-prototypes" up */
+void print_tokens (char *partname, char **tokens, int num_tokens);
+void dump_name (bt_name * name);
+void dump_format (bt_name_format * format);
+#endif
+
+
+/* ----------------------------------------------------------------------
+ * Interface to create/customize bt_name_format structures
+ */
+
+/* ------------------------------------------------------------------------
+@NAME : bt_create_name_format
+@INPUT : parts - a string of letters (maximum four, from the set
+ f, v, l, j, with no repetition) denoting the order
+ and presence of name parts. Also used to determine
+ certain pre-part text strings.
+ abbrev_first - flag: should first names be abbreviated?
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Creates a bt_name_format structure, slightly customized
+ according to the caller's choice of token order and
+ whether to abbreviate the first name. Use
+ bt_free_name_format() to free the structure (and any sub-
+ structures that may be allocated here). Use
+ bt_set_format_text() and bt_set_format_options() for
+ further customization of the format structure; do not
+ fiddle its fields directly.
+
+ Fills in the structures `parts' field according to `parts'
+ string: 'f' -> BTN_FIRST, and so on.
+
+ Sets token join methods: inter-token join (within each part)
+ is set to BTJ_MAYTIE (a "discretionary tie") for all parts;
+ inter-part join is set to BTJ_SPACE, except for a 'von'
+ token immediately preceding a 'last' token; there, we have
+ a discretionary tie.
+
+ Sets abbreviation flags: FALSE for everything except `first',
+ which follows `abbrev_first' argument.
+
+ Sets surrounding text (pre- and post-part, pre- and post-
+ token): empty string for everything, except:
+ - post-token for 'first' is "." if abbrev_first true
+ - if 'jr' immediately preceded by 'last':
+ pre-part for 'jr' is ", ", join for 'last' is nothing
+ - if 'first' immediately preceded by 'last'
+ pre-part for 'first' is ", " , join for 'last' is nothing
+ - if 'first' immediately preceded by 'jr' and 'jr' immediately
+ preceded by 'last':
+ pre-part for 'first' and 'jr' is ", " ,
+ join for 'last' and 'jr' is nothing
+@CREATED : 1997/11/02, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+bt_name_format *
+bt_create_name_format (char * parts, boolean abbrev_first)
+{
+   int   num_parts;
+   int   num_valid_parts;
+   bt_name_format *
+         format;
+   int   part_pos[BT_MAX_NAMEPARTS];
+   int   i;
+
+   /*
+    * part_pos[p] records where part p appears in the caller's order
+    * string.  Parts that do not appear keep the sentinel -2, so that the
+    * "X immediately follows Y" tests below (pos[X] == pos[Y]+1) can never
+    * succeed when either part is absent: -2+1 is -1, which is neither a
+    * real position nor the sentinel.
+    */
+   for (i = 0; i < BT_MAX_NAMEPARTS; i++)
+      part_pos[i] = -2;
+
+   /*
+    * Check that the part list (a string with one letter -- f, v, l, or j
+    * -- for each part) is valid: no longer than four characters, and no
+    * invalid characters.
+    * NOTE(review): the code below relies on usage_error() not returning
+    * when the list is too long or contains a bad letter -- confirm.
+    */
+
+   num_parts = strlen (parts);
+   num_valid_parts = strspn (parts, BT_VALID_NAMEPARTS);
+   if (num_parts > BT_MAX_NAMEPARTS)
+   {
+      usage_error ("bt_create_name_format: part list must have no more than "
+                   "%d letters", BT_MAX_NAMEPARTS);
+   }
+   if (num_valid_parts != num_parts)
+   {
+      usage_error ("bt_create_name_format: bad part abbreviation \"%c\" "
+                   "(must be one of \"%s\")",
+                   parts[num_valid_parts], BT_VALID_NAMEPARTS);
+   }
+
+
+   /* User input is OK -- let's create the structure */
+
+   format = (bt_name_format *) malloc (sizeof (bt_name_format));
+   if (format == NULL)                  /* out of memory: nothing to fill in */
+      return NULL;
+
+   format->num_parts = num_parts;
+   for (i = 0; i < num_parts; i++)
+   {
+      switch (parts[i])
+      {
+         case 'f': format->parts[i] = BTN_FIRST; break;
+         case 'v': format->parts[i] = BTN_VON; break;
+         case 'l': format->parts[i] = BTN_LAST; break;
+         case 'j': format->parts[i] = BTN_JR; break;
+         default: internal_error ("bad part abbreviation \"%c\"", parts[i]);
+      }
+      part_pos[format->parts[i]] = i;
+   }
+   /* unused trailing slots of the order array are marked empty */
+   for (; i < BT_MAX_NAMEPARTS; i++)
+   {
+      format->parts[i] = BTN_NONE;
+   }
+
+
+   /*
+    * Set the token join methods: between tokens for all parts is a
+    * discretionary tie, and the join between parts is a space (except for
+    * 'von': if followed by 'last', we will have a discretionary tie).
+    * All BT_MAX_NAMEPARTS slots are initialized, not just the parts the
+    * caller asked for.
+    */
+   for (i = 0; i < BT_MAX_NAMEPARTS; i++)
+   {
+      format->join_tokens[i] = BTJ_MAYTIE;
+      format->join_part[i] = BTJ_SPACE;
+   }
+   if (part_pos[BTN_VON] + 1 == part_pos[BTN_LAST])
+      format->join_part[BTN_VON] = BTJ_MAYTIE;
+
+
+   /*
+    * Now the abbreviation flags: follow 'abbrev_first' flag for 'first',
+    * and FALSE for everything else.
+    */
+   format->abbrev[BTN_FIRST] = abbrev_first;
+   format->abbrev[BTN_VON] = FALSE;
+   format->abbrev[BTN_LAST] = FALSE;
+   format->abbrev[BTN_JR] = FALSE;
+
+
+
+   /*
+    * Now fill in the "surrounding text" fields (pre- and post-part, pre-
+    * and post-token) -- start out with everything set to the empty string,
+    * and then tweak it to handle abbreviated first names, 'jr' following
+    * 'last', and 'first' following 'last' or 'last' and 'jr'.  In the
+    * last three cases, we put in some pre-part text (", "), and also
+    * set the join method for the *previous* part (jr or last) to
+    * BTJ_NOTHING, so we don't get extraneous space before the ", ".
+    */
+   for (i = 0; i < BT_MAX_NAMEPARTS; i++)
+   {
+      format->pre_part[i] = EmptyString;
+      format->post_part[i] = EmptyString;
+      format->pre_token[i] = EmptyString;
+      format->post_token[i] = EmptyString;
+   }
+
+   /* abbreviated first name:
+    * "Blow J" -> "Blow J.", or "J Blow" -> "J. Blow"
+    */
+   if (abbrev_first)
+   {
+      format->post_token[BTN_FIRST] = ".";
+   }
+   /* 'jr' after 'last': "Joe Blow Jr." -> "Joe Blow, Jr." */
+   if (part_pos[BTN_JR] == part_pos[BTN_LAST]+1)
+   {
+      format->pre_part[BTN_JR] = ", ";
+      format->join_part[BTN_LAST] = BTJ_NOTHING;
+      /* 'first' after 'last' and 'jr': "Blow, Jr. Joe"->"Blow, Jr., Joe" */
+      if (part_pos[BTN_FIRST] == part_pos[BTN_JR]+1)
+      {
+         format->pre_part[BTN_FIRST] = ", ";
+         format->join_part[BTN_JR] = BTJ_NOTHING;
+      }
+   }
+   /* first after last: "Blow Joe" -> "Blow, Joe" */
+   if (part_pos[BTN_FIRST] == part_pos[BTN_LAST]+1)
+   {
+      format->pre_part[BTN_FIRST] = ", ";
+      format->join_part[BTN_LAST] = BTJ_NOTHING;
+   }
+
+   DBG_ACTION
+      (1, printf ("bt_create_name_format(): returning structure %p\n", format))
+
+   return format;
+
+} /* bt_create_name_format() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_free_name_format()
+@INPUT : format - free()'d, so this is an invalid pointer after the call
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Frees a bt_name_format structure created by
+ bt_create_name_format().
+@CREATED : 1997/11/02, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_free_name_format (bt_name_format * format)
+{
+   /*
+    * Only the structure itself is released; the text pointers stored in
+    * it are not freed here.
+    */
+   if (format != NULL)
+      free (format);
+}
+
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_set_format_text
+@INPUT : format - the format structure to update
+ part - which name-part to change the surrounding text for
+ pre_part - "pre-part" text, or NULL to leave alone
+ post_part - "post-part" text, or NULL to leave alone
+ pre_token - "pre-token" text, or NULL to leave alone
+ post_token - "post-token" text, or NULL to leave alone
+@OUTPUT : format - pre_part, post_part, pre_token, post_token
+ arrays updated (only those with corresponding
+ non-NULL parameters are touched)
+@RETURNS :
+@DESCRIPTION: Sets the "surrounding text" for a particular name part in
+ a name format structure.
+@CREATED : 1997/11/02, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_set_format_text (bt_name_format * format,
+                    bt_namepart part,
+                    char * pre_part,
+                    char * post_part,
+                    char * pre_token,
+                    char * post_token)
+{
+   /*
+    * Only the pointers are stored -- the strings are not copied.  A NULL
+    * argument leaves the corresponding slot for `part' untouched.
+    */
+   if (pre_part != NULL)
+   {
+      format->pre_part[part] = pre_part;
+   }
+   if (post_part != NULL)
+   {
+      format->post_part[part] = post_part;
+   }
+   if (pre_token != NULL)
+   {
+      format->pre_token[part] = pre_token;
+   }
+   if (post_token != NULL)
+   {
+      format->post_token[part] = post_token;
+   }
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_set_format_options()
+@INPUT : format
+ part
+ abbrev
+ join_tokens
+ join_part
+@OUTPUT : format - abbrev, join_tokens, join_part arrays all updated
+@RETURNS :
+@DESCRIPTION: Sets various formatting options for a particular name part in
+ a name format structure.
+@CREATED : 1997/11/02, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_set_format_options (bt_name_format * format,
+                       bt_namepart part,
+                       boolean abbrev,
+                       bt_joinmethod join_tokens,
+                       bt_joinmethod join_part)
+{
+   /* All three options for `part' are overwritten unconditionally. */
+   format->join_part[part]   = join_part;
+   format->join_tokens[part] = join_tokens;
+   format->abbrev[part]      = abbrev;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Functions for actually formatting a name (given a name and a name
+ * format structure).
+ */
+
+/* ------------------------------------------------------------------------
+@NAME : count_virtual_char()
+@INPUT : string
+ offset
+@OUTPUT : vchar_count
+@INOUT : depth
+ in_special
+@RETURNS :
+@DESCRIPTION: Munches a single physical character from a string, updating
+ the virtual character count, the depth, and an "in special
+ character" flag.
+
+ The virtual character count is incremented by any character
+ not part of a special character, and also by the right-brace
+ that closes a special character. The depth is incremented by
+ a left brace, and decremented by a right brace. in_special
+ is set to TRUE when we encounter a left brace at depth zero
+ that is immediately followed by a backslash; it is set to
+ false when we encounter the end of the special character,
+ i.e. when in_special is TRUE and we hit a right brace that
+ brings us back to depth zero.
+
+ *vchar_count and *depth should both be set to zero the first
+ time you call count_virtual_char() on a particular string,
+ and in_special should be set to FALSE.
+@CALLS :
+@CALLERS : string_length()
+ string_prefix()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static void
+count_virtual_char (char * string,
+                    int offset,
+                    int * vchar_count,
+                    int * depth,
+                    boolean * in_special,
+                    int * utf8_length)
+{
+   char  c = string[offset];
+
+   if (c == '{')
+   {
+      /* a left brace at depth zero that is directly followed by a
+       * backslash opens a special character */
+      if (*depth == 0 && string[offset+1] == '\\')
+         *in_special = TRUE;
+      (*depth)++;
+   }
+   else if (c == '}')
+   {
+      /* the brace that takes us back to depth zero closes the special
+       * character, which counts as one virtual character in total */
+      if (*in_special && *depth == 1)
+      {
+         *in_special = FALSE;
+         (*vchar_count)++;
+      }
+      (*depth)--;
+   }
+   else if (! *in_special)
+   {
+      /*
+       * Ordinary byte outside a special character.  *utf8_length holds
+       * the number of bytes still to consume for the character in
+       * progress (zero when none is in progress); the virtual character
+       * is counted only on its final byte.
+       * NOTE(review): get_uchar() is assumed to return the byte length of
+       * the UTF-8 sequence starting at `offset' -- confirm.
+       */
+      if (*utf8_length == 0)
+         *utf8_length = get_uchar (string, offset);
+
+      if (*utf8_length == 1)
+      {
+         (*vchar_count)++;
+         *utf8_length = 0;
+      }
+      else if (*utf8_length > 1)
+      {
+         (*utf8_length)--;
+      }
+   }
+   /* bytes inside a special character change nothing */
+} /* count_virtual_char () */
+
+
+/* this should probably be publicly available, documented, etc. */
+/* ------------------------------------------------------------------------
+@NAME : string_length()
+@INPUT : string
+@OUTPUT :
+@RETURNS : "virtual length" of `string'
+@DESCRIPTION: Counts the number of "virtual characters" in a string. A
+ virtual character is either an entire BibTeX special character,
+ or any character outside of a special character.
+
+ Thus, "Hello" has virtual length 5, and so does
+ "H{\\'e}ll{\\\"o}". "{\\noop Hello there how are you?}" has
+ virtual length one.
+@CALLS : count_virtual_char()
+@CALLERS : format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static int
+string_length (char * string)
+{
+   int      vlen = 0;            /* virtual characters counted so far */
+   int      brace_depth = 0;
+   boolean  special = FALSE;
+   int      utf8_pending = 0;
+   int      pos = 0;
+
+   /* a NULL string has no virtual characters */
+   if (string == NULL)
+      return 0;
+
+   /* feed every physical byte through the virtual-character counter */
+   while (string[pos] != 0)
+   {
+      count_virtual_char (string, pos,
+                          &vlen, &brace_depth, &special, &utf8_pending);
+      pos++;
+   }
+
+   return vlen;
+} /* string_length() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : string_prefix()
+@INPUT : string
+ prefix_len
+@OUTPUT :
+@RETURNS : physical length of the prefix of `string' with a virtual length
+ of `prefix_len'
+@DESCRIPTION: Counts the number of physical characters from the beginning
+ of `string' needed to extract a sub-string with virtual
+ length `prefix_len'. There is a special case emulating BibTeX
+ where we want to ignore beginning '{' which are not escaping
+ a virtual char, for example '{Some Organization}' with prefix_len
+ 1 should return "S".
+@CALLS : count_virtual_char()
+@CALLERS : format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static int
+string_prefix (char * string, int prefix_start, int prefix_len)
+{
+   int      pos = prefix_start;
+   int      seen = 0;            /* virtual characters consumed so far */
+   int      brace_depth = 0;
+   boolean  special = FALSE;
+   int      utf8_pending = 0;
+
+   /*
+    * Walk forward from prefix_start; as soon as `prefix_len' virtual
+    * characters have been seen, report how many physical bytes that took.
+    */
+   while (string[pos] != 0)
+   {
+      count_virtual_char (string, pos,
+                          &seen, &brace_depth, &special, &utf8_pending);
+      pos++;
+      if (seen == prefix_len)
+         return pos - prefix_start;
+   }
+
+   /* ran out of string first: the prefix is everything from prefix_start */
+   return pos - prefix_start;
+
+} /* string_prefix() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : string_prefix_start()
+@INPUT : string
+@OUTPUT :
+@RETURNS : index where we need to start looking at name part when
+ abbreviating
+@DESCRIPTION: If we are not in a special but depth == 1 then we need to
+ start at index+1, i.e. just past the opening brace (examples:
+ "{John Henry} Ford" or "{Some Organisation Inc.}").
+@CALLS :
+@CALLERS : format_name()
+@CREATED : 2010/03/13, PK
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static int
+string_prefix_start (char * string, int index)
+{
+   int      vchars = 0;
+   int      brace_depth = 0;
+   boolean  special = FALSE;
+   int      utf8_pending = 0;
+
+   /* examine just the one character at `index', starting from a clean state */
+   count_virtual_char (string, index,
+                       &vchars, &brace_depth, &special, &utf8_pending);
+
+   /*
+    * Depth 1 without a special character means string[index] is a plain
+    * opening brace: skip over it.  Anything else starts right at `index'.
+    */
+   return (brace_depth == 1 && ! special) ? index + 1 : index;
+
+} /* string_prefix_start() */
+
+
+
+/* ------------------------------------------------------------------------
+@NAME : append_text()
+@INOUT : string
+@INPUT : offset
+ text
+ start
+ len
+@OUTPUT :
+@RETURNS : number of characters copied from text+start to string+offset
+@DESCRIPTION: Copies at most `len' characters from text+start to
+ string+offset. (I don't use strcpy() or strncpy() for this
+ because I need to get the number of characters actually
+ copied.)
+@CALLS :
+@CALLERS : format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static int
+append_text (char * string,
+             int offset,
+             char * text,
+             int start,
+             int len)
+{
+   int   i;
+
+   if (text == NULL) return 0;         /* no text -- none appended! */
+
+   /*
+    * Copy from text+start to string+offset until the end of `text' or
+    * until `len' characters have been copied, whichever comes first.  A
+    * negative `len' means "no limit".  A `len' of zero copies nothing
+    * (it used to be treated like "no limit", contradicting the "at most
+    * `len' characters" contract).  No terminating NUL is written, and the
+    * caller must make sure `string' is big enough.
+    */
+   for (i = 0; text[start+i] != 0; i++)
+   {
+      if (len >= 0 && i == len)
+         break;                        /* limit reached; i == len here */
+      string[offset+i] = text[start+i];
+   } /* for i */
+
+   return i;                           /* number of characters copied */
+
+} /* append_text () */
+
+
+/* ------------------------------------------------------------------------
+@NAME : append_join
+@INOUT : string
+@INPUT : offset
+ method
+ should_tie
+@OUTPUT :
+@RETURNS : number of characters appended to string+offset (either 0 or 1)
+@DESCRIPTION: Copies a "join character" ('~' or ' ') or nothing to
+ string+offset, according to the join method specified by
+ `method' and the `should_tie' flag.
+
+ Specifically: if `method' is BTJ_SPACE, a space is appended
+ and 1 is returned; if `method' is BTJ_FORCETIE, a TeX "tie"
+ character ('~') is appended and 1 is returned. If `method'
+ is BTJ_NOTHING, `string' is unchanged and 0 is returned. If
+ `method' is BTJ_MAYTIE then either a tie (if should_tie is
+ true) or a space (otherwise) is appended, and 1 is returned.
+@CALLS :
+@CALLERS : format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+@COMMENTS : This should allow "tie" strings other than TeX's '~' -- I
+ think this could be done by putting a "tie string" field in
+ the name format structure, and using it here.
+-------------------------------------------------------------------------- */
+static int
+append_join (char * string,
+             int offset,
+             bt_joinmethod method,
+             boolean should_tie)
+{
+   char  joiner;
+
+   /* work out which join character (if any) this method calls for */
+   switch (method)
+   {
+      case BTJ_MAYTIE:                 /* discretionary tie: the caller's */
+         joiner = should_tie ? '~' : ' ';    /* should_tie flag decides   */
+         break;
+      case BTJ_SPACE:
+         joiner = ' ';
+         break;
+      case BTJ_FORCETIE:
+         joiner = '~';
+         break;
+      case BTJ_NOTHING:
+         return 0;                     /* string is left untouched */
+      default:
+         internal_error ("bad token join method %d", (int) method);
+         return 0;                     /* can't happen -- just here to */
+                                       /* keep gcc -Wall happy */
+   }
+
+   string[offset] = joiner;
+   return 1;
+} /* append_join () */
+
+
+/*
+ * Length of `s', treating NULL as the empty string.  The argument and the
+ * whole expansion are parenthesized so the macro can be used safely as an
+ * operand of a larger expression.
+ */
+#define STRLEN(s) (((s) == NULL) ? 0 : strlen (s))
+
+/* ------------------------------------------------------------------------
+@NAME : format_firstpass()
+@INPUT : name
+ format
+@OUTPUT :
+@RETURNS : upper bound on the length (excluding the terminating NUL) of
+ the string format_name() will build for `name' and `format'
+@DESCRIPTION: Makes the first pass over a name for formatting, in order to
+ establish an upper bound on the length of the formatted name.
+
+ For abbreviated parts, format_name() emits the post-token text
+ once more in front of every hyphen it copies (eg. "Jean-Paul"
+ becomes "J.-P."), so the bound includes one extra post-token
+ string per hyphen in each token of such a part.
+@CALLS :
+@CALLERS : bt_format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static unsigned
+format_firstpass (bt_name * name,
+                  bt_name_format * format)
+{
+   int            i;                   /* loop over parts */
+   int            j;                   /* loop over tokens */
+   unsigned       max_length;
+   unsigned       post_token_len;
+   bt_namepart    part;
+   char **        tok;
+   int            num_tok;
+   char *         p;                   /* scans a token for hyphens */
+
+   max_length = 0;
+
+   for (i = 0; i < format->num_parts; i++)
+   {
+      part = format->parts[i];         /* 'cause I'm a lazy typist */
+      tok = name->parts[part];
+      num_tok = name->part_len[part];
+
+      /* a part either has a token list and tokens, or neither */
+      assert ((tok != NULL) == (num_tok > 0));
+      if (tok)
+      {
+         post_token_len = STRLEN (format->post_token[part]);
+
+         max_length += STRLEN (format->pre_part[part]);
+         max_length += STRLEN (format->post_part[part]);
+         max_length += STRLEN (format->pre_token[part]) * num_tok;
+         max_length += post_token_len * num_tok;
+         max_length += num_tok + 1;    /* one join char per token, plus */
+                                       /* join char to next part */
+
+         /*
+          * We ignore the shortening done by abbreviation here -- that
+          * just overestimates the maximum length, so no big deal.  Also
+          * saves us the bother of computing the physical length of the
+          * prefix of virtual length 1.
+          */
+         for (j = 0; j < num_tok; j++)
+         {
+            max_length += STRLEN (tok[j]);
+
+            /*
+             * Abbreviation can also *add* text: one extra post-token
+             * string per hyphen.  Counting every hyphen (even ones inside
+             * braces, which format_name() skips) keeps this a safe
+             * overestimate.
+             */
+            if (format->abbrev[part] && tok[j] != NULL)
+            {
+               for (p = tok[j]; *p != 0; p++)
+               {
+                  if (*p == '-')
+                     max_length += post_token_len;
+               }
+            }
+         }
+      }
+
+   } /* for i (loop over parts) */
+
+   return max_length;
+
+} /* format_firstpass() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : format_name()
+@INPUT : format
+ tokens - token list (eg. from format_firstpass())
+ num_tokens - token count list (eg. from format_firstpass())
+@OUTPUT : fname - filled in, must be preallocated by caller
+@RETURNS :
+@DESCRIPTION: Performs the second pass over a name and format, to actually
+ put the name into a single string according to `format'.
+@CALLS :
+@CALLERS : bt_format_name()
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Second formatting pass: writes the parts of a name, in the order given
+ * by `format', into `fname' and NUL-terminates it.  There is no length
+ * parameter and no bounds checking here, so the caller must supply a
+ * buffer big enough for the result (bt_format_name() sizes it from
+ * format_firstpass()).  `tokens' and `num_tokens' are indexed by
+ * bt_namepart; a NULL tokens[part] means the name lacks that part.
+ */
+static void
+format_name (bt_name_format * format,
+ char *** tokens,
+ int * num_tokens,
+ char * fname)
+{
+ bt_namepart parts[BT_MAX_NAMEPARTS]; /* culled list from format */
+ int num_parts;
+
+ int offset; /* into fname */
+ int tmpoffset;
+ int i; /* loop over parts */
+ int j; /* loop over tokens */
+ int k; /* loop within tokens */
+ bt_namepart part;
+ int prefix_len;
+ /* NOTE(review): abbrev_prefix_len and abbrev_prefix_start are declared
+ but never referenced in the body below */
+ int abbrev_prefix_len;
+ int prefix_start; /* Index where to start looking for abbrev */
+ int abbrev_prefix_start; /* Index where to start looking for abbrev
+ but taking into account post-part token
+ to deal with hyphens in terse abbrevs */
+ int token_len; /* "physical" length (characters) */
+ int token_vlen; /* "virtual" length (special char */
+ /* counts as one character) */
+ boolean should_tie;
+ /* hyphen_todo has no initializer: its first read is in
+ "k == 0 || hyphen_todo", where k == 0 short-circuits, and the
+ branch taken then assigns it */
+ boolean hyphen_todo;
+
+ int vchars_seen;
+ int depth;
+ boolean in_special;
+ int utf8_length;
+
+ /*
+ * Cull format->parts down by keeping only those parts that are actually
+ * present in the current name (keeps the main loop simpler: makes it
+ * easy to know if the "next part" is present or not, so we know whether
+ * to append a join character).
+ */
+ num_parts = 0;
+ for (i = 0; i < format->num_parts; i++)
+ {
+ part = format->parts[i];
+ if (tokens[part]) /* name actually has this part */
+ parts[num_parts++] = part;
+ }
+
+ offset = 0;
+ token_vlen = -1; /* sanity check, and keeps */
+ /* "gcc -O -Wall" happy */
+
+ for (i = 0; i < num_parts; i++)
+ {
+ part = parts[i];
+
+ /* a length of -1 asks append_text() for the whole string */
+ offset += append_text (fname, offset,
+ format->pre_part[part], 0, -1);
+
+ for (j = 0; j < num_tokens[part]; j++)
+ {
+ offset += append_text (fname, offset,
+ format->pre_token[part], 0, -1);
+
+ if (format->abbrev[part])
+ {
+ /* Set up tracking of depth and specials so we can ignore
+ hyphenated token parts within protected braces */
+ vchars_seen = 0;
+ depth = 0;
+ in_special = FALSE;
+ utf8_length = 0;
+
+ /* walk the token byte by byte, emitting one initial at the
+ start and one after each unprotected hyphen */
+ for (k = 0 ; tokens[part][j][k] != 0; k++)
+ {
+
+ count_virtual_char (tokens[part][j], k, &vchars_seen, &depth, &in_special, &utf8_length);
+ prefix_start = string_prefix_start (tokens[part][j], k);
+
+ /* Add initial from the beginning of the string or beginning of after-hyphen
+ string */
+ if (k == 0 || hyphen_todo)
+ {
+ /* the initial is the prefix of virtual length 1, which may
+ span several bytes */
+ prefix_len = string_prefix (tokens[part][j], prefix_start, 1);
+ token_len = append_text (fname, offset,
+ tokens[part][j], prefix_start, prefix_len);
+ offset += token_len;
+ hyphen_todo = 0;
+ }
+ /* Potentially add a hyphen unless in protecting braces */
+ if (tokens[part][j][k] == '-' && depth == 0 && in_special == FALSE)
+ {
+ /* Add any post token part e. g. ('.') */
+ tmpoffset = 0;
+ tmpoffset = append_text (fname, offset,
+ format->post_token[part], 0, -1);
+ offset += tmpoffset;
+
+ /* copy the hyphen */
+ tmpoffset = append_text (fname, offset,
+ tokens[part][j],
+ k, 1);
+ offset += tmpoffset;
+
+ /* Set a flag to say we need to get the post-hyphen initial */
+ hyphen_todo = 1;
+ }
+ }
+ /* an abbreviated token is treated as one virtual character
+ for the tie decisions below */
+ token_vlen = 1;
+ }
+ else
+ {
+ token_len = append_text (fname, offset,
+ tokens[part][j], 0, -1);
+ offset += token_len;
+ token_vlen = string_length (tokens[part][j]);
+ }
+
+ offset += append_text (fname, offset,
+ format->post_token[part], 0, -1);
+
+ /* join to next token, but only if there is a next token! */
+ if (j < num_tokens[part]-1)
+ {
+ /* tie after a first token shorter than three virtual
+ characters, and always before the last token */
+ should_tie = (num_tokens[part] > 1)
+ && (((j == 0) && (token_vlen < 3))
+ || (j == num_tokens[part]-2));
+ offset += append_join (fname, offset,
+ format->join_tokens[part], should_tie);
+ }
+
+ } /* for j */
+
+ offset += append_text (fname, offset,
+ format->post_part[part], 0, -1);
+ /* join to the next part, but again only if there is a next part */
+ if (i < num_parts-1)
+ {
+ if (token_vlen == -1)
+ {
+ internal_error ("token_vlen uninitialized -- no tokens in a part "
+ "that I checked existed");
+ }
+ /* tie only when this part is a single token shorter than three
+ virtual characters */
+ should_tie = (num_tokens[part] == 1 && token_vlen < 3);
+ offset += append_join (fname, offset,
+ format->join_part[part], should_tie);
+ }
+
+ } /* for i (loop over parts) */
+
+ /* none of the append helpers terminates the string, so do it here */
+ fname[offset] = 0;
+
+} /* format_name () */
+
+
+#if DEBUG
+
+#define STATIC /* so BibTeX.xs can call 'em too */
+
+/* borrowed print_tokens() and dump_name() from t/name_test.c */
+STATIC void
+print_tokens (char *partname, char **tokens, int num_tokens)
+{
+   int   idx;
+
+   /* a part with no token list prints nothing at all */
+   if (tokens == NULL)
+      return;
+
+   /* "<partname> = (tok|tok|...|tok)" followed by a newline */
+   printf ("%s = (", partname);
+   for (idx = 0; idx < num_tokens; idx++)
+   {
+      char terminator = (idx == num_tokens-1) ? ')' : '|';
+
+      printf ("%s%c", tokens[idx], terminator);
+   }
+   putchar ('\n');
+}
+
+
+STATIC void
+dump_name (bt_name * name)
+{
+   /* debugging aid: print the four parts of a split name to stdout */
+   if (name == NULL)
+   {
+      printf (" name: null\n");
+   }
+   else if (name->tokens == NULL)
+   {
+      printf (" name: null token list\n");
+   }
+   else
+   {
+      printf (" name (%p):\n", name);
+      printf (" total number of tokens = %d\n", name->tokens->num_items);
+      print_tokens (" first", name->parts[BTN_FIRST], name->part_len[BTN_FIRST]);
+      print_tokens (" von", name->parts[BTN_VON], name->part_len[BTN_VON]);
+      print_tokens (" last", name->parts[BTN_LAST], name->part_len[BTN_LAST]);
+      print_tokens (" jr", name->parts[BTN_JR], name->part_len[BTN_JR]);
+   }
+}
+
+
+STATIC void
+dump_format (bt_name_format * format)
+{
+   /* display names, indexed by bt_namepart and bt_joinmethod values */
+   static char * nameparts[] = { "first", "von", "last", "jr" };
+   static char * joinmethods[] = {"may tie", "space", "force tie", "nothing"};
+   int   part;
+   int   pos;
+
+   printf (" name format (%p):\n", format);
+
+   /* the parts in the order the format will emit them */
+   printf (" order:");
+   for (pos = 0; pos < format->num_parts; pos++)
+      printf (" %s", nameparts[format->parts[pos]]);
+   printf ("\n");
+
+   /* per-part settings, but only for parts the format actually uses */
+   for (part = 0; part < BT_MAX_NAMEPARTS; part++)
+   {
+      int   in_format = 0;
+
+      for (pos = 0; pos < format->num_parts && ! in_format; pos++)
+         in_format = (part == format->parts[pos]);
+      if (! in_format)
+         continue;
+
+      printf (" %-5s: pre-part=%p (%s), post-part=%p (%s)\n",
+              nameparts[part],
+              format->pre_part[part], format->pre_part[part],
+              format->post_part[part], format->post_part[part]);
+      printf (" %-5s pre-token=%p (%s), post-token=%p (%s)\n",
+              "",
+              format->pre_token[part], format->pre_token[part],
+              format->post_token[part],format->post_token[part]);
+      printf (" %-5s abbrev=%s, join_tokens=%s, join_parts=%s\n",
+              "",
+              format->abbrev[part] ? "yes" : "no",
+              joinmethods[format->join_tokens[part]],
+              joinmethods[format->join_part[part]]);
+   }
+}
+#endif
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_format_name()
+@INPUT : name
+ format
+@OUTPUT :
+@RETURNS : formatted name (allocated with malloc(); caller must free() it)
+@DESCRIPTION: Formats an already-split name according to a pre-constructed
+ format structure.
+@GLOBALS :
+@CALLS : format_firstpass(), format_name()
+@CALLERS :
+@CREATED : 1997/11/03, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+char *
+bt_format_name (bt_name * name,
+                bt_name_format * format)
+{
+   unsigned max_length;
+   char *   fname;
+
+#if DEBUG >= 2
+   printf ("bt_format_name():\n");
+   dump_name (name);
+   dump_format (format);
+#endif
+
+   /*
+    * Two passes: the first computes an upper bound on the formatted
+    * length so the result buffer can be allocated in one go; the second
+    * fills it in.
+    */
+   max_length = format_firstpass (name, format);
+   fname = (char *) malloc ((max_length+1) * sizeof (char));
+   if (fname == NULL)                  /* out of memory: format_name() */
+      return NULL;                     /* must not write through NULL  */
+#if 0
+   memset (fname, '_', max_length);
+   fname[max_length] = 0;
+#endif
+   format_name (format, name->parts, name->part_len, fname);
+
+   /* sanity check: the second pass must stay within the first-pass bound */
+   assert (strlen (fname) <= max_length);
+   return fname;
+
+} /* bt_format_name() */
diff --git a/btparse/src/function_header.c b/btparse/src/function_header.c
new file mode 100644
index 0000000..6209e5b
--- /dev/null
+++ b/btparse/src/function_header.c
@@ -0,0 +1,12 @@
+/* ------------------------------------------------------------------------
+@NAME :
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION:
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED :
+@MODIFIED :
+-------------------------------------------------------------------------- */
diff --git a/btparse/src/init.c b/btparse/src/init.c
new file mode 100644
index 0000000..ee898af
--- /dev/null
+++ b/btparse/src/init.c
@@ -0,0 +1,42 @@
+/* ------------------------------------------------------------------------
+@NAME : init.c
+@DESCRIPTION: Initialization and cleanup functions for the btparse library.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/19, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include "stdpccts.h" /* for zzfree_ast() prototype */
+#include "parse_auxiliary.h" /* for fix_token_names() proto */
+#include "prototypes.h" /* for other prototypes */
+#include "my_dmalloc.h"
+
+/*
+ * bt_initialize()
+ *
+ * Library start-up entry point: calls fix_token_names() followed by
+ * init_macros(). Takes no arguments and returns nothing.
+ */
+void bt_initialize (void)
+{
+ /* Initialize data structures */
+
+ fix_token_names ();
+ init_macros ();
+}
+
+
+/*
+ * bt_free_ast()
+ *
+ * Releases an AST by handing it straight to the PCCTS routine
+ * zzfree_ast(); this wrapper adds no checks of its own.
+ */
+void bt_free_ast (AST *ast)
+{
+ zzfree_ast (ast);
+}
+
+
+/*
+ * bt_cleanup()
+ *
+ * Library shut-down entry point: calls done_macros(), the counterpart
+ * of the init_macros() call made by bt_initialize().
+ */
+void bt_cleanup (void)
+{
+ done_macros ();
+}
diff --git a/btparse/src/input.c b/btparse/src/input.c
new file mode 100644
index 0000000..6b03163
--- /dev/null
+++ b/btparse/src/input.c
@@ -0,0 +1,515 @@
+/* ------------------------------------------------------------------------
+@NAME : input.c
+@DESCRIPTION: Routines for input of BibTeX data.
+@GLOBALS : InputFilename
+ StringOptions
+@CALLS :
+@CREATED : 1997/10/14, Greg Ward (from code in bibparse.c)
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#include "bt_config.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <assert.h>
+#include "stdpccts.h"
+#include "lex_auxiliary.h"
+#include "prototypes.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+
+
+
+/*
+ * InputFilename: name of the input being parsed; assigned by the
+ * bt_parse_*() functions in this file.
+ */
+char * InputFilename;
+/*
+ * StringOptions: string-processing option bits, one slot per metatype
+ * (indexed by a bt_metatype value). bt_set_stringopts() overwrites a slot;
+ * the parse functions OR the slot for the entry's metatype into the
+ * options they pass to bt_postprocess_entry().
+ */
+btshort StringOptions[NUM_METATYPES] =
+{
+ 0, /* BTE_UNKNOWN */
+ BTO_FULL, /* BTE_REGULAR */
+ BTO_MINIMAL, /* BTE_COMMENT */
+ BTO_MINIMAL, /* BTE_PREAMBLE */
+ BTO_MACRO /* BTE_MACRODEF */
+};
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_set_filename
+@INPUT : filename
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Sets the current input filename -- used for generating
+ error and warning messages.
+@GLOBALS : InputFilename
+@CALLS :
+@CREATED : Feb 1997, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/* Compiled out: the parse functions in this file assign InputFilename themselves. */
+#if 0
+void bt_set_filename (char *filename)
+{
+ InputFilename = filename;
+}
+#endif
+
+/* ------------------------------------------------------------------------
+@NAME : bt_set_stringopts
+@INPUT : metatype
+ options
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Sets the string-processing options for a particular
+ entry metatype. Used later on by bt_parse_* to determine
+ just how to post-process each particular entry.
+@GLOBALS : StringOptions
+@CREATED : 1997/08/24, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void bt_set_stringopts (bt_metatype metatype, btshort options)
+{
+   /* Only BTE_REGULAR .. BTE_MACRODEF are accepted (BTE_UNKNOWN is not) */
+   if (metatype < BTE_REGULAR || metatype > BTE_MACRODEF)
+      usage_error ("bt_set_stringopts: illegal metatype");
+
+   /* Reject any bit that lies outside the string-option mask */
+   if (options & ~BTO_STRINGMASK)
+      usage_error ("bt_set_stringopts: illegal options "
+                   "(must only set string option bits)");
+
+   /* Remember the options; the parse functions read this table later */
+   StringOptions[metatype] = options;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : start_parse
+@INPUT : infile input stream we'll read from (or NULL if reading
+ from string)
+ instring input string we'll read from (or NULL if reading
+ from stream)
+ line line number of the start of the string (just
+ use 1 if the string is standalone and independent;
+ if it comes from a file, you should supply the
+ line number where it starts for better error
+ messages) (ignored if infile != NULL)
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Prepares things for parsing, in particular initializes the
+ lexical state and lexical buffer, prepares DLG for
+ reading (either from a stream or a string), and reads
+ the first token.
+@GLOBALS :
+@CALLS : initialize_lexer_state()
+ alloc_lex_buffer()
+ zzrdstream() or zzrdstr()
+ zzgettok()
+@CALLERS :
+@CREATED : 1997/06/21, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static void
+start_parse (FILE *infile, char *instring, int line)
+{
+   int   from_stream = (infile != NULL);
+   int   from_string = (instring != NULL);
+
+   /* Exactly one input source must be supplied: not both, not neither */
+   if (from_stream == from_string)
+   {
+      internal_error ("start_parse(): exactly one of infile and "
+                      "instring may be non-NULL");
+   }
+
+   /* Reset the lexer's state and make sure the token buffer exists */
+   initialize_lexer_state ();
+   alloc_lex_buffer (ZZLEXBUFSIZE);
+
+   /* Point DLG at the chosen source; `line' only matters for strings */
+   if (from_stream)
+   {
+      zzrdstream (infile);
+   }
+   else
+   {
+      zzrdstr ((unsigned char*)instring);
+      zzline = line;
+   }
+
+   /* Reset the column counters, then read the first token */
+   zzbegcol = 0;
+   zzendcol = 0;
+   zzgettok ();
+}
+
+
+
+/* ------------------------------------------------------------------------
+@NAME : finish_parse()
+@INPUT : err_counts - pointer to error count list (which is local to
+ the parsing functions, hence has to be passed in)
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Frees up what was needed to parse a whole file or a sequence
+ of strings: the lexical buffer and the error count list.
+@GLOBALS :
+@CALLS : free_lex_buffer()
+@CALLERS :
+@CREATED : 1997/06/21, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static void
+finish_parse (int **err_counts)
+{
+ /* release the lexical buffer obtained via alloc_lex_buffer() */
+ free_lex_buffer ();
+ /* free the caller's error-count list and clear the caller's pointer
+  * so it cannot be freed or reused by accident */
+ free (*err_counts);
+ *err_counts = NULL;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : parse_status()
+@INPUT : saved_counts
+@OUTPUT :
+@RETURNS : false if there were serious errors in the recently-parsed input
+ true otherwise (no errors or just warnings)
+@DESCRIPTION: Gets the "error status" bitmap relative to a saved set of
+ error counts and masks of non-serious errors.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/06/21, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static boolean
+parse_status (int *saved_counts)
+{
+   /* Error classes that do not count as a failed parse */
+   const btshort  trivial_mask =
+      (1<<BTERR_NOTIFY) | (1<<BTERR_CONTENT) | (1<<BTERR_LEXWARN);
+   btshort        serious;
+
+   /*
+    * bt_error_status() yields one bit per error class (relative to the
+    * saved counts); strip the trivial classes and see whether anything
+    * is left over.
+    */
+   serious = bt_error_status (saved_counts) & ~trivial_mask;
+
+   /* success means no serious error bit survived the masking */
+   return serious == 0;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_parse_entry_s()
+@INPUT      : entry_text - string containing the entire entry to parse,
+                           or NULL meaning we're done, please cleanup
+              filename   - stored in the global InputFilename (used when
+                           generating error and warning messages)
+              line       - current line number (if that makes any sense)
+                           -- passed to the parser to set zzline, so that
+                           lexical and syntax errors are properly localized
+              options    - standard btparse options bitmap; must not
+                           contain any string-option bits
+@OUTPUT     : *status    - only written when status is non-NULL:
+                           TRUE  on the cleanup call (entry_text == NULL)
+                           FALSE if the parser produced no AST at all
+                           otherwise the result of parse_status(), ie.
+                           FALSE if any serious errors were seen
+                           (A "serious" error is a lexical or syntax
+                           error; "trivial" errors such as warnings and
+                           notifications count as success.)
+@RETURNS    : newly-allocated AST for the entry (possibly covering only as
+              much of the input as could be parsed), or NULL on the
+              cleanup call or when the parser produced no AST
+@DESCRIPTION: Parses a BibTeX entry contained in a string.
+@GLOBALS    : InputFilename, StringOptions
+@CALLS      : ANTLR
+@CREATED    : 1997/01/18, GPW (from code in bt_parse_entry())
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+AST * bt_parse_entry_s (char *    entry_text,
+                        char *    filename,
+                        int       line,
+                        btshort   options,
+                        boolean * status)
+{
+   AST *        entry_ast = NULL;
+   static int * err_counts = NULL;
+
+   if (options & BTO_STRINGMASK)        /* any string options set? */
+   {
+      usage_error ("bt_parse_entry_s: illegal options "
+                   "(string options not allowed)");
+   }
+
+   InputFilename = filename;
+
+   /* snapshot of the error counters; parse_status() compares against it */
+   err_counts = bt_get_error_counts (err_counts);
+
+   if (entry_text == NULL)              /* signal to clean up */
+   {
+      finish_parse (&err_counts);
+      if (status) *status = TRUE;
+      return NULL;
+   }
+
+   zzast_sp = ZZAST_STACKSIZE;          /* workaround apparent pccts bug */
+
+   start_parse (NULL, entry_text, line);
+
+   entry (&entry_ast);                  /* enter the parser */
+   ++zzasp;                             /* why is this done? */
+
+   if (entry_ast == NULL)               /* can happen with very bad input */
+   {
+      if (status) *status = FALSE;
+      return entry_ast;
+   }
+
+#if DEBUG
+   dump_ast ("bt_parse_entry_s: single entry, after parsing:\n",
+             entry_ast);
+#endif
+   /* per-metatype string options are combined with the caller's options */
+   bt_postprocess_entry (entry_ast,
+                         StringOptions[entry_ast->metatype] | options);
+#if DEBUG
+   dump_ast ("bt_parse_entry_s: single entry, after post-processing:\n",
+             entry_ast);
+#endif
+
+   if (status) *status = parse_status (err_counts);
+   return entry_ast;
+
+} /* bt_parse_entry_s () */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_parse_entry()
+@INPUT : infile - file to read next entry from,
+ or NULL meaning we're done, please cleanup
+ filename - stored in the global InputFilename before parsing
+ options - standard btparse options bitmap (string-option
+ bits are not allowed)
+@OUTPUT : *status - only written when status is non-NULL: TRUE on the
+ cleanup call and at end of file, FALSE if the parser
+ produced no AST, otherwise the result of parse_status()
+@RETURNS : AST for the entry, or NULL on the cleanup call, at end of
+ file, or when the parser produced no AST
+@DESCRIPTION: Starts (or continues) parsing from a file.
+@GLOBALS : InputFilename, StringOptions
+@CALLS : start_parse(), finish_parse(), parse_status()
+@CREATED : Jan 1997, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+AST * bt_parse_entry (FILE * infile,
+ char * filename,
+ btshort options,
+ boolean * status)
+{
+ AST * entry_ast = NULL;
+ static int * err_counts = NULL;
+ static FILE * prev_file = NULL;
+
+ if (options & BTO_STRINGMASK) /* any string options set? */
+ {
+ usage_error ("bt_parse_entry: illegal options "
+ "(string options not allowed)");
+ }
+
+ /* explicit cleanup request from the caller */
+ if (infile == NULL)
+ {
+ if (prev_file != NULL) /* haven't already done the cleanup */
+ {
+ prev_file = NULL;
+ finish_parse (&err_counts);
+ }
+
+ if (status) *status = TRUE;
+ return NULL;
+ }
+
+ /* only one file may be "in progress" at a time (see long comment below) */
+ if (prev_file != NULL && infile != prev_file)
+ {
+ usage_error ("bt_parse_entry: you can't interleave calls "
+ "across different files");
+ }
+
+ InputFilename = filename;
+ err_counts = bt_get_error_counts (err_counts);
+
+ /* end of file: clean up once, warn if called again after that */
+ if (feof (infile))
+ {
+ if (prev_file != NULL) /* haven't already done the cleanup */
+ {
+ prev_file = NULL;
+ finish_parse (&err_counts);
+ }
+ else
+ {
+ usage_warning ("bt_parse_entry: second attempt to read past eof");
+ }
+
+ if (status) *status = TRUE;
+ return NULL;
+ }
+
+ /*
+ * Here we do some nasty poking about the innards of PCCTS in order to
+ * enter the parser multiple times on the same input stream. This code
+ * comes from expanding the macro invocation:
+ *
+ * ANTLR (entry (top), infile);
+ *
+ * When LL_K, ZZINF_LOOK, and DEMAND_LOOK are all undefined, this
+ * ultimately expands to
+ *
+ * zzbufsize = ZZLEXBUFSIZE;
+ * {
+ * static char zztoktext[ZZLEXBUFSIZE];
+ * zzlextext = zztoktext;
+ * zzrdstream (f);
+ * zzgettok();
+ * }
+ * entry (top);
+ * ++zzasp;
+ *
+ * (I'm expanding the zzenterANTLR, zzleaveANTLR, and zzPrimeLookAhead
+ * macros, but leaving ZZLEXBUFSIZE -- a simple constant -- alone.)
+ *
+ * There are two problems with this: 1) zztoktext is a statically
+ * allocated buffer, and when it overflows we just ignore further
+ * characters that should belong to that lexeme; and 2) zzrdstream() and
+ * zzgettok() are called every time we enter the parser, which means the
+ * token left over from the previous entry will be discarded when we
+ * parse entries 2 .. N.
+ *
+ * I handle the static buffer problem with alloc_lex_buffer() and
+ * realloc_lex_buffer() (in lex_auxiliary.c), and by rewriting the ZZCOPY
+ * macro to call realloc_lex_buffer() when overflow is detected.
+ *
+ * I handle the extra token-read by hanging on to a static file
+ * pointer, prev_file, between calls to bt_parse_entry() -- when
+ * the program starts it is NULL, and we reset it to NULL on
+ * finishing a file. Thus, any call that is the first on a given
+ * file will allocate the lexical buffer and read the first token;
+ * thereafter, we skip those steps, and free the buffer on reaching
+ * end-of-file. Currently, this method precludes interleaving
+ * calls to bt_parse_entry() on different files -- perhaps I could
+ * fix this with the zz{save,restore}_{antlr,dlg}_state()
+ * functions?
+ */
+
+ zzast_sp = ZZAST_STACKSIZE; /* workaround apparent pccts bug */
+
+#if defined(LL_K) || defined(ZZINF_LOOK) || defined(DEMAND_LOOK)
+# error One of LL_K, ZZINF_LOOK, or DEMAND_LOOK was defined
+#endif
+ if (prev_file == NULL) /* only read from input stream if */
+ { /* starting afresh with a file */
+ start_parse (infile, NULL, 0);
+ prev_file = infile;
+ }
+ assert (prev_file == infile);
+
+ entry (&entry_ast); /* enter the parser */
+ ++zzasp; /* why is this done? */
+
+ /* NOTE: on this path prev_file stays set, so the lexical buffer is
+ * kept until a later call reaches EOF or passes infile == NULL */
+ if (entry_ast == NULL) /* can happen with very bad input */
+ {
+ if (status) *status = FALSE;
+ return entry_ast;
+ }
+
+#if DEBUG
+ dump_ast ("bt_parse_entry(): single entry, after parsing:\n",
+ entry_ast);
+#endif
+ bt_postprocess_entry (entry_ast,
+ StringOptions[entry_ast->metatype] | options);
+#if DEBUG
+ dump_ast ("bt_parse_entry(): single entry, after post-processing:\n",
+ entry_ast);
+#endif
+
+ if (status) *status = parse_status (err_counts);
+ return entry_ast;
+
+} /* bt_parse_entry() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_parse_file ()
+@INPUT      : filename - name of file to open.  If NULL or "-", we read
+                         from stdin rather than opening a new file.
+              options  - standard btparse options bitmap; must not
+                         contain any string-option bits
+@OUTPUT     : *status  - only written when status is non-NULL:
+                         FALSE if the file could not be opened or if any
+                         entry in it had serious errors, TRUE otherwise
+@RETURNS    : linked list (via the `right' pointers) of ASTs for the
+              entries that parsed without serious errors, or NULL if
+              there are none or the file could not be opened
+@DESCRIPTION: Parses an entire BibTeX file, and returns a linked list
+              of ASTs (or, if you like, a forest) for the entries in it.
+              (Any entries with serious errors are omitted from the list
+              and their ASTs are freed.)
+@GLOBALS    : InputFilename
+@CALLS      : bt_parse_entry(), bt_free_ast()
+@CREATED    : 1997/01/18, from process_file() in bibparse.c
+@MODIFIED   :
+@COMMENTS   : This function bears a *striking* resemblance to bibparse.c's
+              process_file().  Eventually, I plan to replace this with
+              a generalized process_file() that takes a function pointer
+              to call for each entry.  Until I decide on the right interface
+              for that, though, I'm sticking with this simpler (but possibly
+              memory-intensive) approach.
+-------------------------------------------------------------------------- */
+AST * bt_parse_file (char *    filename,
+                     btshort   options,
+                     boolean * status)
+{
+   FILE *   infile;
+   AST *    entries = NULL;             /* head of the list we return */
+   AST *    last = NULL;                /* tail of that list */
+   AST *    cur_entry;
+   boolean  entry_status;
+   boolean  overall_status = TRUE;      /* assume success */
+
+   if (options & BTO_STRINGMASK)        /* any string options set? */
+   {
+      usage_error ("bt_parse_file: illegal options "
+                   "(string options not allowed)");
+   }
+
+   /*
+    * If a string was given, and it's *not* "-", then open that filename.
+    * Otherwise just use stdin.
+    */
+
+   if (filename != NULL && strcmp (filename, "-") != 0)
+   {
+      InputFilename = filename;
+      infile = fopen (filename, "r");
+      if (infile == NULL)
+      {
+         perror (filename);
+         InputFilename = NULL;
+         /* never leave the caller's status flag uninitialised */
+         if (status) *status = FALSE;
+         return NULL;
+      }
+   }
+   else
+   {
+      InputFilename = "(stdin)";
+      infile = stdin;
+   }
+
+   /* explicit loop over entries, one bt_parse_entry() call per entry */
+
+   for (;;)
+   {
+      cur_entry = bt_parse_entry (infile, InputFilename,
+                                  options, &entry_status);
+
+      /*
+       * Fold in the status of *every* call, including the final one that
+       * returns NULL: at EOF that status is TRUE, but when the parser gave
+       * up on very bad input it is FALSE and must not be lost.
+       */
+      overall_status &= entry_status;
+
+      if (cur_entry == NULL)            /* eof, or parser produced nothing */
+         break;
+
+      if (!entry_status)                /* bad entry -- drop it, try next */
+      {
+         bt_free_ast (cur_entry);       /* it's omitted from the list, so */
+         continue;                      /* nobody else could ever free it */
+      }
+
+      if (last == NULL)                 /* this is the first entry */
+         entries = cur_entry;
+      else                              /* have already seen one */
+         last->right = cur_entry;
+
+      last = cur_entry;
+   }
+
+   /*
+    * Ask bt_parse_entry() to clean up.  This is a no-op when it already
+    * cleaned up on reaching EOF, but if the loop ended because the parser
+    * returned no AST, its static file pointer and the lexical buffer are
+    * still live and would make the next file look like an interleaved call.
+    */
+   bt_parse_entry (NULL, NULL, 0, NULL);
+
+   if (infile != stdin)                 /* stdin isn't ours to close */
+      fclose (infile);
+   InputFilename = NULL;
+   if (status) *status = overall_status;
+   return entries;
+
+} /* bt_parse_file() */
diff --git a/btparse/src/lex_auxiliary.c b/btparse/src/lex_auxiliary.c
new file mode 100644
index 0000000..3e84bf0
--- /dev/null
+++ b/btparse/src/lex_auxiliary.c
@@ -0,0 +1,942 @@
+/* ------------------------------------------------------------------------
+@NAME : lex_auxiliary.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: The code and global variables here have three main purposes:
+ - maintain the lexical buffer (zztoktext, which
+ traditionally with PCCTS is a static array; I have
+ changed things so that it's dynamically allocated and
+ resized on overflow)
+ - keep track of lexical state that's not handled by PCCTS
+ code (like "where are we in terms of BibTeX entries?" or
+ "what are the delimiters for the current entry/string?")
+ - everything called from lexical actions is here, to keep
+ the grammar file itself neat and clean
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : Greg Ward, 1996/07/25-28
+@MODIFIED : Jan 1997
+ Jun 1997
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <assert.h>
+#include "lex_auxiliary.h"
+#include "stdpccts.h"
+#include "error.h"
+#include "prototypes.h"
+#include "my_dmalloc.h"
+
+/*
+ * DUPE_TEXT: when non-zero, zzcr_attr() stores a strdup()'d copy of each
+ * token's text and zzd_attr() is compiled in to free it; when zero, the
+ * attribute just points at the text it was given.
+ */
+#define DUPE_TEXT 0
+
+extern char * InputFilename; /* from input.c */
+
+/*
+ * lexical_warning() and lexical_error() are produced by the
+ * GEN_PRIVATE_ERRFUNC macro (defined elsewhere -- presumably error.h;
+ * TODO confirm). Both are called in this file with a printf-style format
+ * and are parameterised with InputFilename and zzline.
+ */
+GEN_PRIVATE_ERRFUNC (lexical_warning, (char * fmt, ...),
+ BTERR_LEXWARN, InputFilename, zzline, NULL, -1, fmt)
+GEN_PRIVATE_ERRFUNC (lexical_error, (char * fmt, ...),
+ BTERR_LEXERR, InputFilename, zzline, NULL, -1, fmt)
+
+
+
+/* ----------------------------------------------------------------------
+ * Global variables
+ */
+
+/*
+ * First, the lexical buffer. This is used elsewhere, so can't be static.
+ * NULL means "not allocated"; it is managed by alloc_lex_buffer(),
+ * realloc_lex_buffer() and free_lex_buffer() below.
+ */
+char * zztoktext = NULL;
+
+/*
+ * Now, the lexical state -- first, stuff that arises from scanning
+ * at top-level and the beginnings of entries;
+ * EntryState:
+ * toplevel when we start scanning a file, or when we are in in_entry
+ * mode and see '}' or ')'
+ * after_at when we are in toplevel mode and see an '@'
+ * after_type when we are in after_at mode and see a name (!= 'comment')
+ * in_comment when we are in after_at mode and see a name (== 'comment')
+ * in_entry when we are in after_type mode and see '{' or '('
+ * EntryOpener:
+ * the character ('(' or '{') which opened the entry currently being
+ * scanned (we use this to make sure that the entry opener and closer
+ * match; if not, we issue a warning)
+ * EntryMetatype: (NB. typedef for bt_metatype is in btparse.h)
+ * classifies entries according to the syntax we will use to parse them;
+ * also winds up (after being changed to a bt_nodetype value) in the
+ * node that roots the entry AST:
+ * comment - anything between () or {}
+ * preamble - a single compound value
+ * string - a list of "name = compound_value" assignments; no key
+ * alias - a single "name = compound_value" assignment (where
+ * the compound value in this case is presumably a
+ * name, rather than a string -- this is not syntactically
+ * checked though)
+ * modify,
+ * entry - a key followed by a list of "name = compound_value"
+ * assignments
+ * JunkCount:
+ * the number of non-whitespace, non-'@' characters seen at toplevel
+ * between two entries (used to print out a warning when we hit
+ * the beginning of an entry, to help people catch "old style" implicit
+ * comments)
+ */
+/* All four variables below are reset together by initialize_lexer_state(). */
+static enum { toplevel, after_at, after_type, in_comment, in_entry }
+ EntryState;
+static char EntryOpener; /* '(' or '{' */
+static bt_metatype
+ EntryMetatype;
+static int JunkCount; /* non-whitespace chars at toplevel */
+
+/*
+ * String state -- these are maintained and used by the functions called
+ * from actions in the string lexer.
+ * BraceDepth:
+ * brace depth within a string; we can only end the current string
+ * when this is zero
+ * ParenDepth:
+ * parenthesis depth within a string; needed for @comment entries
+ * that are paren-delimited (because the comment in that case is
+ * a paren-delimited string)
+ * StringOpener:
+ * similar to EntryOpener, but stronger than merely warning of token
+ * mismatch -- this determines which character ('"' or '}') can
+ * actually end the string
+ * StringStart:
+ * line on which current string started; if we detect an apparent
+ * runaway, this is used to report where the runaway started
+ * ApparentRunaway:
+ * flags if we have already detected (and warned) that the current
+ * string appears to be a runaway, so that we don't warn again
+ * (and again and again and again)
+ *
+ * (See bibtex.g for an explanation of my runaway string detection heuristic.)
+ */
+/*
+ * start_string() (re)initializes all five of these for each new string;
+ * end_string() puts StringOpener and StringStart back to their idle
+ * values ('\0' and -1).
+ */
+static char StringOpener = '\0'; /* '{' or '"' */
+static int BraceDepth; /* depth of brace-nesting */
+static int ParenDepth; /* depth of parenthesis-nesting */
+static int StringStart = -1; /* start line of current string */
+static int ApparentRunaway; /* current string looks like runaway */
+
+/* ----------------------------------------------------------------------
+ * Miscellaneous functions:
+ * lex_info() (handy for debugging)
+ * zzcr_attr() (called from PCCTS-generated code)
+ */
+
+/*
+ * lex_info()
+ *
+ * Debugging aid: prints the text, token number and token name (looked up
+ * in zztokens[]) of the first lookahead token, and of the second one as
+ * well when LL_K is defined.
+ */
+void lex_info (void)
+{
+ printf ("LA(1) = \"%s\" token %d, %s\n", LATEXT(1), LA(1), zztokens[LA(1)]);
+#ifdef LL_K
+ printf ("LA(2) = \"%s\" token %d, %s\n", LATEXT(2), LA(2), zztokens[LA(2)]);
+#endif
+}
+
+
+/*
+ * zzcr_attr()
+ *
+ * Fills in the attribute `a' for a token just recognized: its text, token
+ * number, line (zzline) and offset (zzbegcol).  For STRING tokens the
+ * surrounding delimiters ({...} or "...") are stripped first -- the
+ * closing one is overwritten in place with a NUL, so `txt' is modified.
+ */
+void zzcr_attr (Attrib *a, int tok, char *txt)
+{
+   if (tok == STRING)
+   {
+      int   last = strlen (txt) - 1;    /* index of the closing delimiter */
+
+      /* opener and closer must be a matching {} or "" pair */
+      assert ((txt[0] == '{')
+              ? (txt[last] == '}')
+              : (txt[0] == '\"' && txt[last] == '\"'));
+
+      txt[last] = '\0';                 /* chop off the closing delimiter */
+      txt += 1;                         /* and step over the opening one */
+   }
+
+   a->token  = tok;
+   a->line   = zzline;
+   a->offset = zzbegcol;
+#if DUPE_TEXT
+   a->text = strdup (txt);
+#else
+   a->text = txt;
+#endif
+
+#if DEBUG > 1
+   dprintf ("zzcr_attr: input txt = %p (%s)\n", txt, txt);
+   dprintf (" dupe txt = %p (%s)\n", a->text, a->text);
+#endif
+}
+
+
+/*
+ * zzd_attr(): attribute destructor, only compiled when DUPE_TEXT is set --
+ * frees the text copy that zzcr_attr() made with strdup() in that mode.
+ */
+#if DUPE_TEXT
+void zzd_attr (Attrib *attr)
+{
+ free (attr->text);
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ * Lexical buffer functions:
+ * alloc_lex_buffer()
+ * realloc_lex_buffer()
+ * free_lex_buffer()
+ * lexer_overflow()
+ * zzcopy() (only if ZZCOPY_FUNCTION is defined and true)
+ */
+
+
+/*
+ * alloc_lex_buffer()
+ *
+ * allocates the lexical buffer with `size' characters. Clears the buffer,
+ * points zzlextext at it, and sets zzbufsize to `size'.
+ *
+ * Does nothing if the buffer is already allocated.
+ *
+ * globals: zztoktext, zzlextext, zzbufsize
+ * callers: bt_parse_entry() (in input.c)
+ */
+void alloc_lex_buffer (int size)
+{
+   /* A buffer already exists: leave it (and zzlextext/zzbufsize) alone */
+   if (zztoktext != NULL)
+      return;
+
+   zztoktext = (char *) malloc (size * sizeof (char));
+
+   /* Don't let memset() scribble through a NULL pointer on failure */
+   if (zztoktext == NULL)
+      internal_error ("unable to allocate lexical buffer (%d bytes)", size);
+
+   memset (zztoktext, 0, size);         /* start with an all-NUL buffer */
+   zzlextext = (unsigned char*)zztoktext;
+   zzbufsize = size;
+} /* alloc_lex_buffer() */
+
+
+/*
+ * realloc_lex_buffer()
+ *
+ * Reallocates the lexical buffer -- size is increased by `size_increment'
+ * characters (which could be negative).  Updates all globals that point
+ * to or into the buffer (zzlextext, zzbegexpr, zzendexpr), as well as
+ * zztoktext (the buffer itself) and zzbufsize (the buffer size).  Newly
+ * added space (if any) is cleared to NUL.
+ *
+ * This is only meant to be called (ultimately) from zzgettok(), part of
+ * the DLG code.  (In fact, zzgettok() invokes the ZZCOPY() macro, which
+ * calls lexer_overflow() on buffer overflow, which calls
+ * realloc_lex_buffer().  Whatever.)  The `lastpos' and `nextpos' arguments
+ * correspond, respectively, to a local variable in zzgettok() and a static
+ * global in dlgauto.h (hence really in scan.c).  They both point into
+ * the lexical buffer, so have to be passed by reference here so that
+ * we can update them to point into the newly-reallocated buffer.
+ * `lastpos' may be NULL (it is then left alone); `nextpos' may not.
+ *
+ * Reports an internal error if the buffer was never allocated or if the
+ * reallocation fails.
+ *
+ * globals: zztoktext, zzbufsize, zzlextext, zzbegexpr, zzendexpr
+ * callers: lexer_overflow()
+ */
+static void
+realloc_lex_buffer (int              size_increment,
+                    unsigned char ** lastpos,
+                    unsigned char ** nextpos)
+{
+   int     beg, end, next;
+   int     new_size;
+   char *  new_buffer;
+
+   if (zztoktext == NULL)
+      internal_error ("attempt to reallocate unallocated lexical buffer");
+
+   /*
+    * Convert every pointer into the buffer to an offset *before* calling
+    * realloc(): afterwards the old block may be gone, and doing arithmetic
+    * on pointers into freed storage is not valid.
+    */
+   beg  = zzbegexpr - zzlextext;
+   end  = zzendexpr - zzlextext;
+   next = *nextpos - (unsigned char *) zzlextext;
+
+   /* Keep the old pointer until we know realloc() succeeded */
+   new_size   = zzbufsize + size_increment;
+   new_buffer = (char *) realloc (zztoktext, new_size);
+   if (new_buffer == NULL)
+      internal_error ("unable to reallocate lexical buffer to %d bytes",
+                      new_size);
+   zztoktext = new_buffer;
+
+   /* Only a growing buffer has fresh space to clear */
+   if (size_increment > 0)
+      memset (zztoktext+zzbufsize, 0, size_increment);
+   zzbufsize = new_size;
+
+   /* Re-base everything on the (possibly moved) buffer */
+   zzlextext = (unsigned char*)zztoktext;
+
+   if (lastpos != NULL)
+      *lastpos = zzlextext+zzbufsize-1;
+   zzbegexpr = zzlextext + beg;
+   zzendexpr = zzlextext + end;
+   *nextpos = zzlextext + next;
+
+} /* realloc_lex_buffer() */
+
+
+/*
+ * free_lex_buffer()
+ *
+ * Frees the lexical buffer allocated by alloc_lex_buffer().
+ */
+void free_lex_buffer (void)
+{
+ /* freeing a buffer that isn't there is treated as an internal error */
+ if (zztoktext == NULL)
+ internal_error ("attempt to free unallocated (or already freed) "
+ "lexical buffer");
+
+ free (zztoktext);
+ /* NULL marks the buffer as unallocated so alloc_lex_buffer() can be
+ * called again later */
+ zztoktext = NULL;
+} /* free_lex_buffer() */
+
+
+/*
+ * lexer_overflow()
+ *
+ * Prints a warning and calls realloc_lex_buffer() to increase the size
+ * of the lexical buffer by ZZLEXBUFSIZE (a constant -- hence the buffer
+ * size increases linearly, not exponentially).
+ *
+ * Also prints a couple of lines of useful debugging stuff if DEBUG is true.
+ */
+/*
+ * NOTE: the notification message is currently commented out, so in a
+ * non-DEBUG build this function silently grows the buffer by ZZLEXBUFSIZE
+ * bytes and nothing else. `lastpos' and `nextpos' are passed straight
+ * through to realloc_lex_buffer(), which updates them.
+ */
+void lexer_overflow (unsigned char **lastpos, unsigned char **nextpos)
+{
+#if DEBUG
+ char head[16], tail[16];
+
+ printf ("zzcopy: overflow detected\n");
+ printf (" zzbegcol=%d, zzendcol=%d, zzline=%d\n",
+ zzbegcol, zzendcol, zzline);
+ /* show the first and last 15 characters of the first ZZLEXBUFSIZE bytes */
+ strncpy (head, zzlextext, 15); head[15] = 0;
+ strncpy (tail, zzlextext+ZZLEXBUFSIZE-15, 15); tail[15] = 0;
+ printf (" zzlextext=>%s...%s< (last char=%d (%c))\n",
+ head, tail,
+ zzlextext[ZZLEXBUFSIZE-1], zzlextext[ZZLEXBUFSIZE-1]);
+ printf (" zzchar = %d (%c), zzbegexpr=zzlextext+%d\n",
+ zzchar, zzchar, zzbegexpr-zzlextext);
+#endif
+
+ /* Removed this as it's not that useful to know and is disconcerting
+ for Text::BibTeX users */
+ /* notify ("lexical buffer overflowed (reallocating to %d bytes)",
+ zzbufsize+ZZLEXBUFSIZE); */
+ realloc_lex_buffer (ZZLEXBUFSIZE, lastpos, nextpos);
+
+} /* lexer_overflow () */
+
+
+#if ZZCOPY_FUNCTION
+/*
+ * zzcopy()
+ *
+ * Does the same as the ZZCOPY macro (in lex_auxiliary.h), but as a
+ * function for easier debugging.
+ *
+ * Grows the buffer via lexer_overflow() when *nextpos has reached
+ * *lastpos, then stores zzchar at *nextpos and advances *nextpos by one.
+ * `ovf_flag' is not used by this implementation.
+ *
+ * NOTE(review): the parameters are char ** while lexer_overflow() is
+ * declared with unsigned char ** -- confirm this still compiles cleanly
+ * before enabling ZZCOPY_FUNCTION.
+ */
+void zzcopy (char **nextpos, char **lastpos, int *ovf_flag)
+{
+ if (*nextpos >= *lastpos)
+ {
+ lexer_overflow (lastpos, nextpos);
+ }
+
+ **nextpos = zzchar;
+ (*nextpos)++;
+}
+#endif
+
+
+
+/* ----------------------------------------------------------------------
+ * Report/maintain lexical state
+ * report_state() (only meaningful if DEBUG)
+ * initialize_lexer_state()
+ *
+ * Note that the lexical action functions, below, also fiddle with
+ * the lexical state variables an awful lot.
+ */
+
+#if DEBUG
+/* printable names for the EntryState values, in declaration order */
+char *state_names[] =
+ { "toplevel", "after_at", "after_type", "in_comment", "in_entry" };
+/* NOTE(review): not referenced in the code visible here, and the list
+ * does not line up with the BTE_* metatypes used in this file
+ * (UNKNOWN, REGULAR, COMMENT, PREAMBLE, MACRODEF) -- verify before use */
+char *metatype_names[] =
+ { "unknown", "comment", "preamble", "string", "alias", "modify", "entry" };
+
+/*
+ * report_state(): prints the caller-supplied label `where' along with the
+ * current lexeme, line, column, next token and EntryState.
+ */
+static void
+report_state (char *where)
+{
+ printf ("%s: lextext=%s (line %d, offset %d), token=%d, "
+ "EntryState=%s\n",
+ where, zzlextext, zzline, zzbegcol, NLA,
+ state_names[EntryState]);
+}
+#else
+/* without DEBUG, calls to report_state() expand to nothing */
+# define report_state(where)
+/*
+static void
+report_state (char *where) { }
+*/
+#endif
+
+/*
+ * initialize_lexer_state()
+ *
+ * Puts the lexer back in its top-level state: switches DLG to the START
+ * mode and resets the entry-tracking variables. Called when starting a
+ * parse and again whenever an entry is closed (see rbrace()/rparen()).
+ */
+void initialize_lexer_state (void)
+{
+ zzmode (START);
+ EntryState = toplevel;
+ EntryOpener = (char) 0;
+ EntryMetatype = BTE_UNKNOWN;
+ JunkCount = 0;
+}
+
+
+/*
+ * entry_metatype()
+ *
+ * Accessor for the file-private EntryMetatype, ie. the metatype that
+ * name() assigned for the entry currently being scanned (BTE_UNKNOWN
+ * after initialize_lexer_state()).
+ */
+bt_metatype entry_metatype (void)
+{
+ return EntryMetatype;
+}
+
+
+
+/* ----------------------------------------------------------------------
+ * Lexical actions (START and LEX_ENTRY modes)
+ */
+
+/*
+ * newline ()
+ *
+ * Does everything needed to handle newline outside of a quoted string:
+ * increments line counter and skips the newline.
+ */
+void newline (void)
+{
+ zzline++; /* keep the line counter in step with the input */
+ zzskip(); /* the newline itself is not passed on as a token */
+}
+
+
+/*
+ * comment ()
+ *
+ * Performs exactly the same steps as newline(): bumps the line counter
+ * once and skips the matched text. (Presumably the lexer rule that calls
+ * this consumes the comment through the end of its line -- confirm
+ * against bibtex.g.)
+ */
+void comment (void)
+{
+ zzline++;
+ zzskip();
+}
+
+
+/*
+ * at_sign ()
+ *
+ * Lexical action for '@'.  At top level this begins an entry: the state
+ * becomes after_at, the lexer switches to LEX_ENTRY mode, and any junk
+ * counted since the previous entry is reported and forgotten.  Anywhere
+ * else it only produces a warning.
+ */
+void at_sign (void)
+{
+   if (EntryState != toplevel)
+   {
+      /* not treated as an internal error; just warn */
+      lexical_warning ("\"@\" in strange place -- should get syntax error");
+   }
+   else
+   {
+      EntryState = after_at;
+      zzmode (LEX_ENTRY);
+
+      if (JunkCount > 0)
+      {
+         lexical_warning ("%d characters of junk seen at toplevel", JunkCount);
+         JunkCount = 0;
+      }
+   }
+
+   report_state ("at_sign");
+}
+
+
+/*
+ * toplevel_junk ()
+ *
+ * Adds the length of the current lexeme to JunkCount (reported later by
+ * at_sign()) and skips the lexeme.
+ */
+void toplevel_junk (void)
+{
+ JunkCount += strlen ((const char*)zzlextext);
+ zzskip ();
+}
+
+
+/*
+ * name ()
+ *
+ * Lexical action for a name token.  Directly after '@' the name is the
+ * entry type, which decides EntryMetatype and the next EntryState
+ * (in_comment for "comment", after_type for everything else).  The type
+ * is compared case-insensitively; anything other than "comment",
+ * "preamble" and "string" -- including "alias" and "modify" -- is a
+ * regular entry.  In the after_type, in_comment and in_entry states there
+ * is nothing to do; at top level a name is an internal error.
+ */
+void name (void)
+{
+   report_state ("name (pre)");
+
+   if (EntryState == toplevel)
+   {
+      internal_error ("junk at toplevel (\"%s\")", zzlextext);
+   }
+   else if (EntryState == after_at)
+   {
+      char *  type_name = (char*)zzlextext;
+
+      if (strcasecmp (type_name, "comment") == 0)
+      {
+         EntryMetatype = BTE_COMMENT;
+         EntryState = in_comment;
+      }
+      else
+      {
+         EntryState = after_type;
+
+         if (strcasecmp (type_name, "preamble") == 0)
+            EntryMetatype = BTE_PREAMBLE;
+         else if (strcasecmp (type_name, "string") == 0)
+            EntryMetatype = BTE_MACRODEF;
+         else
+            EntryMetatype = BTE_REGULAR;
+      }
+   }
+
+   report_state ("name (post)");
+
+}
+
+
+/*
+ * lbrace ()
+ *
+ * Lexical action for '{'.  A brace opens an *entry* only right after
+ * '@name' where name is not 'comment' (EntryState == after_type); inside
+ * an entry or a comment it starts a string instead.  Anywhere else it is
+ * merely warned about and left for the parser to reject.
+ */
+void lbrace (void)
+{
+   switch (EntryState)
+   {
+      case in_entry:
+      case in_comment:
+         start_string ('{');
+         break;
+
+      case after_type:
+         EntryState = in_entry;
+         EntryOpener = '{';
+         NLA = ENTRY_OPEN;              /* hand the parser an entry opener */
+         break;
+
+      default:
+         lexical_warning ("\"{\" in strange place -- should get a syntax error");
+         break;
+   }
+
+   report_state ("lbrace");
+}
+
+
+/*
+ * rbrace ()
+ *
+ * Lexical action for '}' outside a string.  Inside an entry it closes the
+ * entry (warning if the entry was opened with '('), turns the token into
+ * ENTRY_CLOSE and resets the lexer to its top-level state.  Anywhere else
+ * it only warns.
+ */
+void rbrace (void)
+{
+   if (EntryState != in_entry)
+   {
+      lexical_warning ("\"}\" in strange place -- should get a syntax error");
+   }
+   else
+   {
+      if (EntryOpener == '(')
+         lexical_warning ("entry started with \"(\", but ends with \"}\"");
+
+      NLA = ENTRY_CLOSE;
+      initialize_lexer_state ();
+   }
+
+   report_state ("rbrace");
+}
+
+
+/*
+ * lparen ()
+ *
+ * Lexical action for '('.  Inside a comment entry it starts a
+ * paren-delimited string; right after the entry type it opens the entry.
+ * Unlike lbrace(), NLA is not reassigned here.  Anywhere else it only
+ * warns.
+ */
+void lparen (void)
+{
+   switch (EntryState)
+   {
+      case in_comment:
+         start_string ('(');
+         break;
+
+      case after_type:
+         EntryState = in_entry;
+         EntryOpener = '(';
+         break;
+
+      default:
+         lexical_warning ("\"(\" in strange place -- should get a syntax error");
+         break;
+   }
+
+   report_state ("lparen");
+}
+
+
+/*
+ * rparen ()
+ *
+ * Lexical action for ')' outside a string.  Inside an entry it closes the
+ * entry (warning if the entry was opened with '{') and resets the lexer
+ * to its top-level state; unlike rbrace(), NLA is not reassigned here.
+ * Anywhere else it only warns.
+ */
+void rparen (void)
+{
+   if (EntryState != in_entry)
+   {
+      lexical_warning ("\")\" in strange place -- should get a syntax error");
+   }
+   else
+   {
+      if (EntryOpener == '{')
+         lexical_warning ("entry started with \"{\", but ends with \")\"");
+
+      initialize_lexer_state ();
+   }
+
+   report_state ("rparen");
+}
+
+
+/* ----------------------------------------------------------------------
+ * Stuff for processing strings.
+ */
+
+
+/*
+ * start_string ()
+ *
+ * Called when we see a '{' or '"' in the field data, or the '{' or '(' that
+ * opens a comment entry. Records which quote character was used, and bumps
+ * the matching depth counter (via open_brace() for '{'). Switches to
+ * LEX_STRING mode, and has the lexer keep slurping into the same buffer.
+ */
+void start_string (char start_char)
+{
+ StringOpener = start_char;
+ BraceDepth = 0;
+ ParenDepth = 0;
+ StringStart = zzline;
+ ApparentRunaway = 0;
+ if (start_char == '{')
+ open_brace ();
+ if (start_char == '(')
+ ParenDepth++;
+ if (start_char == '"' && EntryState == in_comment)
+ {
+ lexical_error ("comment entries must be delimited by either braces or parentheses");
+ EntryState = toplevel;
+ zzmode (START);
+ return;
+ }
+
+#ifdef USER_ZZMODE_STACK
+ if (zzauto != LEX_ENTRY || EntryState != in_entry)
+#else
+ if (EntryState != in_entry && EntryState != in_comment)
+#endif
+ {
+ lexical_warning ("start of string seen at weird place");
+ }
+
+ zzmore ();
+ zzmode (LEX_STRING);
+}
+
+
+/*
+ * end_string ()
+ *
+ * Called when we see either a '"' (at depth 0), or a '}' or ')' that brings
+ * us down to depth 0, in a quoted string. Makes sure that braces are
+ * balanced, then goes back to LEX_ENTRY mode (START after a comment entry).
+ */
+void end_string (char end_char)
+{
+ char match;
+
+#ifndef ALLOW_WARNINGS
+ match = (char) 0; /* silence "might be used" */
+ /* uninitialized" warning */
+#endif
+
+ switch (end_char)
+ {
+ case '}': match = '{'; break;
+ case ')': match = '('; break;
+ case '"': match = '"'; break;
+ default:
+ internal_error ("end_string(): invalid end_char \"%c\"", end_char);
+ }
+
+ assert (StringOpener == match);
+
+ /*
+ * If we're at non-zero BraceDepth, that probably means mismatched braces
+ * somewhere -- complain about it and reset BraceDepth to minimize future
+ * confusion.
+ */
+
+ if (BraceDepth > 0)
+ {
+ lexical_error ("unbalanced braces: too many {'s");
+ BraceDepth = 0;
+ }
+
+ StringOpener = (char) 0;
+ StringStart = -1;
+ NLA = STRING;
+
+ if (EntryState == in_comment)
+ {
+ int len = strlen ((const char*)zzlextext);
+
+ /*
+ * ARG! no, this is wrong -- what if unbalanced braces in the string
+ * and we try to output it later?
+ *
+ * ARG! again, this is no more wrong than when we strip quotes in
+ * post_parse.c, and blithely assume that we can put them back on
+ * later for output in BibTeX syntax. Hmmm.
+ *
+ * Actually, it looks like this isn't a problem after all: you
+ * can't have unbalanced braces in a BibTeX string (at least
+ * not as parsed by btparse).
+ */
+
+ if (zzlextext[0] == '(') /* convert to standard quote delims */
+ {
+ zzlextext[ 0] = '{';
+ zzlextext[len-1] = '}';
+ }
+
+ EntryState = toplevel;
+ zzmode (START);
+ }
+ else
+ {
+ zzmode (LEX_ENTRY);
+ }
+
+ report_state ("string");
+}
+
+
+/*
+ * open_brace ()
+ *
+ * Called when we see a '{', either to start a string (in which case
+ * it's called from start_string()) or inside a string (called directly
+ * from the lexer).
+ */
+void open_brace (void)
+{
+ BraceDepth++;
+ zzmore ();
+ report_state ("open_brace");
+}
+
+
+/*
+ * close_brace ()
+ *
+ * Called when we see a '}' inside a string. Decrements the depth counter
+ * and checks to see if we are down to depth 0, in which case the string is
+ * ended and the current lookahead token is set to STRING. Otherwise,
+ * just tells the lexer to keep slurping characters into the buffer.
+ */
+void close_brace (void)
+{
+ BraceDepth--;
+ if (StringOpener == '{' && BraceDepth == 0)
+ {
+ end_string ('}');
+ }
+
+ /*
+ * This could happen if some bonehead puts an unmatched right-brace
+ * in a quote-delimited string (eg. "Hello}"). To attempt to recover,
+ * we reset the depth to zero and continue slurping into the string.
+ */
+ else if (BraceDepth < 0)
+ {
+ lexical_error ("unbalanced braces: too many }'s");
+ BraceDepth = 0;
+ zzmore ();
+ }
+
+ /* Otherwise, it's just any old right brace in a string -- keep eating */
+ else
+ {
+ zzmore ();
+ }
+ report_state ("close_brace");
+}
+
+
+void lparen_in_string (void)
+{
+ ParenDepth++;
+ zzmore ();
+}
+
+
+void rparen_in_string (void)
+{
+ ParenDepth--;
+ if (StringOpener == '(' && ParenDepth == 0)
+ {
+ end_string (')');
+ }
+ else
+ {
+ zzmore ();
+ }
+}
+
+
+/*
+ * quote_in_string ()
+ *
+ * Called when we see '"' in a string. Ends the string if the quote is at
+ * depth 0 and the string was started with a quote, otherwise instructs the
+ * lexer to continue munching happily along. No warning is printed for an
+ * embedded quote; the only other work done is a sanity check that
+ * StringOpener holds one of the three legal string delimiters.
+ */
+void quote_in_string (void)
+{
+   if (StringOpener == '"' && BraceDepth == 0)
+   {
+      end_string ('"');
+   }
+   else
+   {
+      /*
+       * Just another character as far as the lexer is concerned, but make
+       * sure we really are inside a string first: the opener must be a
+       * quote, a left paren or a left brace.  (A local "at_top" flag used
+       * to be computed here but was never read, so it has been dropped.)
+       */
+
+      if (StringOpener != '"' &&
+          StringOpener != '(' &&
+          StringOpener != '{')
+      {
+         internal_error ("Illegal string opener \"%c\"", StringOpener);
+      }
+
+      zzmore ();
+   }
+}
+
+
+/*
+ * check_runaway_string ()
+ *
+ * Called from the lexer whenever we see a newline in a string. See
+ * bibtex.g for a detailed explanation; basically, this function
+ * looks for an entry start ("@name{") or new field ("name=") immediately
+ * after a newline (with possible whitespace). This is a heuristic
+ * check for runaway strings, under the assumption that text that looks
+ * like a new entry or new field won't actually occur inside a string
+ * very often.
+ */
+void check_runaway_string (void)
+{
+ int len;
+ int i;
+
+ /*
+ * could these be made significantly more efficient by a 256-element
+ * lookup table instead of calling strchr()?
+ */
+ static char *alpha_chars = "abcdefghijklmnopqrstuvwxyz";
+ static char *name_chars = "abcdefghijklmnopqrstuvwxyz0123456789:+/'.-";
+
+ /*
+ * on entry: zzlextext contains the whole string, starting with {
+ * and with newlines/tabs converted to space; zzbegexpr points to
+ * a chunk of the string starting with newline (newlines and
+ * tabs have not yet been converted)
+ */
+
+#if DEBUG > 1
+ printf ("check_runaway_string(): zzline=%d\n", zzline);
+ printf ("zzlextext=>%s<\nzzbegexpr=>%s<\n",
+ zzlextext, zzbegexpr);
+#endif
+
+
+ /*
+ * increment zzline to take the leading newline into account -- but
+ * first a sanity check to be sure that newline is there!
+ */
+
+ if (zzbegexpr[0] != '\n')
+ {
+ lexical_warning ("huh? something's wrong (buffer overflow?) near "
+ "offset %d (line %d)", zzendcol, zzline);
+ /* internal_error ("zzbegexpr (line %d, offset %d-%d, "
+ "text >%s<, expr >%s<)"
+ "should start with a newline",
+ zzline, zzbegcol, zzendcol, zzlextext, zzbegexpr);
+ */
+ }
+ else
+ {
+ zzline++;
+ }
+
+ len = strlen ((const char*)zzbegexpr);
+ for (i = 0; i < len; i++)
+ {
+ /* standardise whitespace (convert all to space). We are not using
+ ctype isspace() as this is unreliable on many modern systems which
+ try to be clever and count as spaces some special things like ASCII
+ 160 (non-breaking space) and 133 (horizontal ellipsis). This breaks
+ lots of Unicode chars as they pass through here. What we mean by
+ "whitespace" is nothing so clever, it's just the usual few ASCII
+ chars that should appear as normal spaces.
+ */
+
+ /* if (isspace (zzbegexpr[i])) */
+ if (zzbegexpr[i] == 9 ||
+ zzbegexpr[i] == 10 ||
+ zzbegexpr[i] == 11 ||
+ zzbegexpr[i] == 12 ||
+ zzbegexpr[i] == 13)
+ zzbegexpr[i] = ' ';
+ }
+
+
+ if (!ApparentRunaway) /* haven't already warned about it */
+ {
+ enum { none, entry, field, giveup } guess;
+
+ i = 1;
+ guess = none;
+ while (i < len && zzbegexpr[i] == ' ') i++;
+
+ if (zzbegexpr[i] == '@')
+ {
+ i++;
+ while (i < len && zzbegexpr[i] == ' ') i++;
+ guess = entry;
+ }
+
+ if (strchr (alpha_chars, tolower (zzbegexpr[i])) != NULL)
+ {
+ while (i < len && strchr (name_chars, tolower (zzbegexpr[i])) != NULL)
+ i++;
+ while (i < len && zzbegexpr[i] == ' ') i++;
+ if (i == len)
+ {
+ guess = giveup;
+ }
+ else
+ {
+ if (guess == entry)
+ {
+ if (zzbegexpr[i] != '{' && zzbegexpr[i] != '(')
+ guess = giveup;
+ }
+ else /* assume it's a field */
+ {
+ if (zzbegexpr[i] == '=')
+ guess = field;
+ else
+ guess = giveup;
+ }
+ }
+ }
+ else /* no name seen after WS or @ */
+ {
+ guess = giveup;
+ }
+
+ if (guess == none)
+ internal_error ("gee, I should have made a guess by now");
+
+ if (guess != giveup)
+ {
+ lexical_warning ("possible runaway string started at line %d",
+ StringStart);
+ ApparentRunaway = 1;
+ }
+ }
+
+ zzmore();
+}
+
diff --git a/btparse/src/lex_auxiliary.h b/btparse/src/lex_auxiliary.h
new file mode 100644
index 0000000..87b94b8
--- /dev/null
+++ b/btparse/src/lex_auxiliary.h
@@ -0,0 +1,71 @@
+/* ------------------------------------------------------------------------
+@NAME : lex_auxiliary.h
+@DESCRIPTION: Macros and function prototypes needed by the lexical scanner.
+ Some of these are called from internal PCCTS code, and some
+ are explicitly called from the lexer actions in bibtex.g.
+@CREATED : Summer 1996, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#ifndef LEX_AUXILIARY_H
+#define LEX_AUXILIARY_H
+
+#include "btparse.h"
+#include "attrib.h"
+
+#define ZZCOPY_FUNCTION 0
+
+#if ZZCOPY_FUNCTION
+#define ZZCOPY zzcopy (&zznextpos, &lastpos, &zzbufovf)
+#else
+#define ZZCOPY \
+ if (zznextpos >= lastpos) \
+ { \
+ lexer_overflow (&lastpos, &zznextpos); \
+ } \
+ *(zznextpos++) = zzchar;
+#endif
+
+
+/* Function prototypes: */
+
+void lex_info (void);
+void zzcr_attr (Attrib *a, int tok, char *txt);
+
+void alloc_lex_buffer (int size);
+void free_lex_buffer (void);
+void lexer_overflow (unsigned char **lastpos, unsigned char **nextpos);
+#if ZZCOPY_FUNCTION
+void zzcopy (char **nextpos, char **lastpos, int *ovf_flag);
+#endif
+
+void initialize_lexer_state (void);
+bt_metatype entry_metatype (void);
+
+void newline (void);
+void comment (void);
+void at_sign (void);
+void toplevel_junk (void);
+void name (void);
+void lbrace (void);
+void rbrace (void);
+void lparen (void);
+void rparen (void);
+
+void start_string (char start_char);
+void end_string (char end_char);
+void open_brace (void);
+void close_brace (void);
+void lparen_in_string (void);
+void rparen_in_string (void);
+void quote_in_string (void);
+void check_runaway_string (void);
+
+#endif /* ! defined LEX_AUXILIARY_H */
diff --git a/btparse/src/line_offsets.c b/btparse/src/line_offsets.c
new file mode 100644
index 0000000..da0b12e
--- /dev/null
+++ b/btparse/src/line_offsets.c
@@ -0,0 +1,91 @@
+/*
+ * line_offsets.c
+ *
+ * Data structure and code for recording the offset (zero-based) into an
+ * input file of every line.
+ *
+ * Problems: what happens at eof? perhaps need special code in lexer...
+ * not tested for large (> 1024 lines) files -- does the array
+ * grow properly?
+ *
+ * GPW 1996/08/29
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h> /* only for dump_line_offsets() */
+
+#include "line_offsets.h"
+
+typedef struct
+{
+ int num_slots, num_lines;
+ int *offsets;
+} line_offsets_t;
+
+static line_offsets_t line_offsets = { 0, 0, NULL };
+
+
+void
+initialize_line_offsets (void)
+{
+
+ /*
+ * If the structure is completely unused (ie. we're starting the first
+ * file) then malloc() it from scratch.
+ */
+
+ if (line_offsets.num_slots == 0)
+ {
+ line_offsets.num_slots = 1024;
+ line_offsets.offsets =
+ (int *) malloc (line_offsets.num_slots * sizeof (int));
+ }
+
+ /* In any case, initialize the array with the offset of line 0,
+ * and chalk up "one line counted" (line 0) so far.
+ */
+
+ line_offsets.offsets[0] = 0;
+ line_offsets.num_lines = 1;
+}
+
+
+void
+record_line_offset (int line, int offset)
+{
+ assert (line_offsets.num_slots > 0); /* make sure the structure has been */
+ /* allocated */
+
+ if (line >= line_offsets.num_slots)
+ {
+ line_offsets.num_slots *= 2;
+ line_offsets.offsets =
+ (int *) realloc ((void *) line_offsets.offsets,
+ line_offsets.num_slots * sizeof (int));
+ }
+
+ assert (line_offsets.num_lines == line);
+ line_offsets.offsets[line] = offset;
+ line_offsets.num_lines++;
+}
+
+
+int
+line_offset (int line)
+{
+ return line_offsets.offsets[line-1];
+}
+
+
+void
+dump_line_offsets (char *filename, FILE *stream)
+{
+ int i;
+
+ fprintf (stream, "Line offsets in %s:\n", filename);
+ fprintf (stream, "%4s %6s\n", "Line", "Offset");
+ for (i = 0; i < line_offsets.num_lines; i++)
+ fprintf (stream, "%4d %6d (%04x)\n",
+ i+1, line_offsets.offsets[i], line_offsets.offsets[i]);
+}
diff --git a/btparse/src/line_offsets.h b/btparse/src/line_offsets.h
new file mode 100644
index 0000000..57af2e0
--- /dev/null
+++ b/btparse/src/line_offsets.h
@@ -0,0 +1,13 @@
+#ifndef LINEDATA_H
+#define LINEDATA_H
+
+#include <stdio.h>
+
+/* Prototypes for functions exported from line_offsets.c: */
+
+void initialize_line_offsets (void);
+void record_line_offset (int line, int offset);
+int line_offset (int line);
+void dump_line_offsets (char *filename, FILE *stream);
+
+#endif
diff --git a/btparse/src/macros.c b/btparse/src/macros.c
new file mode 100644
index 0000000..49bbaa7
--- /dev/null
+++ b/btparse/src/macros.c
@@ -0,0 +1,369 @@
+/* ------------------------------------------------------------------------
+@NAME : macros.c
+@DESCRIPTION: Front-end to the standard PCCTS symbol table code (sym.c)
+ to abstract my "macro table".
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/12, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include "sym.h"
+#include "prototypes.h"
+#include "error.h"
+#include "my_dmalloc.h"
+#include "bt_debug.h"
+
+
+/*
+ * NUM_MACROS and STRING_SIZE define the size of the static data
+ * structure that holds the macro table. The defaults are to allocate
+ * 400960 bytes of string space that will be divided up amongst 5449
+ * macros. This should be fine for most applications, but if you have a
+ * big macro table you might need to change these and recompile (don't
+ * forget to rebuild and reinstall Text::BibTeX if you're using it!).
+ * You can set these as high as you like; just remember that a block of
+ * STRING_SIZE bytes will be allocated and not freed as long as you're
+ * using btparse. Also, NUM_MACROS defines the size of a hashtable, so
+ * it should probably be a prime a bit greater than a power of 2 -- or
+ * something like that. I'm not sure of the exact Knuthian
+ * specification.
+ */
+/* Raised from 547 macros / 4096 bytes to accommodate large string libraries - PK 27/10/2011 */
+
+#define NUM_MACROS 5449
+#define STRING_SIZE 400960
+
+Sym *AllMacros = NULL; /* `scope' so we can get back list */
+ /* of all macros when done */
+
+
+GEN_PRIVATE_ERRFUNC (macro_warning,
+ (char * filename, int line, char * fmt, ...),
+ BTERR_CONTENT, filename, line, NULL, -1, fmt)
+
+
+/* ------------------------------------------------------------------------
+@NAME : init_macros()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Initializes the symbol table used to store macro values.
+@GLOBALS : AllMacros
+@CALLS : zzs_init(), zzs_scope() (sym.c)
+@CALLERS : bt_initialize() (init.c)
+@CREATED : Jan 1997, GPW
+-------------------------------------------------------------------------- */
+void
+init_macros (void)
+{
+ zzs_init (NUM_MACROS, STRING_SIZE);
+ zzs_scope (&AllMacros);
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : done_macros()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Frees up all the macro values in the symbol table, and
+ then frees up the symbol table itself.
+@GLOBALS : AllMacros
+@CALLS : bt_delete_all_macros(), zzs_done()
+@CALLERS : bt_cleanup() (init.c)
+@CREATED : Jan 1997, GPW
+-------------------------------------------------------------------------- */
+void
+done_macros (void)
+{
+ bt_delete_all_macros ();
+ zzs_done ();
+}
+
+
+static void
+delete_macro_entry (Sym * sym)
+{
+ Sym * cur;
+ Sym * prev;
+
+ /*
+ * Yechh! All this mucking about with the scope list really
+ * ought to be handled by the symbol table code. Must write
+ * my own someday.
+ */
+
+ /* Find this entry in the list of all macro table entries */
+ cur = AllMacros;
+ prev = NULL;
+ while (cur != NULL && cur != sym)
+ {
+ prev = cur;
+ cur = cur->scope;
+ }
+
+ if (cur == NULL) /* uh-oh -- wasn't found! */
+ {
+ internal_error ("macro table entry for \"%s\" not found in scope list",
+ sym->symbol);
+ }
+
+ /* Now unlink from the "scope" list */
+ if (prev == NULL) /* it's the head of the list */
+ AllMacros = cur->scope;
+ else
+ prev->scope = cur->scope;
+
+ /* Remove it from the macro hash table */
+ zzs_del (sym);
+
+ /* And finally, free up the entry's text and the entry itself */
+ if (sym->text) free (sym->text);
+ free (sym);
+} /* delete_macro_entry() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_add_macro_value()
+@INPUT : assignment - AST node representing "macro = value"
+ options - string-processing options that were used to
+ process this string after parsing
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Adds a value to the symbol table used for macros.
+
+ If the value was not already post-processed as a macro value
+ (expand macros, paste substrings, but don't collapse
+ whitespace), then this post-processing is done before adding
+ the macro text to the table.
+
+ If the macro is already defined, a warning is printed and
+ the old text is overridden.
+@GLOBALS :
+@CALLS : bt_add_macro_text()
+ bt_postprocess_field()
+@CALLERS : bt_postprocess_entry() (post_parse.c)
+@CREATED : Jan 1997, GPW
+-------------------------------------------------------------------------- */
+void
+bt_add_macro_value (AST *assignment, btshort options)
+{
+ AST * value;
+ char * macro;
+ char * text;
+ boolean free_text;
+
+ if (assignment == NULL || assignment->down == NULL) return;
+ value = assignment->down;
+
+ /*
+ * If the options that were used to process the macro's expansion text
+ * are anything other than BTO_MACRO, then we'll have to do it ourselves.
+ */
+
+ if ((options & BTO_STRINGMASK) != BTO_MACRO)
+ {
+ text = bt_postprocess_field (assignment, BTO_MACRO, FALSE);
+ free_text = TRUE; /* because it's alloc'd by */
+ /* bt_postprocess_field() */
+ }
+ else
+ {
+ /*
+ * First a sanity check to make sure that the presumed post-processing
+ * had the desired effect.
+ */
+
+ if (value->nodetype != BTAST_STRING || value->right != NULL)
+ {
+ internal_error ("add_macro: macro value was not "
+ "correctly preprocessed");
+ }
+
+ text = assignment->down->text;
+ free_text = FALSE;
+ }
+
+ macro = assignment->text;
+ bt_add_macro_text (macro, text, assignment->filename, assignment->line);
+ if (free_text && text != NULL)
+ free (text);
+
+} /* bt_add_macro_value() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_add_macro_text()
+@INPUT : macro - the name of the macro to define
+ text - the macro text
+ filename, line - where the macro is defined; pass NULL
+ for filename if no file, 0 for line if no line number
+ (just used to generate warning message)
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Sets the text value for a macro. If the macro is already
+ defined, a warning is printed and the old value is overridden.
+@GLOBALS :
+@CALLS : zzs_get(), zzs_newadd()
+@CALLERS : bt_add_macro_value()
+ (exported from library)
+@CREATED : 1997/11/13, GPW (from code in bt_add_macro_value())
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_add_macro_text (char * macro, char * text, char * filename, int line)
+{
+ Sym * sym;
+ Sym * new_rec;
+
+#if DEBUG == 1
+ printf ("adding macro \"%s\" = \"%s\"\n", macro, text);
+#elif DEBUG >= 2
+ printf ("add_macro: macro = %p (%s)\n"
+ " text = %p (%s)\n",
+ macro, macro, text, text);
+#endif
+
+ if ((sym = zzs_get (macro)))
+ {
+ macro_warning (filename, line,
+ "overriding existing definition of macro \"%s\"",
+ macro);
+ delete_macro_entry (sym);
+ }
+
+ new_rec = zzs_newadd (macro);
+ new_rec->text = (text != NULL) ? strdup (text) : NULL;
+ DBG_ACTION
+ (2, printf (" saved = %p (%s)\n",
+ new_rec->text, new_rec->text);)
+
+} /* bt_add_macro_text() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_delete_macro()
+@INPUT : macro - name of macro to delete
+@DESCRIPTION: Deletes a macro from the macro table.
+@CALLS : zzs_get()
+@CALLERS :
+@CREATED : 1998/03/01, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_delete_macro (char * macro)
+{
+ Sym * sym;
+
+ sym = zzs_get (macro);
+ if (! sym) return;
+ delete_macro_entry (sym);
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_delete_all_macros()
+@DESCRIPTION: Deletes all macros from the macro table.
+@CALLS : zzs_rmscope()
+@CALLERS : done_macros()
+@CREATED : 1998/03/01, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_delete_all_macros (void)
+{
+ Sym *cur, *next;
+
+ DBG_ACTION (2, printf ("bt_delete_all_macros():\n");)
+
+ /*
+ * Use the current `scope' (same one for all macros) to get access to
+ * a linked list of all macros. Then traverse the list, free()'ing
+ * both the text (which was strdup()'d in bt_add_macro_text(), above) and
+ * the records themselves (which are calloc()'d by zzs_new()).
+ */
+
+ cur = zzs_rmscope (&AllMacros);
+ while (cur != NULL)
+ {
+ DBG_ACTION
+ (2, printf (" freeing macro \"%s\" (%p=\"%s\") at %p\n",
+ cur->symbol, cur->text, cur->text, cur);)
+
+ next = cur->scope;
+ if (cur->text != NULL) free (cur->text);
+ free (cur);
+ cur = next;
+ }
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_macro_length()
+@INPUT      : macro - the macro name
+@OUTPUT     :
+@RETURNS    : length of the macro's text; zero if undefined or text is NULL
+@DESCRIPTION: Returns length of a macro's text.
+@GLOBALS    :
+@CALLS      : zzs_get()
+@CALLERS    : bt_postprocess_value()
+              (exported from library)
+@CREATED    : Jan 1997, GPW
+-------------------------------------------------------------------------- */
+int
+bt_macro_length (char *macro)
+{
+   Sym *sym;
+
+   DBG_ACTION
+      (2, printf ("bt_macro_length: looking up \"%s\"\n", macro);)
+
+   sym = zzs_get (macro);
+   if (sym && sym->text)   /* bt_add_macro_text() stores NULL for NULL text */
+      return strlen (sym->text);
+   else
+      return 0;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_macro_text()
+@INPUT : macro - the macro name
+ filename, line - where the macro was invoked; NULL for
+ `filename' and zero for `line' if not applicable
+@OUTPUT :
+@RETURNS : The text of the macro, or NULL if it's undefined.
+@DESCRIPTION: Fetches a macro's text; prints warning and returns NULL if
+ macro is undefined.
+@CALLS : zzs_get()
+@CALLERS : bt_postprocess_value()
+@CREATED : Jan 1997, GPW
+-------------------------------------------------------------------------- */
+char *
+bt_macro_text (char * macro, char * filename, int line)
+{
+ Sym * sym;
+
+ DBG_ACTION
+ (2, printf ("bt_macro_text: looking up \"%s\"\n", macro);)
+
+ sym = zzs_get (macro);
+ if (!sym)
+ {
+ macro_warning (filename, line, "undefined macro \"%s\"", macro);
+ return NULL;
+ }
+
+ return sym->text;
+}
diff --git a/btparse/src/mode.h b/btparse/src/mode.h
new file mode 100644
index 0000000..25b36ce
--- /dev/null
+++ b/btparse/src/mode.h
@@ -0,0 +1,3 @@
+#define START 0
+#define LEX_ENTRY 1
+#define LEX_STRING 2
diff --git a/btparse/src/modify.c b/btparse/src/modify.c
new file mode 100644
index 0000000..64c8caa
--- /dev/null
+++ b/btparse/src/modify.c
@@ -0,0 +1,75 @@
+/* ------------------------------------------------------------------------
+@NAME : modify.c
+@DESCRIPTION: Routines for modifying the AST for a single entry.
+@GLOBALS :
+@CALLS :
+@CREATED : 1999/11/25, Greg Ward (based on code supplied by
+ Stephane Genaud <genaud@icps.u-strasbg.fr>)
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include "btparse.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_set_text ()
+@INPUT      : node, new_text
+@OUTPUT     : node->text
+@RETURNS    :
+@DESCRIPTION: Replace the text member of an AST node with a duplicate of
+              'new_text' (NULL clears it); the caller still owns 'new_text'.
+              The old text is freed last, so 'new_text' may point into it.
+@GLOBALS    :
+@CALLS      :
+@CALLERS    :
+@CREATED    : 1999/11/25, GPW (from Stephane Genaud)
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+void bt_set_text (AST * node, char * new_text)
+{
+   char * old_text = node->text;   /* freed last: new_text may alias it */
+   node->text = (new_text != NULL) ? strdup (new_text) : NULL;
+   free (old_text);
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_entry_set_key ()
+@INPUT : entry
+ new_key
+@OUTPUT : entry->down->text
+@RETURNS :
+@DESCRIPTION: Changes the key of a regular entry to 'new_key'. If 'entry'
+ is not a regular entry, or if it doesn't already have a child
+ node holding an entry key, bombs via 'usage_error()'.
+ Otherwise a duplicate of 'new_key' is copied into the entry
+ AST (so the caller can free that string without worry).
+@CALLS : bt_set_text ()
+@CREATED : 1999/11/25, GPW (from Stephane Genaud)
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void bt_entry_set_key (AST * entry, char * new_key)
+{
+ if (entry->metatype == BTE_REGULAR &&
+ entry->down && entry->down->nodetype == BTAST_KEY)
+ {
+ bt_set_text (entry->down, new_key);
+ }
+ else
+ {
+ usage_error ("can't set entry key -- not a regular entry, "
+ "or doesn't have a key already");
+ }
+}
diff --git a/btparse/src/my_alloca.h b/btparse/src/my_alloca.h
new file mode 100644
index 0000000..dce8185
--- /dev/null
+++ b/btparse/src/my_alloca.h
@@ -0,0 +1,35 @@
+/* ------------------------------------------------------------------------
+@NAME : my_alloca.h
+@DESCRIPTION: All-out assault at making alloca() available on any Unix
+ platform. Stolen from the GNU Autoconf manual.
+@CREATED : 1997/10/30, Greg Ward
+@VERSION : $Id$
+@COPYRIGHT : This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef MY_ALLOCA_H
+#define MY_ALLOCA_H
+
+#ifdef __GNUC__
+# ifndef alloca
+# define alloca __builtin_alloca
+# endif
+#else
+# if HAVE_ALLOCA_H
+# include <alloca.h>
+# else
+# ifdef _AIX
+# pragma alloca
+# else
+# ifndef alloca /* predefined by HP cc +Olibcalls */
+char *alloca ();
+# endif
+# endif
+# endif
+#endif
+
+#endif /* MY_ALLOCA_H */
diff --git a/btparse/src/my_dmalloc.h b/btparse/src/my_dmalloc.h
new file mode 100644
index 0000000..64b44fa
--- /dev/null
+++ b/btparse/src/my_dmalloc.h
@@ -0,0 +1,17 @@
+/* ------------------------------------------------------------------------
+@NAME : my_dmalloc.h
+@DESCRIPTION: Tiny header file to possibly include <dmalloc.h> (ie. the
+ "real thing"), depending on the DMALLOC preprocessor token.
+@CREATED : 1997/09/06, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+-------------------------------------------------------------------------- */
+
+#ifndef MY_DMALLOC_H
+#define MY_DMALLOC_H
+
+#ifdef DMALLOC
+# include <dmalloc.h>
+#endif
+
+#endif /* MY_DMALLOC_H */
diff --git a/btparse/src/names.c b/btparse/src/names.c
new file mode 100644
index 0000000..affb18b
--- /dev/null
+++ b/btparse/src/names.c
@@ -0,0 +1,916 @@
+/* ------------------------------------------------------------------------
+@NAME : names.c
+@DESCRIPTION: Functions for dealing with BibTeX names and lists of names:
+ bt_split_list
+ bt_split_name
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/05/05, Greg Ward (as string_util.c)
+@MODIFIED : 1997/05/14-05/16, GW: added all the code to split individual
+ names, renamed file to names.c
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "btparse.h"
+#include "prototypes.h"
+#include "error.h"
+#include "my_alloca.h"
+#include "my_dmalloc.h"
+#include "bt_debug.h"
+
+
+#define MAX_COMMAS 2
+
+#define update_depth(s,offs,depth) \
+switch (s[offs]) \
+{ \
+ case '{': depth++; break; \
+ case '}': depth--; break; \
+}
+
+/*
+ * `name_loc' specifies where a name is found -- used for generating
+ * useful warning messages. `line' and `name_num' are both 1-based.
+ */
+typedef struct
+{
+ char * filename; /* file name handed to name_warning() for the message */
+ int line; /* 1-based line number */
+ int name_num; /* 1-based index of the name being split */
+} name_loc;
+
+
+GEN_PRIVATE_ERRFUNC (name_warning, /* reporter used in this file as name_warning (loc, fmt, ...) */
+ (name_loc * loc, char * fmt, ...),
+ BTERR_CONTENT, loc->filename, loc->line,
+ "name", loc->name_num, fmt) /* GEN_PRIVATE_ERRFUNC is defined outside this file (presumably error.h -- TODO confirm) */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_split_list()
+@INPUT : string - string to split up; whitespace must be collapsed
+ eg. by bt_postprocess_string()
+ delim - delimiter to use; must be lowercase and should be
+ free of whitespace (code requires that delimiters
+ in string be surrounded by whitespace)
+ filename - source of string (for warning messages)
+ line - 1-based line number into file (for warning messages)
+ description - what substrings are (eg. "name") (for warning
+ messages); if NULL will use "substring"
+@OUTPUT : (none -- the substrings are delivered through the return value)
+@RETURNS : bt_stringlist holding the substrings; NULL if string is NULL or empty
+@DESCRIPTION: Splits a string using a fixed delimiter, in the BibTeX way:
+ * delimiters at beginning or end of string are ignored
+ * delimiters in string must be surrounded by whitespace
+ * case insensitive
+ * delimiters at non-zero brace depth are ignored
+
+ The substrings are returned in the `items' array of the
+ bt_stringlist, an array of pointers into a duplicate of string
+ (kept in the `string' field). This duplicate copy has been
+ scribbled on such that there is a nul byte at the end of every
+ substring. You should call bt_free_list() to free the duplicate
+ copy of string, the `items' array and the list itself. Do *not* walk over
+ the array free()'ing the substrings yourself, as this is
+ invalid -- they were not malloc()'d!
+@GLOBALS :
+@CALLS :
+@CALLERS : anyone (exported by library)
+@CREATED : 1997/05/05, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Split `string' on the whitespace-delimited, case-insensitive delimiter
+ * `delim' at brace depth zero.  Returns a newly allocated bt_stringlist
+ * (free it with bt_free_list()), or NULL if `string' or `delim' is NULL
+ * or empty, or if memory cannot be allocated.  Empty elements (e.g. from
+ * "and and") are reported through general_error() and stored as NULL.
+ */
+bt_stringlist *
+bt_split_list (char *   string,
+               char *   delim,
+               char *   filename,
+               int      line,
+               char *   description)
+{
+   int    depth;                       /* brace depth */
+   int    i, j;                        /* offset into string and delim */
+   int    inword;                      /* flag: previous char was not a space */
+   int    string_len;
+   int    delim_len;
+   int    maxdiv;                      /* upper limit on no. of divisions */
+   int    maxoffs;                     /* max offset of delim in string */
+   int    numdiv;                      /* number of divisions */
+   int *  start;                       /* start of each division */
+   int *  stop;                        /* stop of each division */
+   bt_stringlist *
+          list;                        /* structure to return */
+
+   if (string == NULL || delim == NULL)
+      return NULL;
+   if (description == NULL)
+      description = "substring";
+
+   string_len = strlen (string);
+   delim_len = strlen (delim);
+
+   /*
+    * An empty string has nothing to split (this used to hit the
+    * internal_error() at the end of the function); an empty delimiter
+    * would divide by zero just below.  Still need a general "detect and
+    * fix unpreprocessed string" -- admittedly a different bug/misfeature.
+    */
+   if (string_len == 0 || delim_len == 0)
+      return NULL;
+
+   maxdiv = (string_len / delim_len) + 1;
+   maxoffs = string_len - delim_len + 1;
+
+   /*
+    * One heap block holds both offset arrays; the size depends on the
+    * length of the caller's string, so it does not belong on the stack.
+    */
+   start = (int *) malloc (2 * (size_t) maxdiv * sizeof (int));
+   if (start == NULL)
+      return NULL;
+   stop = start + maxdiv;
+
+   list = (bt_stringlist *) malloc (sizeof (bt_stringlist));
+   if (list == NULL)
+   {
+      free (start);
+      return NULL;
+   }
+
+   depth = 0;
+   i = j = 0;
+   inword = 1;                          /* so leading delim ignored */
+   numdiv = 0;
+   start[0] = 0;                        /* first substring @ start of string */
+
+   while (i < maxoffs)
+   {
+      /*
+       * Does current char. in string match current char. in delim?
+       * Both sides go through unsigned char: tolower() is undefined for
+       * negative values, which plain char can hold for non-ASCII bytes.
+       */
+      if (depth == 0 && !inword
+          && tolower ((unsigned char) string[i]) == (unsigned char) delim[j])
+      {
+         j++; i++;
+
+         /* have we found an entire delim, followed by a space? */
+         if (j == delim_len && string[i] == ' ')
+         {
+            stop[numdiv] = i - delim_len - 1;
+            start[++numdiv] = ++i;
+            j = 0;
+
+#if DEBUG
+            printf ("found complete delim; i == %d, numdiv == %d: "
+                    "stop[%d] == %d, start[%d] == %d\n",
+                    i, numdiv,
+                    numdiv-1, stop[numdiv-1],
+                    numdiv, start[numdiv]);
+#endif
+         }
+      }
+
+      /* no match between string and delim, at non-zero depth, or in a word */
+      else
+      {
+         update_depth (string, i, depth);
+         inword = (i < string_len) && (string[i] != ' ');
+         i++;
+         j = 0;
+      }
+   }
+
+   stop[numdiv] = string_len;           /* last substring ends just past eos */
+   list->num_items = numdiv+1;
+
+   /*
+    * OK, now we know how many divisions there are and where they are --
+    * so let's split that string up for real!
+    *
+    * list->items will be an array of pointers into a duplicate of
+    * `string'; we duplicate `string' so we can safely scribble on it and
+    * free() it later (in bt_free_list()).
+    */
+
+   list->items = (char **) malloc (list->num_items * sizeof (char *));
+   list->string = strdup (string);
+   if (list->items == NULL || list->string == NULL)
+   {
+      free (list->items);
+      free (list->string);
+      free (list);
+      free (start);
+      return NULL;
+   }
+
+   for (i = 0; i < list->num_items; i++)
+   {
+      /*
+       * Possible cases:
+       *   - stop < start is for empty elements, e.g. "and and" seen in
+       *     input.  (`start' for empty element will be the 'a' of the
+       *     second 'and', and its stop will be the ' ' *before* the
+       *     second 'and'.)
+       *   - stop > start is for anything else between two and's (the usual)
+       *   - stop == start should never happen if the loop above is correct
+       */
+
+      if (stop[i] > start[i])           /* the usual case */
+      {
+         list->string[stop[i]] = 0;
+         list->items[i] = list->string+start[i];
+      }
+      else if (stop[i] < start[i])      /* empty element */
+      {
+         list->items[i] = NULL;
+         general_error (BTERR_CONTENT, filename, line,
+                        description, i+1, "empty %s", description);
+      }
+      else                              /* should not happen! */
+      {
+         list->items[i] = NULL;         /* never leave the slot undefined */
+         internal_error ("stop == start for substring %d", i);
+      }
+   }
+
+   free (start);                        /* also releases `stop' */
+   return list;
+
+} /* bt_split_list () */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_free_list()
+@INPUT : list
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Frees the list of strings created by bt_split_list().
+@GLOBALS :
+@CALLS :
+@CALLERS : anyone (exported by library)
+@CREATED : 1997/05/06, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void bt_free_list (bt_stringlist *list)
+{
+   /* nothing to release for a NULL list */
+   if (list == NULL)
+      return;
+
+   /* the private copy of the split string, if there is one */
+   if (list->string != NULL)
+      free (list->string);
+
+   /* the array of pointers into that copy, if there is one */
+   if (list->items != NULL)
+      free (list->items);
+
+   /* and finally the list structure itself */
+   free (list);
+}
+
+
+
+/* ----------------------------------------------------------------------
+ * Stuff for splitting up a single name
+ */
+
+
+/* ------------------------------------------------------------------------
+@NAME : find_commas
+@INPUT : name - string to search for commas
+ max_commas - maximum number of commas to allow (if more than
+ this number are seen, a warning is printed and
+ the excess commas are removed)
+@OUTPUT :
+@RETURNS : number of commas found
+@DESCRIPTION: Counts and records positions of commas at brace-depth 0.
+ Modifies string in-place to remove whitespace around commas,
+ excess commas, and any trailing commas; warns on excess or
+ trailing commas. Excess commas are removed by replacing them
+ with space and calling bt_postprocess_string() to collapse
+ whitespace a second time; trailing commas are simply replaced
+ with (char) 0 to truncate the string.
+
+ Assumes whitespace has been collapsed (ie. no space at
+ beginning or end of string, and all internal strings of
+ whitespace reduced to exactly one space).
+@GLOBALS :
+@CALLS : name_warning() (if too many commas, or commas at end)
+@CALLERS : bt_split_name()
+@CREATED : 1997/05/14, Greg Ward
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Count the commas at brace depth 0 in `name' and tidy the string in
+ * place: commas beyond `max_commas' are blanked (with one warning),
+ * whitespace around the remaining commas is removed, and trailing commas
+ * are cut off (with a warning).  Returns the number of commas left.
+ */
+static int
+find_commas (name_loc * loc, char *name, int max_commas)
+{
+   int      i, j;                      /* read and write offsets into name */
+   int      depth;
+   int      num_commas;
+   int      len;
+   boolean  at_comma;
+   boolean  warned;
+
+   depth = 0;
+   num_commas = 0;
+   len = strlen (name);
+   warned = 0;
+
+   /* First pass to check for and blank out excess commas */
+
+   for (i = 0; i < len; i++)
+   {
+      update_depth (name, i, depth);
+      if (depth == 0 && name[i] == ',')
+      {
+         num_commas++;
+         if (num_commas > max_commas)
+         {
+            if (! warned)
+            {
+               name_warning (loc, "too many commas in name (removing extras)");
+               warned = TRUE;
+            }
+            name[i] = ' ';
+         }
+      }
+   }
+
+   /*
+    * If we blanked out a comma, better re-collapse whitespace.  That can
+    * shorten the string, so the length has to be measured again --
+    * otherwise the second pass walks over the stale bytes behind the new
+    * terminator and the trailing-comma check looks at the wrong place.
+    */
+
+   if (warned)
+   {
+      bt_postprocess_string (name, 1);
+      len = strlen (name);
+   }
+
+   /* Now the real comma-finding loop (only if necessary) */
+
+   if (num_commas == 0)
+      return 0;
+
+   num_commas = 0;
+   depth = 0;                           /* second pass starts from scratch */
+   i = j = 0;
+   while (i < len)
+   {
+      at_comma = (depth == 0 && name[i] == ',');
+      if (at_comma)
+      {
+         /* drop spaces already copied in front of the comma */
+         while (j > 0 && name[j-1] == ' ') j--;
+         num_commas++;
+      }
+
+      update_depth (name, i, depth);
+      if (i != j)
+         name[j] = name[i];
+
+      i++; j++;
+      if (at_comma)
+      {
+         /* skip spaces following the comma */
+         while (i < len && name[i] == ' ') i++;
+      }
+   } /* while i */
+
+   if (i != j) name[j] = (char) 0;
+   j--;
+
+   /*
+    * Strip trailing commas.  `j >= 0' keeps the scan inside the buffer
+    * when the name consists of nothing but commas.
+    */
+   if (j >= 0 && name[j] == ',')
+   {
+      name_warning (loc, "comma(s) at end of name (removing)");
+      while (j >= 0 && name[j] == ',')
+      {
+         name[j--] = (char) 0;
+         num_commas--;
+      }
+   }
+
+   return num_commas;
+
+} /* find_commas() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : find_tokens
+@INPUT : name - string to tokenize (should be a private copy
+ that we're free to clobber and mangle)
+@OUTPUT : comma_token- number of token immediately preceding each comma
+ (caller must allocate with at least one element
+ per comma in `name')
+@RETURNS : newly-allocated bt_stringlist structure
+@DESCRIPTION: Finds tokens in a string; delimiter is space or comma at
+ brace-depth zero. Assumes whitespace has been collapsed
+ and find_commas has been run on the string to remove
+ whitespace around commas and any trailing commas.
+
+ The bt_stringlist structure returned can (and should) be
+ freed with bt_free_list().
+@GLOBALS :
+@CALLS :
+@CALLERS : bt_split_name()
+@CREATED : 1997/05/14, Greg Ward
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Tokenize `name' in place: every space or comma at brace depth 0 is
+ * overwritten with a nul byte and the start of each token is recorded in
+ * the returned list.  For each comma, the number of the token preceding
+ * it is stored in `comma_token' (the caller provides room for one entry
+ * per comma).  The returned list takes `name' as its `string' field, so
+ * bt_free_list() on the result frees `name' as well.
+ */
+static bt_stringlist *
+find_tokens (char *  name,
+             int *   comma_token)
+{
+   int    i;                           /* index into name */
+   int    num_tok;
+   int    in_boundary;                 /* previous char was ' ' or ',' */
+   int    cur_comma;                   /* index into comma_token */
+   int    len;
+   int    depth;
+   bt_stringlist *
+          tokens;
+
+   i = 0;
+   in_boundary = 1;                     /* so first char will start a token */
+   cur_comma = 0;
+   len = strlen (name);
+   depth = 0;
+
+   tokens = (bt_stringlist *) malloc (sizeof (bt_stringlist));
+   /* tokens->string = name ? strdup (name) : NULL; */
+   tokens->string = name;
+   num_tok = 0;
+   tokens->items = NULL;
+   tokens->num_items = 0;               /* defined even on the early return */
+
+   if (len == 0)                        /* empty string? */
+      return tokens;                    /* return empty token list */
+
+   /* at most one token per character, so `len' slots always suffice */
+   tokens->items = (char **) malloc (sizeof (char *) * len);
+
+   while (i < len)
+   {
+      if (depth == 0 && in_boundary)    /* at start of a new token */
+      {
+         tokens->items[num_tok++] = name+i;
+      }
+
+      if (depth == 0 && (name[i] == ' ' || name[i] == ','))
+      {
+         /* if we're at a comma, record the token preceding the comma */
+
+         if (name[i] == ',')
+         {
+            comma_token[cur_comma++] = num_tok-1;
+         }
+
+         /*
+          * if already in a boundary zone, we have an empty token
+          * (caused by multiple consecutive commas)
+          */
+         if (in_boundary)
+         {
+            tokens->items[num_tok-1] = NULL;
+         }
+
+         /* in any case, mark the end of one token and prepare for the
+          * start of the next
+          */
+         name[i] = (char) 0;
+         in_boundary = 1;
+      }
+      else
+      {
+         in_boundary = 0;               /* inside a token */
+      }
+
+      update_depth (name, i, depth);
+      i++;
+
+   } /* while i */
+
+   tokens->num_items = num_tok;
+   return tokens;
+
+} /* find_tokens() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : find_lc_tokens()
+@INPUT : tokens
+@OUTPUT : first_lc
+ last_lc
+@RETURNS :
+@DESCRIPTION: Finds the first contiguous run of lowercase tokens in
+ `tokens', the token list produced by find_tokens(). On
+ return *first_lc and *last_lc hold the token numbers of the
+ first and last token of that run, or -1 if the list contains
+ no lowercase token.
+@GLOBALS :
+@CALLS :
+@CALLERS : bt_split_name()
+@CREATED : 1997/05/14, Greg Ward
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Locate the first contiguous run of lowercase tokens in `tokens'.
+ * *first_lc and *last_lc receive the token numbers of the first and last
+ * token of the run; both are -1 if there is no lowercase token.  Empty
+ * (NULL) tokens never count as lowercase and therefore end a run.  The
+ * index is always checked before the token array is read, so nothing
+ * past items[num_items-1] is touched.
+ */
+static void
+find_lc_tokens (bt_stringlist * tokens,
+                int *           first_lc,
+                int *           last_lc)
+{
+   int    i;                           /* iterate over token list this time */
+
+   *first_lc = *last_lc = -1;           /* haven't found either yet */
+
+   /* skip ahead to the first lowercase token, if any */
+   for (i = 0; i < tokens->num_items; i++)
+   {
+      if (tokens->items[i] != NULL && isulower (tokens->items[i]))
+         break;
+   }
+
+   if (i >= tokens->num_items)          /* no lowercase token at all */
+      return;
+
+   *first_lc = i;
+
+   /* extend the run for as long as the tokens stay lowercase */
+   for (i++; i < tokens->num_items; i++)
+   {
+      if (tokens->items[i] == NULL || ! isulower (tokens->items[i]))
+         break;
+   }
+
+   *last_lc = i-1;
+
+} /* find_lc_tokens() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : resolve_token_range()
+@INPUT : tokens - structure containing the token list
+ tok_range - two-element array with start and stop token number
+@OUTPUT : *part - set to point to first token in range, or NULL
+ if empty range
+ *num_tok - number of tokens in the range
+@RETURNS :
+@DESCRIPTION: Given a list of tokens and a range of token numbers (as a
+ two-element array, tok_range), computes the number of tokens
+ in the range. If this is > 0, sets *part to point
+ to the first token in the range; otherwise, sets *part
+ to NULL.
+@CALLERS :
+@CREATED : May 1997, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Turn the inclusive token range tok_range[0]..tok_range[1] into a
+ * pointer/count pair: *part points at the first token of the range and
+ * *num_tok is the number of tokens in it.  An empty or inverted range
+ * gives *part == NULL and *num_tok == 0.
+ */
+static void
+resolve_token_range (bt_stringlist *tokens,
+                     int *          tok_range,
+                     char ***       part,
+                     int *          num_tok)
+{
+   int count = (tok_range[1] - tok_range[0]) + 1;
+
+   if (count > 0)
+   {
+      *num_tok = count;
+      *part = tokens->items + tok_range[0];
+      return;
+   }
+
+   /* nothing in the range */
+   *num_tok = 0;
+   *part = NULL;
+
+} /* resolve_token_range() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : split_simple_name()
+@INPUT : name
+ first_lc
+ last_lc
+@OUTPUT : name
+@RETURNS :
+@DESCRIPTION: Splits up a name (represented as a string divided into
+ non-overlapping, whitespace-separated tokens) according
+ to the BibTeX rules for names without commas. Specifically:
+ * tokens up to (but not including) the first lowercase
+ token, or the last token of the string if there
+ are no lowercase tokens, become the `first' part
+ * the earliest contiguous sequence of lowercase tokens,
+ up to (but not including) the last token of the string,
+ becomes the `von' part
+ * the tokens following the `von' part, or the last
+ single token if there is no `von' part, become
+ the `last' part
+ * there is no `jr' part
+@GLOBALS :
+@CALLS : name_warning() (if last lc token taken as lastname)
+ resolve_token_range()
+@CALLERS : bt_split_name()
+@CREATED : 1997/05/15, Greg Ward
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Assign the tokens of a comma-free name to the first/von/last parts
+ * (there is never a jr part).  first_lc/last_lc delimit the first run of
+ * lowercase tokens, or are -1 if there is none.
+ */
+static void
+split_simple_name (name_loc * loc,
+                   bt_name *  name,
+                   int        first_lc,
+                   int        last_lc)
+{
+   int    given[2], particle[2], family[2];   /* first, von, last ranges */
+   int    last_tok;
+
+   last_tok = name->tokens->num_items-1;      /* number of the final token */
+
+   if (first_lc < 0)
+   {
+      /* no lowercase tokens: no von; the final token alone is the last
+       * name and whatever precedes it is the first name */
+      particle[0] = 0;
+      particle[1] = -1;
+      given[0] = 0;
+      given[1] = last_tok-1;
+      family[0] = family[1] = last_tok;
+   }
+   else
+   {
+      if (last_lc == last_tok)
+      {
+         /* the lowercase run reaches the end of the name: give its final
+          * token back so that a last name remains */
+         last_lc--;
+#ifdef WARN_LC_LASTNAME
+         /*
+          * warning disabled for now because "others" is used fairly
+          * often as a name in BibTeX databases
+          */
+         name_warning (loc,
+                       "no capitalized token at end of name; "
+                       "using \"%s\" as lastname",
+                       name->tokens->items[last_tok]);
+#else
+# ifndef ALLOW_WARNINGS
+         loc = NULL;                    /* avoid "unused parameter" warning */
+# endif
+#endif
+      }
+
+      given[0] = 0;                     /* first: everything before the run */
+      given[1] = first_lc-1;
+      particle[0] = first_lc;           /* von: the lowercase run itself */
+      particle[1] = last_lc;
+      family[0] = last_lc+1;            /* last: everything after the run */
+      family[1] = last_tok;
+   }
+
+   resolve_token_range (name->tokens, given,
+                        name->parts+BTN_FIRST, name->part_len+BTN_FIRST);
+   resolve_token_range (name->tokens, particle,
+                        name->parts+BTN_VON, name->part_len+BTN_VON);
+   resolve_token_range (name->tokens, family,
+                        name->parts+BTN_LAST, name->part_len+BTN_LAST);
+
+   /* a name without commas cannot carry a jr part */
+   name->parts[BTN_JR] = NULL;
+   name->part_len[BTN_JR] = 0;
+
+} /* split_simple_name() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : split_general_name()
+@INPUT : name
+ num_commas
+ comma_token
+ first_lc
+ last_lc
+@OUTPUT : name
+@RETURNS :
+@DESCRIPTION: Splits a name according to the BibTeX rules for names
+ with 1 or 2 commas (> 2 commas is handled elsewhere,
+ namely by bt_split_name() calling find_commas() with
+ max_commas == 2). Specifically:
+ * an initial string of lowercase tokens, up to (but not
+ including) the token before the first comma, becomes
+ the `von' part
+ * tokens from immediately after the `von' part,
+ or from the beginning of the string if no `von',
+ up to the first comma become the `last' part
+
+ if one comma:
+ * all tokens following the sole comma become the
+ `first' part
+
+ if two commas:
+ * tokens between the two commas become the `jr' part
+ * all tokens following the second comma become the
+ `first' part
+@GLOBALS :
+@CALLS : name_warning() (if last lc token taken as lastname)
+ resolve_token_range()
+@CALLERS : bt_split_name()
+@CREATED : 1997/05/15, Greg Ward
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Assign the tokens of a name containing one or two commas to the
+ * von/last/jr/first parts.  comma_token[n] is the number of the token
+ * preceding comma n; first_lc/last_lc delimit the first run of lowercase
+ * tokens (-1 if none).
+ */
+static void
+split_general_name (name_loc * loc,
+                    bt_name *  name,
+                    int        num_commas,
+                    int *      comma_token,
+                    int        first_lc,
+                    int        last_lc)
+{
+   int    first_t[2], von_t[2], last_t[2], jr_t[2];
+   int    end;
+
+   (void) loc;                          /* only needed by the disabled warning */
+
+   end = name->tokens->num_items-1;     /* last token number */
+
+   if (first_lc == 0)                   /* we have an initial string of */
+   {                                    /* lowercase tokens */
+      /*
+       * The lowercase run is found without regard to commas, so it may
+       * end at the token before the first comma or run on past it.
+       * Either way the `von' part has to stop one token short of the
+       * comma, so that a `last' part remains and `von' never overlaps
+       * the parts that follow the comma.
+       */
+      if (last_lc >= comma_token[0])
+      {
+         /* No longer very useful as a warning since we can have Unicode bits now
+            and so "capitalised" doesn't really have consistent meaning any more */
+         /* name_warning (loc, "no capitalized tokens before first comma"); */
+         last_lc = comma_token[0] - 1;
+      }
+
+      von_t[0] = first_lc;              /* `von' covers the sequence of */
+      von_t[1] = last_lc;               /* lowercase tokens */
+   }
+   else                                 /* no lowercase tokens at start */
+   {
+      von_t[0] = 0;                     /* empty `von' part */
+      von_t[1] = -1;
+   }
+
+   last_t[0] = von_t[1] + 1;            /* start right after end of `von' */
+   last_t[1] = comma_token[0];          /* and end at first comma */
+
+   if (num_commas == 1)
+   {
+      first_t[0] = comma_token[0]+1;    /* start right after comma */
+      first_t[1] = end;                 /* stop at end of string */
+      jr_t[0] = 0;                      /* empty `jr' part */
+      jr_t[1] = -1;
+   }
+   else                                 /* more than 1 comma */
+   {
+      jr_t[0] = comma_token[0]+1;       /* start after first comma */
+      jr_t[1] = comma_token[1];         /* stop at second comma */
+      first_t[0] = comma_token[1]+1;    /* start after second comma */
+      first_t[1] = end;                 /* and go to end */
+   }
+
+   resolve_token_range (name->tokens, first_t,
+                        name->parts+BTN_FIRST, name->part_len+BTN_FIRST);
+   resolve_token_range (name->tokens, von_t,
+                        name->parts+BTN_VON, name->part_len+BTN_VON);
+   resolve_token_range (name->tokens, last_t,
+                        name->parts+BTN_LAST, name->part_len+BTN_LAST);
+   resolve_token_range (name->tokens, jr_t,
+                        name->parts+BTN_JR, name->part_len+BTN_JR);
+
+} /* split_general_name() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_split_name()
+@INPUT : name
+ filename
+ line
+ name_num
+@OUTPUT :
+@RETURNS : newly-allocated bt_name structure containing the four
+ parts as token-lists
+@DESCRIPTION: Splits a name according to the BibTeX rules. There are
+ actually two sets of rules: one for names with no commas,
+ and one for names with 1 or 2 commas. (If a name has
+ more than 2 commas, the extras are removed and it's treated
+ as though it had just the first 2.)
+
+ See split_simple_name() for the no-comma rules, and
+ split_general_name() for the 1-or-2-commas rules.
+
+ The bt_name structure returned can (and should) be freed
+ with bt_free_name() when you no longer need it.
+@GLOBALS :
+@CALLS :
+@CALLERS : anyone (exported by library)
+@CREATED : 1997/05/14, Greg Ward
+@MODIFIED :
+@COMMENTS : The name-splitting code all implicitly assumes that the
+ string being split has been post-processed to collapse
+ whitespace in the BibTeX way. This means that it tends to
+ dump core on such things as leading whitespace, or more than
+ one space in a row inside the string. This could probably be
+ alleviated with a call to bt_postprocess_string(), possibly
+ preceded by a check for any of those occurrences. Before
+ doing that, though, I want to examine the code carefully to
+ determine just what assumptions it makes -- so I can
+ check/correct for all of them.
+-------------------------------------------------------------------------- */
+/*
+ * Split `name' into its first/von/last/jr parts and return them in a
+ * newly allocated bt_name (NULL only if that allocation fails).  The
+ * input string is never modified: all work happens on a private copy,
+ * which is owned by the result's token list.  Every path leaves the
+ * `tokens' field defined -- NULL when nothing was tokenized -- so the
+ * result can always be handed to bt_free_name().  A NULL or empty name,
+ * and a name that is empty once commas have been dealt with, yield a
+ * structure whose parts are all NULL with length 0.
+ */
+bt_name *
+bt_split_name (char *  name,
+               char *  filename,
+               int     line,
+               int     name_num)
+{
+   name_loc  loc;
+   bt_stringlist *
+             tokens;
+   int       comma_token[MAX_COMMAS];
+   int       num_commas;
+   int       first_lc, last_lc;
+   bt_name * split_name;
+   int       i;
+
+   DBG_ACTION (1, printf ("bt_split_name(): name=%p (%s)\n",
+                          (void *) name, name ? name : "(null)"))
+
+   split_name = (bt_name *) malloc (sizeof (bt_name));
+   if (split_name == NULL)
+      return NULL;
+
+   DBG_ACTION (1, printf ("bt_split_name(): split_name=%p\n",
+                          (void *) split_name))
+
+   /* start from a completely empty result; the paths below only fill in */
+   split_name->tokens = NULL;
+   for (i = 0; i < BT_MAX_NAMEPARTS; i++)
+   {
+      split_name->parts[i] = NULL;
+      split_name->part_len[i] = 0;
+   }
+
+   if (name == NULL || name[0] == (char) 0)   /* non-existent or empty? */
+      return split_name;
+
+   name = strdup (name);                /* private copy that we may clobber */
+   if (name == NULL)
+      return split_name;
+
+   loc.filename = filename;             /* so called functions can generate */
+   loc.line = line;                     /* decent warning messages */
+   loc.name_num = name_num;
+
+   num_commas = find_commas (&loc, name, MAX_COMMAS);
+   assert (num_commas <= MAX_COMMAS);
+
+   DBG_ACTION (1, printf ("found %d commas: ", num_commas))
+
+   /*
+    * The token list takes over the private copy of the name; hanging it
+    * on the result right away means bt_free_name() releases both, even
+    * when the name turns out to be empty below.
+    */
+   tokens = find_tokens (name, comma_token);
+   split_name->tokens = tokens;
+
+   /*
+    * Tokenizing overwrites delimiters with nul bytes, so the copy now
+    * reads as empty if find_commas() truncated it to nothing or if it
+    * started with a delimiter.  Such a name keeps its all-empty parts.
+    */
+   if (name[0] != (char) 0)
+   {
+#if DEBUG
+      printf ("found %d tokens:\n", tokens->num_items);
+      for (i = 0; i < tokens->num_items; i++)
+      {
+         printf ("  %d: ", i);
+
+         if (tokens->items[i])          /* non-empty token? */
+         {
+            printf (">%s<\n", tokens->items[i]);
+         }
+         else
+         {
+            printf ("(empty)\n");
+         }
+      }
+
+      printf ("comma tokens: ");
+      for (i = 0; i < num_commas; i++)
+         printf ("%d ", comma_token[i]);
+      printf ("\n");
+#endif
+
+      find_lc_tokens (tokens, &first_lc, &last_lc);
+#if DEBUG
+      printf ("(first,last) lc tokens = (%d,%d)\n", first_lc, last_lc);
+#endif
+
+      if (num_commas == 0)              /* no commas -- "simple" format */
+      {
+         split_simple_name (&loc, split_name,
+                            first_lc, last_lc);
+      }
+      else
+      {
+         split_general_name (&loc, split_name,
+                             num_commas, comma_token,
+                             first_lc, last_lc);
+      }
+   }
+
+#if DEBUG
+   printf ("bt_split_name(): returning structure %p\n", (void *) split_name);
+#endif
+   return split_name;
+} /* bt_split_name() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_free_name()
+@INPUT : name
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Frees up any memory allocated for a bt_name structure
+ (namely, the `tokens' field [a bt_stringlist structure,
+ this freed with bt_free_list()] and the structure itself.)
+@CALLS : bt_free_list()
+@CALLERS : anyone (exported)
+@CREATED : 1997/11/14, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+/*
+ * Release a bt_name: its token list (through bt_free_list(), which
+ * accepts NULL) and then the structure itself.  A NULL `name' is
+ * ignored, like free(NULL).  The debug trace guards every pointer it
+ * prints, since the token list and the `last' part may be absent.
+ */
+void
+bt_free_name (bt_name * name)
+{
+   if (name == NULL)
+      return;
+
+   DBG_ACTION (2, printf ("bt_free_name(): freeing name %p "
+                          "(%d tokens, string=%p (%s), last[0]=%s)\n",
+                          (void *) name,
+                          name->tokens ? name->tokens->num_items : 0,
+                          (void *) (name->tokens ? name->tokens->string : NULL),
+                          (name->tokens && name->tokens->string)
+                             ? name->tokens->string : "(null)",
+                          (name->parts[BTN_LAST] && name->parts[BTN_LAST][0])
+                             ? name->parts[BTN_LAST][0] : "(null)"));
+   bt_free_list (name->tokens);
+   free (name);
+   DBG_ACTION (2, printf ("bt_free_name(): done, everything freed\n"));
+}
diff --git a/btparse/src/parse_auxiliary.c b/btparse/src/parse_auxiliary.c
new file mode 100644
index 0000000..6584bdd
--- /dev/null
+++ b/btparse/src/parse_auxiliary.c
@@ -0,0 +1,391 @@
+/* ------------------------------------------------------------------------
+@NAME : parse_auxiliary.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Anything needed by the parser that's too hairy to go in the
+ grammar itself. Currently, just stuff needed for generating
+ syntax errors. (See error.c for how they're actually
+ printed.)
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1996/08/07, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include "stdpccts.h"
+#include "error.h"
+#include "lex_auxiliary.h"
+#include "parse_auxiliary.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* from input.c */
+
+GEN_PRIVATE_ERRFUNC (syntax_error, (char * fmt, ...), /* printf-style reporter, used in this file as syntax_error (fmt, ...) */
+ BTERR_SYNTAX, InputFilename, zzline, NULL, -1, fmt) /* GEN_PRIVATE_ERRFUNC is defined outside this file (presumably error.h -- TODO confirm) */
+
+
+/* this is stolen from PCCTS' err.h: eight single-bit masks, one per bit
+ * position, which append_token_set() ANDs against each word of a token
+ * set to see which members are present */
+static SetWordType bitmask[] =
+{
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+ 0x00000010, 0x00000020, 0x00000040, 0x00000080
+};
+
+static struct /* replacement names that fix_token_names() copies into zztokens[] */
+{
+ int token; /* token code; used as the index into zztokens[] */
+ char *new_name; /* human-readable text shown in syntax error messages */
+} new_tokens[] =
+{
+ { AT, "\"@\"" },
+ { NAME, "name (entry type, key, field, or macro name)" },
+ { LBRACE, "left brace (\"{\")" },
+ { RBRACE, "right brace (\"}\")" },
+ { ENTRY_OPEN, "start of entry (\"{\" or \"(\")" },
+ { ENTRY_CLOSE,"end of entry (\"}\" or \")\")" },
+ { EQUALS, "\"=\"" },
+ { HASH, "\"#\"" },
+ { COMMA, "\",\"" },
+ { NUMBER, "number" },
+ { STRING, "quoted string ({...} or \"...\")" }
+};
+
+
+#ifdef CLEVER_TOKEN_STUFF
+char **token_names;
+#endif
+
+
+#ifndef HAVE_STRLCAT
+/********************************* AMBS **********************/
+/*
+ * Appends src to string dst of size dsize (unlike strncat, dsize is the
+ * full size of dst, not space left). At most dsize-1 characters
+ * will be copied. Always NUL terminates (unless dsize <= strlen(dst)).
+ * Returns strlen(src) + MIN(dsize, strlen(initial dst)).
+ * If retval >= dsize, truncation occurred.
+ */
+static size_t
+strlcat(char *dst, const char *src, size_t dsize)
+{
+   size_t dlen = 0;                     /* length of dst, capped at dsize */
+   size_t slen;                         /* number of src bytes walked so far */
+   size_t room;                         /* bytes free in dst before the NUL */
+
+   /* Measure dst without ever looking past its first dsize bytes. */
+   while (dlen < dsize && dst[dlen] != '\0')
+      dlen++;
+
+   /* No terminator inside dsize bytes: nothing can be appended. */
+   if (dlen == dsize)
+      return(dlen + strlen(src));
+
+   room = dsize - dlen - 1;
+
+   /* Walk all of src so its full length is known; copy only what fits. */
+   for (slen = 0; src[slen] != '\0'; slen++)
+   {
+      if (slen < room)
+         dst[dlen + slen] = src[slen];
+   }
+
+   /* Terminate right after the last byte actually copied. */
+   dst[dlen + (slen < room ? slen : room)] = '\0';
+
+   return(dlen + slen);                 /* count does not include NUL */
+}
+/********************************* AMBS **********************/
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+ * Overwrite selected entries of the zztokens[] name table with the
+ * friendlier names listed in new_tokens[], so that syntax error messages
+ * read better.
+ */
+void
+fix_token_names (void)
+{
+   int    idx;
+   int    table_len;
+
+#ifdef CLEVER_TOKEN_STUFF               /* clever, but it doesn't work... */
+   /* arg!  this doesn't work because I don't know how to find out the
+    * number of tokens
+    */
+
+   int    num_tok;
+
+   num_tok = (sizeof(zztokens) / sizeof(*zztokens));
+   sizeof (zztokens);
+   sizeof (*zztokens);
+   token_names = (char **) malloc (sizeof (char *) * num_tok);
+
+   for (idx = 0; idx < num_tok; idx++)
+   {
+      token_names[idx] = zztokens[idx];
+   }
+#endif
+
+   /* each table row names the token code to patch and its new text */
+   table_len = (sizeof(new_tokens) / sizeof(*new_tokens));
+   idx = 0;
+   while (idx < table_len)
+   {
+      zztokens[new_tokens[idx].token] = new_tokens[idx].new_name;
+      idx++;
+   }
+}
+
+
+#ifdef USER_ZZSYN
+
+/*
+ * Append the names of all tokens contained in the set `a' to `msg',
+ * separated by ", " and with " or " before the final one.  `msg' is
+ * treated as a buffer of MAX_ERROR bytes.
+ */
+static void
+append_token_set (char *msg, SetWordType *a)
+{
+   SetWordType * word = a;                    /* current word of the set */
+   SetWordType * past_last = a + zzSET_SIZE;  /* one past the final word */
+   unsigned      tok = 0;                     /* token number of current bit */
+   int           shown = 0;                   /* names appended so far */
+
+   do
+   {
+      SetWordType   bits = *word;
+      SetWordType * mask = bitmask;
+
+      do
+      {
+         if (bits & *mask)                    /* token `tok' is in the set */
+         {
+            strlcat (msg, zztokens[tok], MAX_ERROR);
+            shown++;
+
+            /* pick the separator from how many names are still to come */
+            if (shown < zzset_deg (a) - 1)
+               strlcat (msg, ", ", MAX_ERROR);
+            else if (shown == zzset_deg (a) - 1)
+               strlcat (msg, " or ", MAX_ERROR);
+         }
+         tok++;
+         mask++;
+      } while (mask < bitmask + sizeof(SetWordType)*8);
+
+      word++;
+   } while (word < past_last);
+}
+
+
+/*
+ * Build and report a syntax error message.  `tok' is the offending token
+ * and `bad_text' its text; what was expected is given either by the
+ * token set `eset' or by the single token `etok'; `egroup', if non-empty,
+ * names the construct being parsed.
+ *
+ * The message is assembled in a static buffer of MAX_ERROR bytes.  Every
+ * append is bounded by the space really left, measured with strlen()
+ * each time, so an over-long `bad_text' only truncates the message.  The
+ * finished text is passed to syntax_error() as an argument to "%s",
+ * never as the format itself, because it contains text taken from the
+ * input file.
+ */
+void
+zzsyn(char *        text,
+      int           tok,
+      char *        egroup,
+      SetWordType * eset,
+      int           etok,
+      int           k,
+      char *        bad_text)
+{
+   static char    msg [MAX_ERROR];
+   size_t         len;
+
+#ifndef ALLOW_WARNINGS
+   text = NULL;                         /* avoid "unused parameter" warning */
+#endif
+
+   /* Initial message: give location of error */
+
+   msg[0] = (char) 0;           /* make sure string is empty to start! */
+   if (tok == zzEOF_TOKEN)
+      strlcat (msg, "at end of input", MAX_ERROR);
+   else
+      snprintf (msg, MAX_ERROR, "found \"%s\"", bad_text);
+
+   /* Caller supplied neither a single token nor set of tokens expected... */
+
+   if (!etok && !eset)
+   {
+      syntax_error ("%s", msg);
+      return;
+   }
+
+   strlcat (msg, ", ", MAX_ERROR);
+
+   /* I'm not quite sure what this is all about, or where k would be != 1... */
+
+   if (k != 1)
+   {
+      len = strlen (msg);
+      snprintf (msg+len, MAX_ERROR - len, "; \"%s\" not", bad_text);
+      if (zzset_deg (eset) > 1) strlcat (msg, " in", MAX_ERROR);
+   }
+
+   /* This is the code that usually gets run */
+
+   if (zzset_deg (eset) > 0)
+   {
+      if (zzset_deg (eset) == 1)
+         strlcat (msg, "expected ", MAX_ERROR);
+      else
+         strlcat (msg, "expected one of: ", MAX_ERROR);
+
+      append_token_set (msg, eset);
+   }
+   else
+   {
+      len = strlen (msg);
+      snprintf (msg+len, MAX_ERROR - len, "expected %s", zztokens[etok]);
+      if (etok == ENTRY_CLOSE)
+      {
+         strlcat (msg, " (skipping to next \"@\")", MAX_ERROR);
+         initialize_lexer_state ();
+      }
+   }
+
+   if (egroup && egroup[0] != (char) 0)
+   {
+      len = strlen (msg);
+      snprintf (msg+len, MAX_ERROR - len, " in %s", egroup);
+   }
+
+   syntax_error ("%s", msg);
+
+}
+#endif /* USER_ZZSYN */
+
+
+/*
+ * Report a syntax error if `field' is a field node whose name starts
+ * with a digit.  Nodes of any other type, a NULL node and a missing name
+ * are ignored.  The first character is tested with a range comparison:
+ * strchr() also matches the terminating nul of its search string, so an
+ * empty name would otherwise be reported as starting with a digit.
+ */
+void
+check_field_name (AST * field)
+{
+   char * name;
+
+   if (! field || field->nodetype != BTAST_FIELD)
+      return;
+
+   name = field->text;
+   if (name == NULL)
+      return;
+
+   if (name[0] >= '0' && name[0] <= '9')
+      syntax_error ("invalid field name \"%s\": cannot start with digit",
+                    name);
+}
+
+
+#ifdef STACK_DUMP_CODE
+
+/* Debug aid: print a one-line description of slot `num' of zzastStack. */
+static void
+show_ast_stack_elem (int num)
+{
+   extern char *nodetype_names[];       /* nicked from bibtex_ast.c */
+   AST *  node = zzastStack[num];
+
+   printf ("zzastStack[%3d] = ", num);
+
+   if (node == NULL)                    /* empty slot */
+   {
+      printf ("NULL\n");
+      return;
+   }
+
+   if (node->nodetype > BTAST_MACRO)    /* not a known node type */
+   {
+      printf ("bogus node (uninitialized?)\n");
+      return;
+   }
+
+   printf ("{ %s: \"%s\" (line %d, char %d) }\n",
+           nodetype_names[node->nodetype],
+           node->text, node->line, node->offset);
+}
+
+
+/* Debug aid: print the AST stack slot at zzast_sp, optionally labelled. */
+static void
+show_ast_stack_top (char *label)
+{
+   if (label == NULL)
+      printf ("ast stack top: ");
+   else
+      printf ("%s: ast stack top: ", label);
+
+   show_ast_stack_elem (zzast_sp);
+}
+
+
+/*
+ * Debug aid: print every AST stack slot from zzast_sp up to (but not
+ * including) ZZAST_STACKSIZE, optionally preceded by a label.
+ */
+static void
+dump_ast_stack (char *label)
+{
+   int    slot;
+
+   if (label == NULL)
+      printf ("complete ast stack:\n");
+   else
+      printf ("%s: complete ast stack:\n", label);
+
+   slot = zzast_sp;
+   while (slot < ZZAST_STACKSIZE)
+   {
+      printf (" ");
+      show_ast_stack_elem (slot);
+      slot++;
+   }
+}
+
+
+/* Debug aid: print a one-line description of slot `num' of zzaStack. */
+static void
+show_attrib_stack_elem (int num)
+{
+   Attrib attr = zzaStack[num];         /* work on a copy of the slot */
+
+   printf ("zzaStack[%3d] = ", num);
+   printf ("{ \"%s\" (token %d (%s), line %d, char %d) }\n",
+           attr.text, attr.token, zztokens[attr.token],
+           attr.line, attr.offset);
+}
+
+
+/* Debug aid: print the attribute stack slot at zzasp, optionally labelled. */
+static void
+show_attrib_stack_top (char *label)
+{
+   if (label == NULL)
+      printf ("attrib stack top: ");
+   else
+      printf ("%s: attrib stack top: ", label);
+
+   show_attrib_stack_elem (zzasp);
+}
+
+
+/*
+ * Debug aid: print every attribute stack slot from zzasp up to (but not
+ * including) ZZA_STACKSIZE, optionally preceded by a label.
+ */
+static void
+dump_attrib_stack (char *label)
+{
+   int    slot;
+
+   if (label == NULL)
+      printf ("complete attrib stack:\n");
+   else
+      printf ("%s: complete attrib stack:\n", label);
+
+   slot = zzasp;
+   while (slot < ZZA_STACKSIZE)
+   {
+      printf (" ");
+      show_attrib_stack_elem (slot);
+      slot++;
+   }
+}
+
+#endif /* STACK_DUMP_CODE */
diff --git a/btparse/src/parse_auxiliary.h b/btparse/src/parse_auxiliary.h
new file mode 100644
index 0000000..fe35121
--- /dev/null
+++ b/btparse/src/parse_auxiliary.h
@@ -0,0 +1,32 @@
+/* ------------------------------------------------------------------------
+@NAME : parse_auxiliary.h
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Prototype declarations for functions in parse_auxiliary.c
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/08, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef PARSE_AUXILIARY_H
+#define PARSE_AUXILIARY_H
+
+#include "stdpccts.h" /* for SetWordType typedef */
+
+/*
+ * Functions implemented in parse_auxiliary.c.  Their bodies are not part
+ * of this header, so the notes below are limited to what the signatures
+ * and the surrounding build configuration show.
+ */
+
+/* Takes no arguments and returns nothing; presumably adjusts the parser's
+ * token-name table for nicer messages -- TODO confirm in parse_auxiliary.c. */
+void fix_token_names (void);
+
+/* Syntax-error hook.  stdpccts.h defines USER_ZZSYN, so this is presumably
+ * the library's own replacement for the default PCCTS zzsyn() -- confirm.
+ * 'eset' is a PCCTS SetWordType set, which is why stdpccts.h is included. */
+void zzsyn (char *text, int tok,
+ char *egroup, SetWordType *eset, int etok,
+ int k, char *bad_text);
+
+/* Takes an AST node for a field; presumably validates its name -- TODO
+ * confirm in parse_auxiliary.c. */
+void check_field_name (AST * field);
+
+#endif /* PARSE_AUXILIARY_H */
diff --git a/btparse/src/parser.dlg b/btparse/src/parser.dlg
new file mode 100644
index 0000000..6cf3666
--- /dev/null
+++ b/btparse/src/parser.dlg
@@ -0,0 +1,215 @@
+<<
+/* parser.dlg -- DLG Description of scanner
+ *
+ * Generated from: bibtex.g
+ *
+ * Terence Parr, Will Cohen, and Hank Dietz: 1989-1994
+ * Purdue University Electrical Engineering
+ * With AHPCRC, University of Minnesota
+ * ANTLR Version 1.33
+ */
+
+#include <stdio.h>
+#define ANTLR_VERSION 133
+
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+#include "antlr.h"
+#include "ast.h"
+#include "tokens.h"
+#include "dlgdef.h"
+LOOKAHEAD
+/*
+ * zzerraction() -- scanner error action: reports "invalid token" through
+ * the zzerr function pointer, then calls zzadvance() and zzskip()
+ * (presumably to step past the offending character and discard it
+ * without returning a token -- see the PCCTS/DLG documentation).
+ */
+void zzerraction()
+{
+ (*zzerr)("invalid token");
+ zzadvance();
+ zzskip();
+}
+>>
+
+
+%%START
+
+@
+ <<
+ /* '@' pattern (presumably DLG's end-of-input marker -- confirm):
+ token number 1 */
+ NLA = 1;
+ >>
+
+\@
+ <<
+ /* a literal at-sign: token AT, then the at_sign() lexer hook */
+ NLA = AT;
+ at_sign ();
+ >>
+
+\n
+ <<
+ /* a newline: token number 3, then the newline() lexer hook */
+ NLA = 3;
+ newline ();
+ >>
+
+\%~[\n]*\n
+ <<
+ /* a percent sign up to and including the next newline: token COMMENT,
+ then the comment() lexer hook */
+ NLA = COMMENT;
+ comment ();
+ >>
+
+[\ \r\t]+
+ <<
+ /* a run of spaces, carriage returns and tabs: token number 5,
+ then zzskip() */
+ NLA = 5;
+ zzskip ();
+ >>
+
+~[\@\n\ \r\t]+
+ <<
+ /* a run of anything other than at-sign, newline, space, carriage
+ return or tab: token number 6, then toplevel_junk() */
+ NLA = 6;
+ toplevel_junk ();
+ >>
+
+
+%%LEX_ENTRY
+
+@
+ <<
+ /* '@' pattern (presumably DLG's end-of-input marker -- confirm):
+ token number 1 */
+ NLA = 1;
+ >>
+
+\n
+ <<
+ /* a newline: token number 7, then the newline() lexer hook */
+ NLA = 7;
+ newline ();
+ >>
+
+\%~[\n]*\n
+ <<
+ /* a percent sign up to and including the next newline: token COMMENT,
+ then the comment() lexer hook */
+ NLA = COMMENT;
+ comment ();
+ >>
+
+[\ \r\t]+
+ <<
+ /* a run of spaces, carriage returns and tabs: token number 8,
+ then zzskip() */
+ NLA = 8;
+ zzskip ();
+ >>
+
+[0-9]+
+ <<
+ /* a run of digits: token NUMBER */
+ NLA = NUMBER;
+ >>
+
+[a-z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|]+
+ <<
+ /* a run of the listed name characters: token NAME, then name() */
+ NLA = NAME;
+ name ();
+ >>
+
+\{
+ <<
+ /* left brace: token LBRACE, then lbrace() */
+ NLA = LBRACE;
+ lbrace ();
+ >>
+
+\}
+ <<
+ /* right brace: token RBRACE, then rbrace() */
+ NLA = RBRACE;
+ rbrace ();
+ >>
+
+\(
+ <<
+ /* left parenthesis: token ENTRY_OPEN, then lparen() */
+ NLA = ENTRY_OPEN;
+ lparen ();
+ >>
+
+\)
+ <<
+ /* right parenthesis: token ENTRY_CLOSE, then rparen() */
+ NLA = ENTRY_CLOSE;
+ rparen ();
+ >>
+
+=
+ <<
+ /* equals sign: token EQUALS */
+ NLA = EQUALS;
+ >>
+
+\#
+ <<
+ /* hash sign: token HASH */
+ NLA = HASH;
+ >>
+
+,
+ <<
+ /* comma: token COMMA */
+ NLA = COMMA;
+ >>
+
+\"
+ <<
+ /* double quote: token number 18, then start_string() is told which
+ delimiter opened the string */
+ NLA = 18;
+ start_string ('"');
+ >>
+
+
+%%LEX_STRING
+
+@
+ <<
+ /* '@' pattern (presumably DLG's end-of-input marker -- confirm):
+ token number 1 */
+ NLA = 1;
+ >>
+
+\n~[\n\{\}\(\)\"\\]*
+ <<
+ /* a newline plus the following run of characters that are not newline,
+ brace, parenthesis, double quote or backslash: token number 19,
+ then check_runaway_string() */
+ NLA = 19;
+ check_runaway_string ();
+ >>
+
+[\r\t]
+ <<
+ /* a single carriage return or tab: token number 20; zzreplchar()
+ is given a space and zzmore() is called (presumably so the string
+ token keeps accumulating -- confirm) */
+ NLA = 20;
+ zzreplchar (' '); zzmore ();
+ >>
+
+\{
+ <<
+ /* left brace inside a string: token number 21, then open_brace() */
+ NLA = 21;
+ open_brace ();
+ >>
+
+\}
+ <<
+ /* right brace inside a string: token number 22, then close_brace() */
+ NLA = 22;
+ close_brace ();
+ >>
+
+\(
+ <<
+ /* left parenthesis inside a string: token number 23,
+ then lparen_in_string() */
+ NLA = 23;
+ lparen_in_string ();
+ >>
+
+\)
+ <<
+ /* right parenthesis inside a string: token number 24,
+ then rparen_in_string() */
+ NLA = 24;
+ rparen_in_string ();
+ >>
+
+\"
+ <<
+ /* double quote inside a string: token STRING, then quote_in_string() */
+ NLA = STRING;
+ quote_in_string ();
+ >>
+
+~[\n\{\}\(\)\"]+
+ <<
+ /* a run of anything other than newline, brace, parenthesis or double
+ quote: token number 26, then zzmore() */
+ NLA = 26;
+ zzmore ();
+ >>
+
+%%
diff --git a/btparse/src/postprocess.c b/btparse/src/postprocess.c
new file mode 100644
index 0000000..1a6f32a
--- /dev/null
+++ b/btparse/src/postprocess.c
@@ -0,0 +1,500 @@
+/* ------------------------------------------------------------------------
+@NAME : postprocess.c
+@DESCRIPTION: Operations applied to the AST (or strings in it) after
+ parsing is complete.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/12, Greg Ward (from code in bibparse.c, lex_auxiliary.c)
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "btparse.h"
+#include "error.h"
+#include "parse_auxiliary.h"
+#include "prototypes.h"
+#include "my_dmalloc.h"
+
+#define DEBUG 1
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_postprocess_string ()
+@INPUT      : s       - NUL-terminated string to clean up; NULL is
+                        accepted and ignored
+              options - bitmap of BTO_* flags; only BTO_COLLAPSE is
+                        examined here
+@OUTPUT     : s (modified in place according to the flags; it can only
+              get shorter)
+@RETURNS    : (void)
+@DESCRIPTION: Make a pass over string s (which is modified in-place) to
+              strip carriage returns and optionally collapse whitespace
+              according to BibTeX rules (if the BTO_COLLAPSE bit in
+              options is true).
+
+              Rules for collapsing whitespace are:
+                 * whitespace at beginning/end of string is deleted
+                 * within the string, each whitespace sequence is
+                   replaced by a single space
+
+              Only the space character counts as whitespace here; part
+              of the work is done by the lexer proper, namely conversion
+              of tabs and newlines to spaces.
+
+              Every '\r' is dropped regardless of options, and a '\r'
+              sitting inside a run of spaces does not stop that run from
+              being collapsed.
+@GLOBALS    :
+@CALLS      :
+@CREATED    : originally in lex_auxiliary.c; moved here 1997/01/12
+@MODIFIED   :
+@COMMENTS   : this only collapses whitespace now -- rename it???
+-------------------------------------------------------------------------- */
+void
+bt_postprocess_string (char * s, btshort options)
+{
+   boolean  collapse_whitespace;
+   char    *i, *j;
+
+   if (s == NULL) return;               /* quit if no string supplied */
+
+#if DEBUG > 1
+   printf ("bt_postprocess_string: looking at >%s<\n", s);
+#endif
+
+   /* Extract any relevant options (just one currently) to local flags. */
+   collapse_whitespace = (options & BTO_COLLAPSE) != 0;
+
+   /*
+    * N.B. i and j both point into s; j (read position) is always >= i
+    * (write position), and we copy characters from j to i.  Characters
+    * are deleted by advancing j without advancing i.
+    */
+   i = j = s;
+
+   while (*j != (char) 0)
+   {
+      /*
+       * Don't want Ctrl-Ms in strings in the output.  Go straight back to
+       * the loop test afterwards: the character after the '\r' may be the
+       * terminator (which must not be copied and stepped over) or another
+       * '\r'.
+       */
+      if (*j == '\r')
+      {
+         j++;
+         continue;
+      }
+
+      /*
+       * When collapsing, drop a space if nothing has been written yet
+       * (leading whitespace) or if the last character written was itself
+       * a space (interior run).  Looking at the output rather than at
+       * *(j-1) keeps this correct when a '\r' was removed in between, and
+       * never reads before the start of s.
+       */
+      if (collapse_whitespace && *j == ' ' && (i == s || *(i-1) == ' '))
+      {
+         j++;
+         continue;
+      }
+
+      /* Copy the current character from j down to i */
+      *(i++) = *(j++);
+   }
+   *i = (char) 0;                       /* ensure string is terminated */
+
+   /*
+    * And mop up whitespace (if any) at end of string -- any whitespace
+    * there has already been collapsed to exactly one space.
+    */
+   if (collapse_whitespace && i > s && *(i-1) == ' ')
+   {
+      *(--i) = (char) 0;
+   }
+
+#if DEBUG > 1
+   printf (" transformed to >%s<\n", s);
+#endif
+
+} /* bt_postprocess_string */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_postprocess_value()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Post-processes a series of strings (compound value),
+ frequently found as the value of a "field = value" or "macro
+ = value" assignment. The actions taken here are governed by
+ the bits in 'options', but there are two distinct modes of
+ operation: pasting or not.
+
+ We paste strings if and only if the BTO_PASTE bit in options
+ is set and there are two or more simple values in the
+ compound value. In this case, the BTO_EXPAND bit must be set
+ (it would be very silly to paste together strings with
+ unexpanded macro names!), and we make two passes over the
+ data: one to postprocess individual strings and accumulate
+ the one big string, and a second to postprocess the big
+ string. In the first pass, the caller-supplied 'options'
+ variable is largely ignored; we will never collapse
+ whitespace in the individual strings. The caller's wishes
+ are fully respected when we make the final post-processing
+ pass over the concatenation of the individual strings,
+ though.
+
+ If we're not pasting strings, then the character of the
+ individual simple values will be preserved; macros might not
+ be expanded (depending on the BTO_EXPAND bit), numbers will
+ stay numbers, and strings will be post-processed
+ independently according to the 'options' variable. (Beware
+ -- this means you might collapse whitespace in individual
+ sub-strings, which would be bad if you intend to concatenate
+ them later in the BibTeX sense.)
+
+ The 'replace' parameter is used to govern whether the
+ existing strings in the AST should be replaced with their
+ post-processed versions. This can extend as far as
+ collapsing a series of simple values into a single BTAST_STRING
+ node, if we paste sub-strings together. If replace is FALSE,
+ the returned string is allocated here, and you must free() it
+ later.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/10, GPW
+@MODIFIED : 1997/08/25, GPW: renamed from bt_postprocess_field(), and changed
+ to take the head of a list of simple values,
+ rather than the parent of that list
+-------------------------------------------------------------------------- */
+char *
+bt_postprocess_value (AST * value, btshort options, boolean replace)
+{
+   /*
+    * Post-process the list of simple values headed by 'value' (linked
+    * through ->right) as directed by the BTO_* bits in 'options'.
+    *
+    * Returns NULL if value is NULL.  Otherwise:
+    *   - pasting (BTO_PASTE set and at least two simple values): returns
+    *     the concatenation of all values, post-processed with 'options'.
+    *     If 'replace' is true that string is also installed as the text
+    *     of the first node and the remaining nodes are freed.
+    *   - not pasting: returns the post-processed text of the LAST simple
+    *     value in the list, or a fresh strdup("") if that value has none.
+    *
+    * Ownership: when 'replace' is false the result is always heap memory
+    * the caller must free().  When 'replace' is true the result normally
+    * aliases text owned by the AST and must not be freed -- except for
+    * the strdup("") case above, which the caller cannot tell apart (the
+    * small leak the original author already noted).
+    */
+   AST *   simple_value;        /* current simple value */
+   boolean pasting;
+   btshort string_opts;         /* what to do to individual strings */
+   int     tot_len;             /* total length of pasted string */
+   int     used_len;            /* bytes of new_string filled so far */
+   char *  new_string;          /* in case of string pasting */
+   boolean own_new;             /* new_string is heap memory nobody else
+                                   references (non-pasting mode only) */
+   char *  tmp_string;
+   boolean free_tmp;            /* should we free() tmp_string? */
+
+   if (value == NULL) return NULL;
+   if (value->nodetype != BTAST_STRING &&
+       value->nodetype != BTAST_NUMBER &&
+       value->nodetype != BTAST_MACRO)
+   {
+      usage_error ("bt_postprocess_value: invalid AST node (not a value)");
+   }
+
+
+   /*
+    * We will paste strings iff the user wants us to, and there are at least
+    * two simple values in the list headed by 'value'.
+    */
+
+   pasting = (options & BTO_PASTE) && (value->right);
+
+   /*
+    * If we're to concatenate (paste) sub-strings, we need to know the
+    * total length of them.  So make a pass over all the sub-strings
+    * (simple values), adding up their lengths.
+    */
+
+   tot_len = 0;                 /* these are out here to keep */
+   used_len = 0;                /* gcc -Wall happy */
+   new_string = NULL;
+   own_new = FALSE;
+   tmp_string = NULL;
+
+   if (pasting)
+   {
+      for (simple_value = value;
+           simple_value != NULL;
+           simple_value = simple_value->right)
+      {
+         switch (simple_value->nodetype)
+         {
+            case BTAST_MACRO:
+               tot_len += bt_macro_length (simple_value->text);
+               break;
+            case BTAST_STRING:
+            case BTAST_NUMBER:
+               if (simple_value->text)
+                  tot_len += (int) strlen (simple_value->text);
+               break;
+            default:
+               internal_error ("simple value has bad nodetype (%d)",
+                               (int) simple_value->nodetype);
+         }
+      }
+
+      /* Now allocate the buffer in which we'll accumulate the whole string */
+
+      new_string = (char *) calloc (tot_len+1, sizeof (char));
+      if (new_string == NULL)
+         internal_error ("bt_postprocess_value: out of memory");
+   }
+
+
+   /*
+    * Before entering the main loop, figure out just what
+    * bt_postprocess_string() is supposed to do -- eg. if pasting strings,
+    * we should not (yet) collapse whitespace.  (That'll be done on the
+    * final, concatenated string -- assuming the caller put BTO_COLLAPSE in
+    * the options bitmap.)
+    */
+
+   if (pasting)
+   {
+      string_opts = options & ~BTO_COLLAPSE;    /* turn off collapsing */
+   }
+   else
+   {
+      string_opts = options;                    /* leave it alone */
+   }
+
+   /*
+    * Sanity check: if we continue blindly on, we might stupidly
+    * concatenate a macro name and a literal string.  So check for that.
+    * Converting numbers is superficial, but requiring that it be done
+    * keeps people honest.
+    *
+    * NOTE(review): as written this only fires when NEITHER bit is set,
+    * although the message asks for both.  Left unchanged because
+    * tightening it would reject calls that are accepted today.
+    */
+
+   if (pasting && ! (options & (BTO_CONVERT|BTO_EXPAND)))
+   {
+      usage_error ("bt_postprocess_value(): "
+                   "must convert numbers and expand macros "
+                   "when pasting substrings");
+   }
+
+   /*
+    * Now the main loop to process each string, and possibly tack it onto
+    * new_string.
+    */
+
+   for (simple_value = value;
+        simple_value != NULL;
+        simple_value = simple_value->right)
+   {
+      tmp_string = NULL;
+      free_tmp = FALSE;
+
+      /*
+       * If this simple value is a macro and we're supposed to expand
+       * macros, then do so.  We also have to post-process the string
+       * returned from the macro table, because they're stored there
+       * without whitespace collapsed; if we're supposed to be doing that
+       * to the current value (and we're not pasting), this is where it
+       * will get done.
+       */
+      if (simple_value->nodetype == BTAST_MACRO && (options & BTO_EXPAND))
+      {
+         tmp_string = bt_macro_text (simple_value->text,
+                                     simple_value->filename,
+                                     simple_value->line);
+         if (tmp_string != NULL)
+         {
+            /* work on a private copy; the macro table keeps its own */
+            tmp_string = strdup (tmp_string);
+            free_tmp = TRUE;
+            bt_postprocess_string (tmp_string, string_opts);
+         }
+
+         if (replace)
+         {
+            simple_value->nodetype = BTAST_STRING;
+            if (simple_value->text)
+               free (simple_value->text);
+            simple_value->text = tmp_string;
+            free_tmp = FALSE;   /* mustn't free, it's now in the AST */
+         }
+      }
+
+      /*
+       * If the current simple value is a literal string, then just
+       * post-process it.  This will be done in-place if 'replace' is
+       * true, otherwise a copy of the string will be post-processed.
+       */
+      else if (simple_value->nodetype == BTAST_STRING && simple_value->text)
+      {
+         if (replace)
+         {
+            tmp_string = simple_value->text;
+         }
+         else
+         {
+            tmp_string = strdup (simple_value->text);
+            free_tmp = TRUE;
+         }
+
+         bt_postprocess_string (tmp_string, string_opts);
+      }
+
+      /*
+       * Finally, if the current simple value is a number, change it to a
+       * string (depending on options) and get its value.  We generally
+       * treat strings and numbers as equivalent, except of course numbers
+       * aren't post-processed -- there can't be any whitespace in them!
+       * The BTO_CONVERT option is mainly a sop to my strong-typing
+       * tendencies.  (The two branches above never leave a node typed as
+       * a number, so chaining with "else" is safe.)
+       */
+      else if (simple_value->nodetype == BTAST_NUMBER)
+      {
+         if (replace && (options & BTO_CONVERT))
+            simple_value->nodetype = BTAST_STRING;
+
+         if (simple_value->text)
+         {
+            if (replace)
+               tmp_string = simple_value->text;
+            else
+            {
+               tmp_string = strdup (simple_value->text);
+               free_tmp = TRUE;
+            }
+         }
+      }
+
+      if (pasting)
+      {
+         if (tmp_string)
+         {
+            int piece_len = (int) strlen (tmp_string);
+
+            /* check BEFORE copying, not after the damage is done */
+            if (used_len + piece_len > tot_len)
+               internal_error ("bt_postprocess_value: pasted string "
+                               "too long (%d > %d)",
+                               used_len + piece_len, tot_len);
+
+            /* append at the known end instead of rescanning with strcat */
+            memcpy (new_string + used_len, tmp_string, piece_len + 1);
+            used_len += piece_len;
+         }
+         if (free_tmp)
+            free (tmp_string);
+      }
+      else
+      {
+         /*
+          * Only the last value's string is returned, so release whatever
+          * an earlier iteration left in new_string if it is ours alone
+          * (a private copy or an empty-string placeholder); strings that
+          * live in the AST are left untouched.
+          */
+         if (own_new)
+            free (new_string);
+
+         /*
+          * N.B. if tmp_string is NULL (eg. from a single undefined macro)
+          * we make a strdup() of the empty string -- this is so we can
+          * safely free() the string returned from this function
+          * at some future point.
+          */
+         if (tmp_string != NULL)
+         {
+            new_string = tmp_string;
+            own_new = free_tmp;
+         }
+         else
+         {
+            new_string = strdup ("");
+            own_new = TRUE;
+         }
+      }
+   }
+
+   if (pasting)
+   {
+      assert (used_len <= tot_len);     /* hope we alloc'd enough! */
+
+      bt_postprocess_string (new_string, options);
+
+      /*
+       * If replacing data in the AST, delete all but first child of
+       * `field', and replace text for first child with new_string.
+       */
+
+      if (replace)
+      {
+         assert (value->right != NULL); /* there has to be > 1 simple value! */
+         zzfree_ast (value->right);     /* free from second simple value on */
+         value->right = NULL;           /* remind ourselves they're gone */
+         if (value->text)               /* free text of first simple value */
+            free (value->text);
+         value->text = new_string;      /* and replace it with concatenation */
+      }
+   }
+
+   return new_string;
+
+} /* bt_postprocess_value() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_postprocess_field()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Postprocesses all the strings in a single "field = value"
+ assignment subtree. Just checks that 'field' does indeed
+ point to a BTAST_FIELD node (presumably the parent of a list
+ of simple values), downcases the field name, and calls
+ bt_postprocess_value() on the value.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/08/25, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+char *
+bt_postprocess_field (AST * field, btshort options, boolean replace)
+{
+   /*
+    * Downcase the field's name in place, then post-process the list of
+    * simple values hanging below it.  A NULL field yields NULL; a node
+    * that is not a BTAST_FIELD is reported through usage_error().
+    * Otherwise the result is whatever bt_postprocess_value() returns for
+    * field->down.
+    */
+   char * result = NULL;
+
+   if (field != NULL)
+   {
+      if (field->nodetype != BTAST_FIELD)
+      {
+         usage_error ("bt_postprocess_field: invalid AST node (not a field)");
+      }
+
+      strlwr (field->text);             /* downcase field name */
+      result = bt_postprocess_value (field->down, options, replace);
+   }
+
+   return result;
+
+} /* bt_postprocess_field() */
+
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_postprocess_entry()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Postprocesses all the strings in an entry: collapse whitespace,
+ concatenate substrings, expands macros, and whatnot.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/10, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_postprocess_entry (AST * top, btshort options)
+{
+   /*
+    * Post-process a whole entry in place: downcase the entry type, skip
+    * an optional leading key node, then
+    *   - regular and macro-definition entries: post-process every field
+    *     (replacing text in the AST) and, for macro definitions, store
+    *     each one via bt_add_macro_value() unless BTO_NOSTORE is set;
+    *   - comment and preamble entries: post-process the value list
+    *     directly.
+    * Any other metatype is reported through internal_error().
+    */
+   AST *   node;
+   boolean store_macros;
+
+   if (top == NULL)                     /* not even an entry at all! */
+      return;
+
+   if (top->nodetype != BTAST_ENTRY)
+   {
+      usage_error ("bt_postprocess_entry: "
+                   "invalid node type (not entry root)");
+   }
+   strlwr (top->text);                  /* downcase entry type */
+
+   node = top->down;
+   if (node == NULL)                    /* no children at all */
+      return;
+
+   if (node->nodetype == BTAST_KEY)     /* step over the entry key */
+      node = node->right;
+
+   if (top->metatype == BTE_REGULAR || top->metatype == BTE_MACRODEF)
+   {
+      store_macros = (top->metatype == BTE_MACRODEF)
+                     && ! (options & BTO_NOSTORE);
+
+      for ( ; node != NULL; node = node->right)
+      {
+         bt_postprocess_field (node, options, TRUE);
+         if (store_macros)
+            bt_add_macro_value (node, options);
+      }
+   }
+   else if (top->metatype == BTE_COMMENT || top->metatype == BTE_PREAMBLE)
+   {
+      bt_postprocess_value (node, options, TRUE);
+   }
+   else
+   {
+      internal_error ("bt_postprocess_entry: unknown entry metatype (%d)",
+                      (int) top->metatype);
+   }
+
+} /* bt_postprocess_entry() */
diff --git a/btparse/src/prototypes.h b/btparse/src/prototypes.h
new file mode 100644
index 0000000..a902f2a
--- /dev/null
+++ b/btparse/src/prototypes.h
@@ -0,0 +1,50 @@
+/* ------------------------------------------------------------------------
+@NAME : prototypes.h
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Prototype declarations for functions from various places.
+ Only functions that are private to the library (but shared
+ between files within the library) are declared here.
+ Functions that are "exported from" the library (ie. usable
+ by and expected to be used by library user) are declared in
+ btparse.h.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/12, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#ifndef PROTOTYPES_H
+#define PROTOTYPES_H
+
+#include <stdio.h>
+#include "btparse.h" /* for types */
+
+/* util.c */
+/* NOTE(review): the two helpers below are not defined in this header; from
+ * their names they presumably deal with multi-byte characters and
+ * lower-case detection -- confirm in util.c before relying on that. */
+int get_uchar(char *string, int offset);
+int isulower(char *string);
+/* strlwr()/strupr() are declared here only when the build configuration
+ * reports that the platform lacks them (HAVE_STRLWR / HAVE_STRUPR unset
+ * or zero).  postprocess.c uses strlwr() to downcase entry types and
+ * field names in place. */
+#if !HAVE_STRLWR
+char *strlwr (char *s);
+#endif
+#if !HAVE_STRUPR
+char *strupr (char *s);
+#endif
+
+
+/* macros.c */
+/* Both take no arguments and return nothing; presumably set up and tear
+ * down the macro table -- TODO confirm in macros.c. */
+void init_macros (void);
+void done_macros (void);
+
+/* bibtex_ast.c */
+/* Takes a message string and the root of an AST; presumably a debugging
+ * dump -- TODO confirm in bibtex_ast.c. */
+void dump_ast (char *msg, AST *root);
+
+#endif /* PROTOTYPES_H */
diff --git a/btparse/src/scan.c b/btparse/src/scan.c
new file mode 100644
index 0000000..bf617a3
--- /dev/null
+++ b/btparse/src/scan.c
@@ -0,0 +1,615 @@
+
+/* parser.dlg -- DLG Description of scanner
+ *
+ * Generated from: bibtex.g
+ *
+ * Terence Parr, Will Cohen, and Hank Dietz: 1989-1994
+ * Purdue University Electrical Engineering
+ * With AHPCRC, University of Minnesota
+ * ANTLR Version 1.33
+ */
+
+#include <stdio.h>
+#define ANTLR_VERSION 133
+
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+#include "../pccts/antlr.h"
+#include "../pccts/ast.h"
+#include "tokens.h"
+#include "../pccts/dlgdef.h"
+LOOKAHEAD
+/*
+ * zzerraction() -- scanner error action (it is entry 0 of the actions[]
+ * table further down): reports "invalid token" through the zzerr function
+ * pointer, then calls zzadvance() and zzskip() (presumably to step past
+ * the offending character and discard it without returning a token --
+ * see the PCCTS/DLG documentation).
+ */
+void zzerraction()
+{
+ (*zzerr)("invalid token");
+ zzadvance();
+ zzskip();
+}
+/*
+ * D L G tables
+ *
+ * Generated from: parser.dlg
+ *
+ * 1989-1994 by Will Cohen, Terence Parr, and Hank Dietz
+ * Purdue University Electrical Engineering
+ * DLG Version 1.33
+ */
+
+#include "mode.h"
+
+
+
+/*
+ * act1 .. act6: generated action routines whose bodies match, in order,
+ * the six rules of the %%START lexical class in parser.dlg.  Each one
+ * stores a token code in NLA and then calls the rule's lexer hook, if any.
+ */
+
+/* token number 1 (the '@' rule of parser.dlg) */
+static void act1()
+{
+ NLA = 1;
+ }
+
+
+/* token AT, then at_sign() */
+static void act2()
+{
+ NLA = AT;
+ at_sign ();
+ }
+
+
+/* token number 3, then newline() */
+static void act3()
+{
+ NLA = 3;
+ newline ();
+ }
+
+
+/* token COMMENT, then comment() */
+static void act4()
+{
+ NLA = COMMENT;
+ comment ();
+ }
+
+
+/* token number 5, then zzskip() */
+static void act5()
+{
+ NLA = 5;
+ zzskip ();
+ }
+
+
+/* token number 6, then toplevel_junk() */
+static void act6()
+{
+ NLA = 6;
+ toplevel_junk ();
+ }
+
+static unsigned char shift0[257] = {
+ 0, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 2, 5, 5, 4, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 4, 5, 5, 5, 5, 3, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 1, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5
+};
+
+
+/*
+ * act7 .. act20: generated action routines whose bodies match, in order,
+ * the fourteen rules of the %%LEX_ENTRY lexical class in parser.dlg.
+ */
+
+/* token number 1 (the '@' rule of parser.dlg) */
+static void act7()
+{
+ NLA = 1;
+ }
+
+
+/* token number 7, then newline() */
+static void act8()
+{
+ NLA = 7;
+ newline ();
+ }
+
+
+/* token COMMENT, then comment() */
+static void act9()
+{
+ NLA = COMMENT;
+ comment ();
+ }
+
+
+/* token number 8, then zzskip() */
+static void act10()
+{
+ NLA = 8;
+ zzskip ();
+ }
+
+
+/* token NUMBER */
+static void act11()
+{
+ NLA = NUMBER;
+ }
+
+
+/* token NAME, then name() */
+static void act12()
+{
+ NLA = NAME;
+ name ();
+ }
+
+
+/* token LBRACE, then lbrace() */
+static void act13()
+{
+ NLA = LBRACE;
+ lbrace ();
+ }
+
+
+/* token RBRACE, then rbrace() */
+static void act14()
+{
+ NLA = RBRACE;
+ rbrace ();
+ }
+
+
+/* token ENTRY_OPEN, then lparen() */
+static void act15()
+{
+ NLA = ENTRY_OPEN;
+ lparen ();
+ }
+
+
+/* token ENTRY_CLOSE, then rparen() */
+static void act16()
+{
+ NLA = ENTRY_CLOSE;
+ rparen ();
+ }
+
+
+/* token EQUALS */
+static void act17()
+{
+ NLA = EQUALS;
+ }
+
+
+/* token HASH */
+static void act18()
+{
+ NLA = HASH;
+ }
+
+
+/* token COMMA */
+static void act19()
+{
+ NLA = COMMA;
+ }
+
+
+/* token number 18, then start_string() with the double-quote delimiter */
+static void act20()
+{
+ NLA = 18;
+ start_string ('"');
+ }
+
+static unsigned char shift1[257] = {
+ 0, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 3, 1, 14, 14, 3, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 3, 5, 13, 11, 5, 2, 5,
+ 14, 8, 9, 5, 5, 12, 5, 5, 5, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
+ 5, 5, 10, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 14, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 6, 5, 7, 5, 14, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5
+};
+
+
+/*
+ * act21 .. act29: generated action routines whose bodies match, in order,
+ * the nine rules of the %%LEX_STRING lexical class in parser.dlg.
+ */
+
+/* token number 1 (the '@' rule of parser.dlg) */
+static void act21()
+{
+ NLA = 1;
+ }
+
+
+/* token number 19, then check_runaway_string() */
+static void act22()
+{
+ NLA = 19;
+ check_runaway_string ();
+ }
+
+
+/* token number 20; passes a space to zzreplchar(), then zzmore() */
+static void act23()
+{
+ NLA = 20;
+ zzreplchar (' '); zzmore ();
+ }
+
+
+/* token number 21, then open_brace() */
+static void act24()
+{
+ NLA = 21;
+ open_brace ();
+ }
+
+
+/* token number 22, then close_brace() */
+static void act25()
+{
+ NLA = 22;
+ close_brace ();
+ }
+
+
+/* token number 23, then lparen_in_string() */
+static void act26()
+{
+ NLA = 23;
+ lparen_in_string ();
+ }
+
+
+/* token number 24, then rparen_in_string() */
+static void act27()
+{
+ NLA = 24;
+ rparen_in_string ();
+ }
+
+
+/* token STRING, then quote_in_string() */
+static void act28()
+{
+ NLA = STRING;
+ quote_in_string ();
+ }
+
+
+/* token number 26, then zzmore() */
+static void act29()
+{
+ NLA = 26;
+ zzmore ();
+ }
+
+static unsigned char shift2[257] = {
+ 0, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 1, 3, 3, 2, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 8, 3, 3, 3, 3,
+ 3, 6, 7, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 9, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 4, 3, 5, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3
+};
+
+#define DfaStates 38
+typedef unsigned char DfaState;
+
+static DfaState st0[7] = {
+ 1, 2, 3, 4, 5, 6, 38
+};
+
+static DfaState st1[7] = {
+ 38, 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st2[7] = {
+ 38, 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st3[7] = {
+ 38, 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st4[7] = {
+ 38, 7, 8, 9, 7, 9, 38
+};
+
+static DfaState st5[7] = {
+ 38, 38, 38, 38, 5, 38, 38
+};
+
+static DfaState st6[7] = {
+ 38, 38, 38, 6, 38, 6, 38
+};
+
+static DfaState st7[7] = {
+ 38, 7, 8, 7, 7, 7, 38
+};
+
+static DfaState st8[7] = {
+ 38, 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st9[7] = {
+ 38, 7, 8, 9, 7, 9, 38
+};
+
+static DfaState st10[16] = {
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 38, 38
+};
+
+static DfaState st11[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st12[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st13[16] = {
+ 38, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 38
+};
+
+static DfaState st14[16] = {
+ 38, 38, 38, 14, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st15[16] = {
+ 38, 38, 38, 38, 15, 16, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st16[16] = {
+ 38, 38, 38, 38, 16, 16, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st17[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st18[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st19[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st20[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st21[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st22[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st23[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st24[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st25[16] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38
+};
+
+static DfaState st26[16] = {
+ 38, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 38
+};
+
+static DfaState st27[11] = {
+ 28, 29, 30, 31, 32, 33, 34, 35, 36, 31,
+ 38
+};
+
+static DfaState st28[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st29[11] = {
+ 38, 38, 37, 37, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st30[11] = {
+ 38, 38, 31, 31, 38, 38, 38, 38, 38, 31,
+ 38
+};
+
+static DfaState st31[11] = {
+ 38, 38, 31, 31, 38, 38, 38, 38, 38, 31,
+ 38
+};
+
+static DfaState st32[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st33[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st34[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st35[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st36[11] = {
+ 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+static DfaState st37[11] = {
+ 38, 38, 37, 37, 38, 38, 38, 38, 38, 38,
+ 38
+};
+
+
+DfaState *dfa[38] = {
+ st0,
+ st1,
+ st2,
+ st3,
+ st4,
+ st5,
+ st6,
+ st7,
+ st8,
+ st9,
+ st10,
+ st11,
+ st12,
+ st13,
+ st14,
+ st15,
+ st16,
+ st17,
+ st18,
+ st19,
+ st20,
+ st21,
+ st22,
+ st23,
+ st24,
+ st25,
+ st26,
+ st27,
+ st28,
+ st29,
+ st30,
+ st31,
+ st32,
+ st33,
+ st34,
+ st35,
+ st36,
+ st37
+};
+
+
+DfaState accepts[39] = {
+ 0, 1, 2, 3, 6, 5, 6, 0, 4, 6,
+ 0, 7, 8, 0, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 9, 0, 0, 21, 22,
+ 23, 29, 24, 25, 26, 27, 28, 22, 0
+};
+
+void (*actions[30])() = {
+ zzerraction,
+ act1,
+ act2,
+ act3,
+ act4,
+ act5,
+ act6,
+ act7,
+ act8,
+ act9,
+ act10,
+ act11,
+ act12,
+ act13,
+ act14,
+ act15,
+ act16,
+ act17,
+ act18,
+ act19,
+ act20,
+ act21,
+ act22,
+ act23,
+ act24,
+ act25,
+ act26,
+ act27,
+ act28,
+ act29
+};
+
+static DfaState dfa_base[] = {
+ 0,
+ 10,
+ 27
+};
+
+static unsigned char *b_class_no[] = {
+ shift0,
+ shift1,
+ shift2
+};
+
+
+
+#define ZZSHIFT(c) (b_class_no[zzauto][1+c])
+#define MAX_MODE 3
+#include "../pccts/dlgauto.h"
diff --git a/btparse/src/stdpccts.h b/btparse/src/stdpccts.h
new file mode 100644
index 0000000..c8139ae
--- /dev/null
+++ b/btparse/src/stdpccts.h
@@ -0,0 +1,32 @@
+#ifndef STDPCCTS_H
+#define STDPCCTS_H
+/*
+ * stdpccts.h -- P C C T S I n c l u d e
+ *
+ * Terence Parr, Will Cohen, and Hank Dietz: 1989-1994
+ * Purdue University Electrical Engineering
+ * With AHPCRC, University of Minnesota
+ * ANTLR Version 1.33
+ */
+#include <stdio.h>
+#define ANTLR_VERSION 133
+
+#define ZZCOL
+#define USER_ZZSYN
+
+#include "config.h"
+#include "btparse.h"
+#include "attrib.h"
+#include "lex_auxiliary.h"
+#include "error.h"
+#include "my_dmalloc.h"
+
+extern char * InputFilename; /* for zzcr_ast call in pccts/ast.c */
+#define GENAST
+#define zzSET_SIZE 4
+#include "../pccts/antlr.h"
+#include "../pccts/ast.h"
+#include "tokens.h"
+#include "../pccts/dlgdef.h"
+#include "mode.h"
+#endif
diff --git a/btparse/src/string_util.c b/btparse/src/string_util.c
new file mode 100644
index 0000000..14846b3
--- /dev/null
+++ b/btparse/src/string_util.c
@@ -0,0 +1,698 @@
+/* ------------------------------------------------------------------------
+@NAME : string_util.c
+@DESCRIPTION: Various string-processing utility functions:
+ bt_purify_string()
+ bt_change_case()
+
+ and their helpers:
+ foreign_letter()
+ purify_special_char()
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/10/19, Greg Ward
+@MODIFIED : 1997/11/25, GPW: renamed from purify.c to string_util.c
+ added bt_change_case() and friends
+@VERSION : $Id$
+-------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "btparse.h"
+#include "bt_debug.h"
+
+
+/*
+ * These definitions should be fixed to be consistent with HTML
+ * entities, just for fun. And perhaps I should add entries for
+ * accented letters (at least those supported by TeX and HTML).
+ */
+/*
+ * Identifies which "foreign letter" control sequence foreign_letter()
+ * recognized.  The _L / _U suffixes mark the lowercase / uppercase form of
+ * the control sequence; the undotted i and j have a lowercase form only.
+ *
+ * The enumerators are used (cast to int) as indices into uc_version[] and
+ * lc_version[], so the order here must stay in step with those tables.
+ */
+typedef enum
+{
+ L_OTHER, /* not a "foreign" letter */
+ L_OSLASH_L, /* Eastern European {\o} */
+ L_OSLASH_U,
+ L_LSLASH_L, /* {\l} */
+ L_LSLASH_U,
+ L_OELIG_L, /* Latin {\oe} ligature */
+ L_OELIG_U,
+ L_AELIG_L, /* {\ae} ligature */
+ L_AELIG_U,
+ L_SSHARP_L, /* German "sharp s" {\ss} */
+ L_SSHARP_U,
+ L_ACIRCLE_L, /* Nordic {\aa} */
+ L_ACIRCLE_U,
+ L_INODOT_L, /* undotted i: {\i} */
+ L_JNODOT_L /* {\j} */
+} bt_letter;
+
+
+/*
+ * Uppercase replacement text for each foreign letter, indexed by bt_letter
+ * (one entry per enumerator, in the same order).  Used by
+ * convert_special_char(), which overwrites the control sequence in place
+ * and therefore requires every entry to be no longer than the control
+ * sequence it replaces, counting the backslash.  Entries without a
+ * backslash ("SS", "I", "J") replace the control sequence with plain
+ * letters.
+ */
+static char * uc_version[] =
+{
+ NULL, /* L_OTHER */
+ "\\O", /* L_OSLASH_L */
+ "\\O", /* L_OSLASH_U */
+ "\\L", /* L_LSLASH_L */
+ "\\L", /* L_LSLASH_U */
+ "\\OE", /* L_OELIG_L */
+ "\\OE", /* L_OELIG_U */
+ "\\AE", /* L_AELIG_L */
+ "\\AE", /* L_AELIG_U */
+ "SS", /* L_SSHARP_L -- for LaTeX 2.09 */
+ "\\SS", /* L_SSHARP_U */
+ "\\AA", /* L_ACIRCLE_L */
+ "\\AA", /* L_ACIRCLE_U */
+ "I", /* L_INODOT_L */
+ "J" /* L_JNODOT_L */
+};
+
+/*
+ * Lowercase replacement text for each foreign letter, indexed by bt_letter
+ * (one entry per enumerator, in the same order).  Used by
+ * convert_special_char(), which overwrites the control sequence in place
+ * and therefore requires every entry to be no longer than the control
+ * sequence it replaces, counting the backslash.
+ */
+static char * lc_version[] =
+{
+ NULL, /* L_OTHER */
+ "\\o", /* L_OSLASH_L */
+ "\\o", /* L_OSLASH_U */
+ "\\l", /* L_LSLASH_L */
+ "\\l", /* L_LSLASH_U */
+ "\\oe", /* L_OELIG_L */
+ "\\oe", /* L_OELIG_U */
+ "\\ae", /* L_AELIG_L */
+ "\\ae", /* L_AELIG_U */
+ "\\ss", /* L_SSHARP_L */
+ "\\ss", /* L_SSHARP_U */
+ "\\aa", /* L_ACIRCLE_L */
+ "\\aa", /* L_ACIRCLE_U */
+ "\\i", /* L_INODOT_L */
+ "\\j" /* L_JNODOT_L */
+};
+
+
+
+/* ------------------------------------------------------------------------
+@NAME       : foreign_letter()
+@INPUT      : str   - string holding the control sequence
+              start - index of the first character after the backslash
+              stop  - index one past the last character of the control
+                      sequence (so its length is stop - start)
+@OUTPUT     : letter - if non-NULL, set to the bt_letter that was
+                       recognized, or to L_OTHER if none was
+@RETURNS    : TRUE if str[start..stop-1] is a foreign letter control
+              sequence, FALSE otherwise
+@DESCRIPTION: Determines if a character sequence is one of (La)TeX's
+              "foreign letter" control sequences: o, l, oe, ae, aa, ss and
+              their uppercase versions, plus the undotted i and j (which
+              have no uppercase form).  Only sequences of exactly one or
+              two characters can match, and only characters inside
+              [start, stop) are ever examined, so the function is safe to
+              call on an empty range at the very end of a string.
+
+              The match is exact: a two-character sequence is accepted
+              only if *both* characters belong to the same letter (so
+              "oa", "oE" or "Oe" are not foreign letters).
+@CALLS      :
+@CALLERS    : purify_special_char(), convert_special_char()
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static boolean
+foreign_letter (char *str, int start, int stop, bt_letter * letter)
+{
+   char      c1, c2;
+   bt_letter found = L_OTHER;           /* assume not a "foreign" letter */
+
+   /*
+    * This is written for speed, not flexibility -- adding new foreign
+    * letters would be trying and vexatious.
+    *
+    * N.B. my gold standard list of foreign letters is Kopka and Daly's
+    * *A Guide to LaTeX 2e*, section 2.5.6.
+    */
+
+   switch (stop - start)
+   {
+      case 1:                           /* one-character control sequences */
+         switch (str[start])            /* (\o, \l, \i and \j)             */
+         {
+            case 'o': found = L_OSLASH_L; break;
+            case 'O': found = L_OSLASH_U; break;
+            case 'l': found = L_LSLASH_L; break;
+            case 'L': found = L_LSLASH_U; break;
+            case 'i': found = L_INODOT_L; break;
+            case 'j': found = L_JNODOT_L; break;
+            default:  break;
+         }
+         break;
+
+      case 2:                           /* two-character control sequences */
+         c1 = str[start+0];             /* (\oe, \ae, \aa, and \ss)        */
+         c2 = str[start+1];
+
+         /* every case ends in `break' -- no fall-through between letters */
+         switch (c1)
+         {
+            case 'o':
+               if (c2 == 'e') found = L_OELIG_L;
+               break;
+            case 'O':
+               if (c2 == 'E') found = L_OELIG_U;
+               break;
+
+            /* BibTeX 0.99 does not handle \aa and \AA -- but I do! */
+            case 'a':
+               if (c2 == 'e')      found = L_AELIG_L;
+               else if (c2 == 'a') found = L_ACIRCLE_L;
+               break;
+            case 'A':
+               if (c2 == 'E')      found = L_AELIG_U;
+               else if (c2 == 'A') found = L_ACIRCLE_U;
+               break;
+
+            /* uppercase sharp-s -- new with LaTeX 2e (so far all I do
+             * is recognize it as a "foreign" letter)
+             */
+            case 's':
+               if (c2 == 's') found = L_SSHARP_L;
+               break;
+            case 'S':
+               if (c2 == 'S') found = L_SSHARP_U;
+               break;
+
+            default:
+               break;
+         }
+         break;
+
+      default:                          /* wrong length: cannot match */
+         break;
+   } /* switch on length of control sequence */
+
+   if (letter != NULL)
+      *letter = found;
+
+   return (found != L_OTHER) ? TRUE : FALSE;
+
+} /* foreign_letter */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : purify_special_char()
+@INPUT      : str        - the string being purified (modified in place)
+@INOUT      : *src, *dst - read and write indices into str
+@OUTPUT     : *src - updated to point to the closing brace of the
+                     special char (or to the terminating NUL if the
+                     braces are unbalanced)
+              *dst - updated to point to the next available spot
+                     for copying text to
+@RETURNS    :
+@DESCRIPTION: "Purifies" a BibTeX special character.  On input, *src should
+              point to the opening brace of a special character (ie. the
+              brace must be at depth 0 of the whole string, and the
+              character immediately following it must be a backslash).
+              *dst should point to the next spot to copy into the output
+              (purified) string.  purify_special_char() will skip over the
+              opening brace and backslash; if the control sequence is one
+              of LaTeX's foreign letter sequences (as determined by
+              foreign_letter()), then its letters are copied to *dst (the
+              second letter, if any, downcased).  Otherwise the control
+              sequence is skipped.  In either case, text after the control
+              sequence is either copied (alphabetic characters) or skipped
+              (anything else, including hyphens, ties, and digits).
+@CALLS      : foreign_letter()
+@CALLERS    : bt_purify_string()
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+purify_special_char (char *str, int * src, int * dst)
+{
+   int   depth;
+   int   peek;
+
+   assert (str[*src] == '{' && str[*src + 1] == '\\');
+   depth = 1;
+
+   *src += 2;                           /* jump to start of control sequence */
+   peek = *src;                         /* scan to end of control sequence */
+
+   /* <ctype.h> functions need an unsigned char value, not a plain char */
+   while (isalpha ((unsigned char) str[peek]))
+      peek++;
+
+   /*
+    * Single-char, non-alpha control sequence (eg. {\'e}): it is one
+    * character long.  Never step over the terminating NUL, though --
+    * a string that ends right after "{\" has an empty control sequence.
+    */
+   if (peek == *src && str[peek] != '\0')
+      peek++;
+
+   if (peek > *src && foreign_letter (str, *src, peek, NULL))
+   {
+      assert (peek - *src == 1 || peek - *src == 2);
+      str[(*dst)++] = str[(*src)++];    /* copy first char */
+      if (*src < peek)                  /* copy second char, downcasing */
+         str[(*dst)++] = (char) tolower ((unsigned char) str[(*src)++]);
+   }
+   else                                 /* not a foreign letter -- skip */
+   {                                    /* the control sequence entirely */
+      *src = peek;
+   }
+
+   while (str[*src])
+   {
+      switch (str[*src])
+      {
+         case '{':
+            depth++;
+            (*src)++;
+            break;
+         case '}':
+            depth--;
+            if (depth == 0) return;     /* done with special char */
+            (*src)++;
+            break;
+         default:
+            if (isalpha ((unsigned char) str[*src])) /* copy alphabetic chars */
+               str[(*dst)++] = str[(*src)++];
+            else                        /* skip everything else */
+               (*src)++;
+      }
+   }
+
+   /*
+    * If we get here, we have unbalanced braces -- the '}' case should
+    * always hit a depth == 0 point if braces are balanced.  No warning,
+    * though, because a) BibTeX doesn't warn about purifying unbalanced
+    * strings, and b) we (should have) already warned about it in the
+    * lexer.
+    */
+
+} /* purify_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_purify_string()
+@INOUT      : string  - NUL-terminated string, purified in place
+@INPUT      : options - not referenced by the current implementation
+@OUTPUT     :
+@RETURNS    : nothing; the result is left in string
+@DESCRIPTION: "Purifies" a BibTeX string.  This consists of copying
+              alphanumeric characters, converting hyphens and ties to
+              space, copying spaces, and skipping everything else.  (Well,
+              almost -- special characters are handled specially, of
+              course.  Basically, accented letters have the control
+              sequence skipped, while foreign letters have the control
+              sequence preserved in a reasonable manner.  See
+              purify_special_char() for details.)
+@CALLS      : purify_special_char()
+@CALLERS    :
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+void
+bt_purify_string (char * string, btshort options)
+{
+   int    src,                          /* both indices into string */
+          dst;
+   int    depth;                        /* brace depth in string */
+   size_t orig_len;
+
+   /*
+    * Since purification only ever copies or deletes chars, the write
+    * index (dst) never overtakes the read index (src), and the result is
+    * no longer than the input -- so it can safely be built in place.
+    */
+
+   depth = 0;
+   src = 0;
+   dst = 0;
+   orig_len = strlen (string);
+
+   DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n",
+                          string, string));
+
+   while (string[src] != (char) 0)
+   {
+      DBG_ACTION (2, printf ("  next: >%c<: ", string[src]));
+      switch (string[src])
+      {
+         case '~':                      /* "separator" characters -- */
+         case '-':                      /* replaced with space */
+         case ' ':                      /* and copy an actual space */
+            string[dst++] = ' ';
+            src++;
+            DBG_ACTION (2, printf ("replacing with space"));
+            break;
+         case '{':
+            if (depth == 0 && string[src+1] == '\\')
+            {
+               DBG_ACTION (2, printf ("special char found"));
+               /* leaves src on the closing brace, handled next time round */
+               purify_special_char (string, &src, &dst);
+            }
+            else
+            {
+               DBG_ACTION (2, printf ("ordinary open brace"));
+               src++;
+            }
+            depth++;
+            break;
+         case '}':
+            DBG_ACTION (2, printf ("close brace"));
+            depth--;
+            src++;
+            break;
+         default:
+            /* <ctype.h> functions need an unsigned char value */
+            if (isalnum ((unsigned char) string[src])) /* alphanumeric -- */
+            {
+               DBG_ACTION (2, printf ("alphanumeric -- copying"));
+               string[dst++] = string[src++]; /* copy it */
+            }
+            else                        /* anything else -- skip it */
+            {
+               DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha"));
+               src++;
+            }
+      } /* switch string[src] */
+
+      DBG_ACTION (2, printf ("\n"));
+
+   } /* while string[src] */
+
+   DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth));
+
+   string[dst] = (char) 0;
+   assert (strlen (string) <= orig_len);
+} /* bt_purify_string() */
+
+
+/* ======================================================================
+ * Case-transformation stuff
+ */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : convert_special_char()
+@INPUT      : transform - 'u' (uppercase), 'l' (lowercase) or
+                          't' (title capitalization)
+@INOUT      : string         - the string being converted, in place
+              src            - read index; on entry points at the opening
+                               brace of the special char, on exit just past
+                               its closing brace (or at the end of string)
+              dst            - write index; advanced past everything written
+              start_sentence
+              after_colon    - title-caps state; both are cleared when a
+                               foreign letter is uppercased because of them
+@RETURNS    :
+@DESCRIPTION: Does case conversion on a special character.  Foreign letter
+              control sequences are replaced by the matching entry of
+              uc_version[] or lc_version[]; all other control sequences are
+              copied unchanged; every other character is upcased ('u') or
+              downcased ('l' and 't').
+
+              The replacement text is never longer than the control
+              sequence it replaces, so dst never overtakes src.  It can
+              fall behind src, however, which means source and destination
+              ranges may overlap -- hence memmove() for in-string copies.
+@GLOBALS    :
+@CALLS      : foreign_letter()
+@CALLERS    : bt_change_case()
+@CREATED    : 1997/11/25, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+convert_special_char (char transform,
+                      char * string,
+                      int * src,
+                      int * dst,
+                      boolean * start_sentence,
+                      boolean * after_colon)
+{
+   int       depth;
+   boolean   done_special;
+   int       cs_end;
+   int       cs_len;                    /* counting the backslash */
+   bt_letter letter;
+   char *    repl;
+   int       repl_len;
+
+#ifndef ALLOW_WARNINGS
+   repl = NULL;                         /* silence "might be used */
+                                        /* uninitialized" warning */
+#endif
+
+   /* First, copy just the opening brace */
+   string[(*dst)++] = string[(*src)++];
+
+   /*
+    * Now loop over characters inside the braces -- stop when we reach
+    * the matching close brace, or when the string ends.
+    */
+   depth = 1;                           /* because we're in a special char */
+   done_special = FALSE;
+
+   while (string[*src] != 0 && !done_special)
+   {
+      switch (string[*src])
+      {
+         case '\\':                     /* a control sequence */
+         {
+            cs_end = *src+1;            /* scan over chars of c.s. */
+            while (isalpha ((unsigned char) string[cs_end]))
+               cs_end++;
+
+            /*
+             * OK, now *src points to the backslash (so *src+1 points to
+             * first char. of control sequence), and cs_end points to
+             * character immediately following end of control sequence.
+             * Thus we analyze [*src+1 .. cs_end-1] to determine if the
+             * control sequence is a foreign letter, and use
+             * (cs_end - *src) as the length of the control sequence
+             * including its backslash.
+             */
+
+            cs_len = cs_end - *src;     /* length of cs, counting backslash */
+
+            /*
+             * cs_len == 1 means a non-alphabetic control sequence (or a
+             * backslash at the very end of the string): nothing to look
+             * up, and nothing after the backslash that is safe to read.
+             */
+            if (cs_len > 1 &&
+                foreign_letter (string, *src+1, cs_end, &letter))
+            {
+               if (letter == L_OTHER)
+                  internal_error ("impossible foreign letter");
+
+               switch (transform)
+               {
+                  case 'u':
+                     repl = uc_version[(int) letter];
+                     break;
+                  case 'l':
+                     repl = lc_version[(int) letter];
+                     break;
+                  case 't':
+                     if (*start_sentence || *after_colon)
+                     {
+                        repl = uc_version[(int) letter];
+                        *start_sentence = *after_colon = FALSE;
+                     }
+                     else
+                     {
+                        repl = lc_version[(int) letter];
+                     }
+                     break;
+                  default:
+                     internal_error ("impossible case transform \"%c\"",
+                                     transform);
+               }
+
+               repl_len = strlen (repl);
+               if (repl_len > cs_len)
+                  internal_error
+                     ("replacement text longer than original cs");
+
+               /* repl lives in a static table, so it cannot overlap */
+               memcpy (string + *dst, repl, (size_t) repl_len);
+               *src = cs_end;
+               *dst += repl_len;
+            } /* control sequence is a foreign letter */
+            else
+            {
+               /*
+                * Not a foreign letter -- just copy the control seq. as is.
+                * Source and destination are in the same buffer and may
+                * overlap (or coincide), so this must be memmove().
+                */
+               memmove (string + *dst, string + *src, (size_t) cs_len);
+               *src += cs_len;
+               assert (*src == cs_end);
+               *dst += cs_len;
+            } /* control sequence not a foreign letter */
+
+            break;
+         } /* case: '\\' */
+
+         case '{':
+         {
+            string[(*dst)++] = string[(*src)++];
+            depth++;
+            break;
+         }
+
+         case '}':
+         {
+            string[(*dst)++] = string[(*src)++];
+            depth--;
+            if (depth == 0)
+               done_special = TRUE;
+            break;
+         }
+
+         default:                       /* any other character */
+         {
+            switch (transform)
+            {
+               /*
+                * Inside special chars, lowercase and title caps are same.
+                * (At least, that's bibtex's convention.  I might change this
+                * at some point to be a bit smarter.)
+                */
+               case 'l':
+               case 't':
+                  string[(*dst)++] =
+                     (char) tolower ((unsigned char) string[(*src)++]);
+                  break;
+               case 'u':
+                  string[(*dst)++] =
+                     (char) toupper ((unsigned char) string[(*src)++]);
+                  break;
+               default:
+                  internal_error ("impossible case transform \"%c\"",
+                                  transform);
+            }
+         } /* default char */
+
+      } /* switch: current char */
+
+   } /* while: string or special char not done */
+
+} /* convert_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_change_case()
+@INPUT      : transform - 'u' (uppercase), 'l' (lowercase) or
+                          't' (title capitalization)
+              options   - not referenced by the current implementation
+@INOUT      : string    - NUL-terminated string, converted in place
+@RETURNS    :
+@DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase,
+              or "title capitalization".  Text at brace depth 0 is
+              converted; text inside ordinary braces is copied untouched;
+              special characters (a depth-0 brace immediately followed by
+              a backslash) are handed to convert_special_char().
+
+              Converting a foreign letter can shorten the string (eg.
+              uppercasing {\ss} gives {SS}), so the result is explicitly
+              re-terminated at the end.
+@GLOBALS    :
+@CALLS      : convert_special_char()
+@CALLERS    :
+@CREATED    : 1997/11/25, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+void
+bt_change_case (char transform,
+                char * string,
+                btshort options)
+{
+   int     depth;
+   int     src, dst;                    /* indices into string */
+   boolean start_sentence;
+   boolean after_colon;
+
+   src = dst = 0;
+   depth = 0;
+
+   start_sentence = TRUE;
+   after_colon = FALSE;
+
+   while (string[src] != 0)
+   {
+      switch (string[src])
+      {
+         case '{':
+
+            /*
+             * At start of special character?  The entire special char.
+             * will be handled here, as follows:
+             *   - text at any brace-depth within the s.c. is case-mangled;
+             *     punctuation (sentence endings, colons) are ignored
+             *   - control sequences are left alone, unless they are
+             *     one of the "foreign letter" control sequences, in
+             *     which case they're converted to the appropriate string
+             *     according to the uc_version or lc_version tables.
+             * convert_special_char() consumes the closing brace too, so
+             * depth is left unchanged.
+             */
+            if (depth == 0 && string[src+1] == '\\')
+            {
+               convert_special_char (transform, string, &src, &dst,
+                                     &start_sentence, &after_colon);
+            }
+
+            /*
+             * Otherwise, it's just something in braces.  This is probably
+             * a proper noun or something encased in braces to protect it
+             * from case-mangling, so we do not case-mangle it.  However,
+             * we *do* switch out of start_sentence or after_colon mode if
+             * we happen to be there (otherwise we'll do the wrong thing
+             * once we're out of the braces).
+             */
+            else
+            {
+               string[dst++] = string[src++];
+               start_sentence = after_colon = FALSE;
+               depth++;
+            }
+            break;
+
+         case '}':
+            string[dst++] = string[src++];
+            depth--;
+            break;
+
+         /*
+          * Sentence-ending punctuation and colons are handled separately
+          * to allow for exact mimicking of BibTeX's behaviour.  I happen
+          * to think that this behaviour (capitalize first word of sentences
+          * in a title) is better than BibTeX's, but I want to keep my
+          * options open for a future goal of perfect compatibility.
+          */
+         case '.':
+         case '?':
+         case '!':
+            start_sentence = TRUE;
+            string[dst++] = string[src++];
+            break;
+
+         case ':':
+            after_colon = TRUE;
+            string[dst++] = string[src++];
+            break;
+
+         default:
+            /* <ctype.h> functions need an unsigned char value */
+            if (isspace ((unsigned char) string[src]))
+            {
+               string[dst++] = string[src++];
+            }
+            else
+            {
+               if (depth == 0)
+               {
+                  switch (transform)
+                  {
+                     case 'u':
+                        string[dst++] =
+                           (char) toupper ((unsigned char) string[src++]);
+                        break;
+                     case 'l':
+                        string[dst++] =
+                           (char) tolower ((unsigned char) string[src++]);
+                        break;
+                     case 't':
+                        if (start_sentence || after_colon)
+                        {
+                           /*
+                            * XXX BibTeX only preserves case of character
+                            * immediately after a colon; I do two things
+                            * differently: first, I pay attention to sentence
+                            * punctuation, and second I force uppercase
+                            * at start of sentence or after a colon.
+                            */
+                           string[dst++] =
+                              (char) toupper ((unsigned char) string[src++]);
+                           start_sentence = after_colon = FALSE;
+                        }
+                        else
+                        {
+                           string[dst++] =
+                              (char) tolower ((unsigned char) string[src++]);
+                        }
+                        break;
+                     default:
+                        internal_error ("impossible case transform \"%c\"",
+                                        transform);
+                  }
+               } /* depth == 0 */
+               else
+               {
+                  string[dst++] = string[src++];
+               }
+            } /* not blank */
+      } /* switch on current character */
+
+   } /* while not at end of string */
+
+   /*
+    * dst can be behind src if a foreign letter shrank; terminate at dst so
+    * no stale tail of the original text is left behind.
+    */
+   string[dst] = (char) 0;
+
+} /* bt_change_case */
diff --git a/btparse/src/sym.c b/btparse/src/sym.c
new file mode 100644
index 0000000..ef41e13
--- /dev/null
+++ b/btparse/src/sym.c
@@ -0,0 +1,376 @@
+/*
+ * Simple symbol table manager using coalesced chaining to resolve collisions
+ *
+ * Doubly-linked lists are used for fast removal of entries.
+ *
+ * 'sym.h' must have a definition for typedef "Sym". Sym must include at
+ * minimum the following fields:
+ *
+ * ...
+ * char *symbol;
+ * struct ... *next, *prev, **head, *scope;
+ * unsigned int hash;
+ * ...
+ *
+ * 'template.h' can be used as a template to create a 'sym.h'.
+ *
+ * 'head' is &(table[hash(itself)]).
+ * The hash table is not resizable at run-time.
+ * The scope field is used to link all symbols of a current scope together.
+ * Scope() sets the current scope (linked list) to add symbols to.
+ * Any number of scopes can be handled. The user passes the address of
+ * a pointer to a symbol table
+ * entry (INITIALIZED TO NULL first time).
+ *
+ * Available Functions:
+ *
+ * zzs_init(s1,s2) -- Create hash table with size s1, string table size s2.
+ * zzs_done() -- Free hash and string table created with zzs_init().
+ * zzs_add(key,rec)-- Add 'rec' with key 'key' to the symbol table.
+ * zzs_newadd(key) -- create entry; add using 'key' to the symbol table.
+ * zzs_get(key) -- Return pointer to last record entered under 'key'
+ * Else return NULL
+ * zzs_del(p) -- Unlink the entry associated with p. This does
+ * NOT free 'p' and DOES NOT remove it from a scope
+ * list. If it was a part of your intermediate code
+ * tree or another structure. It will still be there.
+ * It is only removed from further consideration
+ * by the symbol table.
+ * zzs_keydel(s) -- Unlink the entry associated with key s.
+ * Calls zzs_del(p) to unlink.
+ * zzs_scope(sc) -- Specifies that everything added to the symbol
+ * table with zzs_add() is added to the list (scope)
+ * 'sc'. 'sc' is of 'Sym **sc' type and must be
+ * initialized to NULL before trying to add anything
+ * to it (passing it to zzs_scope()). Scopes can be
+ * switched at any time and merely links a set of
+ * symbol table entries. If a NULL pointer is
+ * passed, the current scope is returned.
+ * zzs_rmscope(sc) -- Remove (zzs_del()) all elements of scope 'sc'
+ * from the symbol table. The entries are NOT
+ * free()'d. A pointer to the first
+ * element in the "scope" is returned. The user
+ * can then manipulate the list as he/she chooses
+ * (such as freeing them all). NOTE that this
+ * function sets your scope pointer to NULL,
+ * but returns a pointer to the list for you to use.
+ * zzs_stat() -- Print out the symbol table and some relevant stats.
+ * zzs_new(key) -- Create a new record with calloc() of type Sym.
+ * Add 'key' to the string table and make the new
+ * records 'symbol' pointer point to it.
+ * zzs_strdup(s) -- Add s to the string table and return a pointer
+ * to it. Very fast allocation routine
+ * and does not require strlen() nor calloc().
+ *
+ * Example:
+ *
+ * #include <stdio.h>
+ * #include "sym.h"
+ *
+ * main()
+ * {
+ * Sym *scope1=NULL, *scope2=NULL, *a, *p;
+ *
+ * zzs_init(101, 100);
+ *
+ * a = zzs_new("Apple"); zzs_add(a->symbol, a); -- No scope
+ * zzs_scope( &scope1 ); -- enter scope 1
+ * a = zzs_new("Plum"); zzs_add(a->symbol, a);
+ * zzs_scope( &scope2 ); -- enter scope 2
+ * a = zzs_new("Truck"); zzs_add(a->symbol, a);
+ *
+ * p = zzs_get("Plum");
+ * if ( p == NULL ) fprintf(stderr, "Hmmm...Can't find 'Plum'\n");
+ *
+ * p = zzs_rmscope(&scope1);
+ * for (; p!=NULL; p=p->scope) {printf("Scope1: %s\n", p->symbol);}
+ * p = zzs_rmscope(&scope2);
+ * for (; p!=NULL; p=p->scope) {printf("Scope2: %s\n", p->symbol);}
+ * }
+ *
+ * Terence Parr
+ * Purdue University
+ * February 1990
+ *
+ * CHANGES
+ *
+ * Terence Parr
+ * May 1991
+ * Renamed functions to be consistent with ANTLR
+ * Made HASH macro
+ * Added zzs_keydel()
+ * Added zzs_newadd()
+ * Fixed up zzs_stat()
+ *
+ * July 1991
+ * Made symbol table entry save its hash code for fast comparison
+ * during searching etc...
+ */
+
+#include "bt_config.h"
+#include <stdio.h>
+#if __STDC__ == 1
+#include <string.h>
+#include <stdlib.h>
+#else
+#include "malloc.h"
+#endif
+#ifdef MEMCHK
+#include "trax.h"
+#endif
+#include "sym.h"
+#include "my_dmalloc.h"
+
+/* value strcasecmp() returns for equal strings; used by zzs_get() */
+#define StrSame 0
+
+/* scope list that zzs_add() links new records onto; NULL means no scope */
+static Sym **CurScope = NULL;
+/* number of buckets in table; set by zzs_init() */
+static unsigned size = 0;
+/* the hash table: an array of bucket heads, allocated by zzs_init() */
+static Sym **table=NULL;
+/* start of the string table allocated by zzs_init() */
+static char *strings;
+/* next free position in the string table; advanced by zzs_strdup() */
+static char *strp;
+/* size of the string table in bytes; set by zzs_init() */
+static int strsize = 0;
+
+/*
+ * Allocate the hash table (sz buckets) and the string table (strs bytes),
+ * both zero-filled, and point the string-table cursor at its start.
+ * Nothing happens unless both sizes are positive.  If either allocation
+ * fails, a message is written to stderr and the process exits with
+ * status 1.
+ */
+void
+zzs_init(int sz, int strs)
+{
+    if ( sz > 0 && strs > 0 )
+    {
+        table = (Sym **) calloc(sz, sizeof(Sym *));
+        if ( !table )
+        {
+            fprintf(stderr, "Cannot allocate table of size %d\n", sz);
+            exit(1);
+        }
+
+        strings = (char *) calloc(strs, sizeof(char));
+        if ( !strings )
+        {
+            fprintf(stderr, "Cannot allocate string table of size %d\n", strs);
+            exit(1);
+        }
+
+        strp = strings;
+        strsize = strs;
+        size = sz;
+    }
+}
+
+
+/*
+ * free() every record still chained into the hash table and leave every
+ * bucket empty, so the table holds no pointers to freed memory and can
+ * safely be searched or added to afterwards.
+ *
+ * The table and string table themselves are not released (zzs_done() does
+ * that).  Records are not unlinked from any scope list they were added
+ * to; such lists must not be followed after this call.
+ */
+void
+zzs_free(void)
+{
+    unsigned i;
+    Sym *cur, *next;
+
+    if ( table == NULL ) return;        /* nothing was ever allocated */
+
+    for (i = 0; i < size; i++)
+    {
+        cur = table[i];
+        table[i] = NULL;                /* bucket must not dangle */
+        while (cur != NULL)
+        {
+            next = cur->next;           /* fetch before cur is freed */
+            free (cur);
+            cur = next;
+        }
+    }
+}
+
+
+/*
+ * Release the hash table and the string table allocated by zzs_init() and
+ * reset the module to its "not initialized" state, so that calling
+ * zzs_done() twice is harmless and a later zzs_init() starts clean.
+ * Records chained into the table are NOT freed here.
+ */
+void
+zzs_done(void)
+{
+    free( table );                      /* free(NULL) is a no-op */
+    free( strings );
+
+    table = NULL;
+    strings = NULL;
+    strp = NULL;
+    size = 0;
+    strsize = 0;
+}
+
+/*
+ * Insert rec at the front of the bucket chain selected by the hash of
+ * key.  The full (unreduced) hash code is stored in rec->hash, and
+ * rec->head is set to the address of the bucket.  If a current scope is
+ * set, rec is also pushed onto the front of that scope list.
+ */
+void
+zzs_add(char *key, register Sym *rec)
+{
+    unsigned int code = 0;
+    char *scan = key;
+    Sym **bucket;
+
+    HASH_FUN(scan, code);
+    rec->hash = code;                   /* kept for fast comparison later */
+    bucket = &(table[code % size]);
+
+    if ( CurScope != NULL )
+    {
+        rec->scope = *CurScope;
+        *CurScope = rec;
+    }
+
+    /* push onto the front of the doubly-linked bucket chain */
+    rec->prev = NULL;
+    rec->next = *bucket;
+    if ( *bucket != NULL ) (*bucket)->prev = rec;
+    *bucket = rec;
+    rec->head = bucket;
+}
+
+/*
+ * Look key up in the symbol table, comparing names with strcasecmp().
+ * Returns the first matching record in its bucket chain (the one added
+ * most recently, since zzs_add() inserts at the front), or NULL if there
+ * is no match.
+ */
+Sym *
+zzs_get(char *key)
+{
+    unsigned int code = 0;
+    char *scan = key;
+    Sym *entry;
+
+    HASH_FUN(scan, code);
+
+    entry = table[code % size];
+    while ( entry != NULL )
+    {
+        /* compare the cheap stored hash first, the strings only if equal */
+        if ( entry->hash == code &&
+             strcasecmp(key, entry->symbol) == StrSame )
+        {
+            return( entry );
+        }
+        entry = entry->next;
+    }
+    return( NULL );
+}
+
+/*
+ * Detach p from its bucket chain in the symbol table.  p itself is not
+ * freed and stays on any scope list it belongs to.
+ *
+ * p must either be in a bucket chain or have all of its list pointers
+ * NULL (in which case nothing is done); anything else corrupts the table.
+ * A NULL p is a fatal error: a message is printed and the process exits
+ * with status 1.
+ */
+void
+zzs_del(register Sym *p)
+{
+    Sym *before, *after;
+
+    if ( p == NULL )
+    {
+        fprintf(stderr, "zzs_del(NULL)\n");
+        exit(1);
+    }
+
+    before = p->prev;
+    after = p->next;
+
+    if ( before != NULL )
+    {
+        before->next = after;           /* somewhere inside the chain */
+    }
+    else
+    {
+        /* head of a chain -- or not in the symbol table at all */
+        if ( p->head == NULL ) return;
+        *(p->head) = after;
+    }
+    if ( after != NULL ) after->prev = before;
+
+    /* not part of symbol table anymore */
+    p->head = NULL;
+    p->prev = NULL;
+    p->next = NULL;
+}
+
+/*
+ * Unlink the record found under key, if any, using zzs_del().  Does
+ * nothing when the key is not in the table.
+ */
+void
+zzs_keydel(char *key)
+{
+    Sym *hit = zzs_get(key);
+
+    if ( hit == NULL ) return;
+    zzs_del( hit );
+}
+
+/* S c o p e S t u f f */
+
+/*
+ * Make 'scope' the current scope when it is non-NULL.  Either way, return
+ * the scope that is current when the function returns (so passing NULL
+ * just queries the current scope).
+ */
+Sym **
+zzs_scope(Sym **scope)
+{
+    if ( scope != NULL ) CurScope = scope;
+    return( CurScope );
+}
+
+/*
+ * Unlink (zzs_del()) every record on the list '*scope' from the symbol
+ * table and set '*scope' to NULL.  The records are not freed and remain
+ * chained through their 'scope' fields; the first one is returned.
+ * Returns NULL when 'scope' itself is NULL.
+ */
+Sym *
+zzs_rmscope(register Sym **scope)
+{
+    Sym *first;
+    Sym *cur;
+
+    if ( scope == NULL ) return(NULL);
+
+    first = *scope;
+    cur = first;
+    while ( cur != NULL )
+    {
+        zzs_del( cur );
+        cur = cur->scope;
+    }
+    *scope = NULL;
+    return( first );
+}
+
+/*
+ * Print the contents of the symbol table to stdout, one line per
+ * non-empty bucket, followed by some statistics: number of records,
+ * buckets in use, utilization as a percentage, a histogram of chain
+ * lengths (lengths 0..19 only) and the lowest/highest bucket in use.
+ * Safe to call on an empty or uninitialized table.
+ */
+void
+zzs_stat(void)
+{
+    static unsigned short count[20];    /* count[k]: buckets with k recs */
+    unsigned int i, n=0, low=0, hi=0, used;
+    int seen_used = 0;                  /* has any non-empty bucket been seen? */
+    float avg=0.0;
+
+    for (i=0; i<20; i++) count[i] = 0;
+    for (i=0; i<size; i++)
+    {
+        register Sym *q = table[i];
+        unsigned int len = 0;
+
+        if ( q != NULL )
+        {
+            /* an explicit flag, because bucket 0 is a valid "lowest" */
+            if ( !seen_used ) { low = i; seen_used = 1; }
+            hi = i;
+            printf("[%u]", i);
+        }
+        while ( q != NULL )
+        {
+            len++;
+            n++;
+            printf(" %s", q->symbol);
+            q = q->next;
+            if ( q == NULL ) printf("\n");
+        }
+        if ( len>=20 ) printf("zzs_stat: count table too small\n");
+        else count[len]++;
+    }
+
+    used = size - count[0];
+    printf("Storing %u recs used %u hash positions out of %u\n",
+           n, used, size);
+    /* the label says percent, so scale by 100; guard an empty table */
+    printf("%f %% utilization\n",
+           (size != 0) ? 100.0*((float)used)/((float)size) : 0.0);
+    for (i=0; i<20; i++)
+    {
+        if ( count[i] != 0 )
+        {
+            double pct = 0.0;
+
+            if ( n != 0 )               /* no records: avoid 0/0 */
+            {
+                avg += (((float)(i*count[i]))/((float)n)) * i;
+                pct = 100.0*((float)(i*count[i]))/((float)n);
+            }
+            printf("Buckets of len %u == %u (%f %% of recs)\n",
+                   i, (unsigned int) count[i], pct);
+        }
+    }
+    printf("Avg bucket length %f\n", avg);
+    printf("Range of hash function: %u..%u\n", low, hi);
+}
+
+/*
+ * Allocate a zero-filled symbol table record and point its "symbol" field
+ * at a copy of 'text' stored in the string table (see zzs_strdup()).
+ * The record is not added to the symbol table.  On allocation failure a
+ * message is written to stderr and the process exits with status 1.
+ */
+Sym *
+zzs_new(char *text)
+{
+    char *zzs_strdup(register char *s);
+    Sym *rec = (Sym *) calloc(1,sizeof(Sym));
+
+    if ( rec == 0 )
+    {
+        fprintf(stderr,"Out of memory\n");
+        exit(1);
+    }
+
+    rec->symbol = zzs_strdup(text);
+    return rec;
+}
+
+/*
+ * Convenience wrapper: create a record for 'text' with zzs_new() and
+ * enter it into the symbol table under that same key.  Returns the record.
+ */
+Sym *
+zzs_newadd(char *text)
+{
+    Sym *rec = zzs_new(text);
+
+    if ( rec == NULL ) return rec;
+    zzs_add(text, rec);
+    return rec;
+}
+
+/* Add a string to the string table and return a pointer to it.
+ * Bump the pointer into the string table to next avail position.
+ *
+ * The copy is bounds-checked, including the terminating NUL (so even a
+ * long run of empty strings cannot run off the end of the table).  If
+ * the table is full -- or was never allocated with zzs_init() -- a message
+ * is written to stderr and the process exits with status -1.
+ */
+char *
+zzs_strdup(register char *s)
+{
+    register char *start=strp;
+
+    if ( strings == NULL || strp == NULL )      /* zzs_init() not called */
+    {
+        fprintf(stderr, "sym: string table overflow (%d chars)\n", strsize);
+        exit(-1);
+    }
+
+    while ( *s != '\0' )
+    {
+        if ( strp >= &(strings[strsize-2]) )
+        {
+            fprintf(stderr, "sym: string table overflow (%d chars)\n", strsize);
+            exit(-1);
+        }
+        *strp++ = *s++;
+    }
+
+    /* the terminator needs a slot too; only an empty s can get here full */
+    if ( strp >= strings + strsize )
+    {
+        fprintf(stderr, "sym: string table overflow (%d chars)\n", strsize);
+        exit(-1);
+    }
+    *strp++ = '\0';
+
+    return( start );
+}
diff --git a/btparse/src/sym.h b/btparse/src/sym.h
new file mode 100644
index 0000000..a673b06
--- /dev/null
+++ b/btparse/src/sym.h
@@ -0,0 +1,48 @@
+#include <ctype.h>
+
+/*
+ * Declarations for symbol table in sym.c
+ */
+
+/*
+ * Define some hash function.  HASH_FUN(p, h) folds the NUL-terminated
+ * string at p into h, ignoring case, and leaves p pointing at the
+ * terminating NUL.  h should be an unsigned integer initialized to 0.
+ * The character is converted to unsigned char first because tolower() is
+ * only defined for unsigned char values (and EOF).
+ * Note that the expansion already ends in a semicolon.
+ */
+#ifndef HASH_FUN
+#define HASH_FUN(p, h) while ( *p != '\0' ) h = (h<<1) + tolower ((unsigned char) *p++);
+#endif
+
+/* minimum symbol table record */
+typedef struct _sym
+{
+ char *symbol; /* the macro name */
+ char *text; /* its expansion */
+ /*
+  * next/prev: neighbours in the doubly-linked bucket chain;
+  * head: address of the bucket this record is chained into (NULL when
+  *       the record is not in the table);
+  * scope: next record on the scope list this record was added under.
+  */
+ struct _sym *next, *prev, **head, *scope;
+ /* full hash code of the key, saved by zzs_add() to speed up lookups */
+ unsigned int hash;
+} Sym, *SymPtr;
+
+/*
+ * Public interface of sym.c: full prototypes for ANSI compilers,
+ * old-style declarations otherwise.  Both lists declare the same set of
+ * functions.
+ */
+#ifdef __STDC__
+void zzs_init(int, int);
+void zzs_free(void);
+void zzs_done(void);
+void zzs_add(char *, Sym *);
+Sym *zzs_get(char *);
+void zzs_del(Sym *);
+void zzs_keydel(char *);
+Sym **zzs_scope(Sym **);
+Sym *zzs_rmscope(Sym **);
+void zzs_stat(void);
+Sym *zzs_new(char *);
+Sym *zzs_newadd(char *);
+char *zzs_strdup(char *);
+#else
+void zzs_init();
+void zzs_free();
+void zzs_done();
+void zzs_add();
+Sym *zzs_get();
+void zzs_del();
+void zzs_keydel();
+Sym **zzs_scope();
+Sym *zzs_rmscope();
+void zzs_stat();
+Sym *zzs_new();
+Sym *zzs_newadd();
+char *zzs_strdup();
+#endif
diff --git a/btparse/src/tex_tree.c b/btparse/src/tex_tree.c
new file mode 100644
index 0000000..0ca6bfe
--- /dev/null
+++ b/btparse/src/tex_tree.c
@@ -0,0 +1,414 @@
+/* ------------------------------------------------------------------------
+@NAME : tex_tree.c
+@DESCRIPTION: Functions for dealing with strings of TeX code: converting
+ them to tree representation, traversing the trees to glean
+ useful information, and converting back to string form.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/05/29, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "error.h"
+#include "btparse.h"
+#include "my_dmalloc.h"
+
+/* blech! temp hack until I make error.c perfect and magical */
+#define string_warning(w) fprintf (stderr, w);
+
+/* One entry of the doubly-linked stack of tree nodes that is manipulated
+ * by push_treestack() and pop_treestack(). */
+typedef struct treestack_s
+{
+   bt_tex_tree        * node;     /* the tree node held by this entry */
+   struct treestack_s * prev;     /* entry pushed before this one */
+   struct treestack_s * next;     /* entry pushed after this one */
+} treestack;
+
+
+/* ----------------------------------------------------------------------
+ * Stack manipulation functions
+ */
+
+/* ------------------------------------------------------------------------
+@NAME       : push_treestack()
+@INPUT      : *stack - current top of the stack (NULL if empty)
+              node   - tree node to remember
+@OUTPUT     : *stack - points at the newly pushed entry
+@RETURNS    :
+@DESCRIPTION: Creates and initializes new node in a stack, and pushes it
+              onto the stack.  An allocation failure is reported through
+              internal_error() instead of dereferencing a NULL pointer.
+@GLOBALS    :
+@CALLS      : malloc(), internal_error()
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+push_treestack (treestack **stack, bt_tex_tree *node)
+{
+   treestack *newtop;
+
+   newtop = (treestack *) malloc (sizeof (treestack));
+   if (newtop == NULL)
+   {
+      /* NOTE(review): assumes internal_error() does not return, which
+       * pop_treestack() already relies on -- confirm against error.c */
+      internal_error ("out of memory pushing onto tree stack");
+   }
+
+   newtop->node = node;
+   newtop->next = NULL;
+   newtop->prev = *stack;
+
+   if (*stack != NULL)                  /* stack already has some entries */
+      (*stack)->next = newtop;
+
+   *stack = newtop;
+
+} /* push_treestack() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : pop_treestack
+@INPUT      : *stack - current top of the stack
+@OUTPUT     : *stack - the entry below the old top (NULL if none)
+@RETURNS    : the tree node that was held by the popped entry
+@DESCRIPTION: Removes the top entry from a stack of tex_tree nodes,
+              releases the treestack wrapper with free(), and hands back
+              the tree node it carried.  Popping an empty stack is
+              reported through internal_error().
+@GLOBALS    :
+@CALLS      : free(), internal_error()
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static bt_tex_tree *
+pop_treestack (treestack **stack)
+{
+   treestack *   top;
+   treestack *   below;
+   bt_tex_tree * popped;
+
+   if (*stack == NULL)
+      internal_error ("attempt to pop off empty stack");
+
+   top = *stack;
+   below = top->prev;
+   popped = top->node;
+   free (top);
+
+   if (below != NULL)
+      below->next = NULL;               /* it is the top entry again */
+
+   *stack = below;
+   return popped;
+
+} /* pop_treestack() */
+
+
+/* ----------------------------------------------------------------------
+ * Tree creation/destruction functions
+ */
+
+/* ------------------------------------------------------------------------
+@NAME       : new_tex_tree
+@INPUT      : start - pointer into the source string where this node's
+                      text begins
+@OUTPUT     :
+@RETURNS    : pointer to newly-allocated node
+@DESCRIPTION: Allocates and initializes a bt_tex_tree node: zero length,
+              no child, no sibling.  An allocation failure is reported
+              through internal_error() instead of dereferencing NULL.
+@GLOBALS    :
+@CALLS      : malloc(), internal_error()
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static bt_tex_tree *
+new_tex_tree (char *start)
+{
+   bt_tex_tree * node;
+
+   node = (bt_tex_tree *) malloc (sizeof (bt_tex_tree));
+   if (node == NULL)
+   {
+      /* NOTE(review): assumes internal_error() does not return, which
+       * pop_treestack() already relies on -- confirm against error.c */
+      internal_error ("out of memory allocating TeX tree node");
+   }
+
+   node->start = start;
+   node->len = 0;
+   node->child = node->next = NULL;
+   return node;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_build_tex_tree
+@INPUT      : string
+@OUTPUT     :
+@RETURNS    : pointer to a complete tree; call bt_free_tex_tree() to free
+              the entire tree.  NULL if the braces in string are
+              unbalanced (a message is written via string_warning()).
+@DESCRIPTION: Traverses a string looking for TeX groups ({...}), and builds
+              a tree containing pointers into the string and describing
+              its brace-structure.  The nodes point into `string'; they do
+              not copy it.
+
+              On the error path both the partial tree and any stack
+              entries left over from unclosed groups are released (the
+              stack entries used to be leaked).
+@GLOBALS    :
+@CALLS      : new_tex_tree, push_treestack, pop_treestack,
+              bt_free_tex_tree
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+bt_tex_tree *
+bt_build_tex_tree (char * string)
+{
+   int     i;
+   int     depth;
+   int     len;
+   bt_tex_tree
+         * top,
+         * cur,
+         * new;
+   treestack
+         * stack;
+
+   i = 0;
+   depth = 0;
+   len = strlen (string);
+   top = new_tex_tree (string);
+   stack = NULL;
+
+   cur = top;
+
+   while (i < len)
+   {
+      switch (string[i])
+      {
+         case '{':                      /* go one level deeper */
+         {
+            if (i == len-1)             /* open brace in last character? */
+            {
+               string_warning ("unbalanced braces: { at end of string");
+               goto error;
+            }
+
+            new = new_tex_tree (string+i+1);
+            cur->child = new;
+            push_treestack (&stack, cur);
+            cur = new;
+            depth++;
+            break;
+         }
+         case '}':                      /* pop level(s) off */
+         {
+            /* consume a whole run of closing braces in one go */
+            while (i < len && string[i] == '}')
+            {
+               if (stack == NULL)
+               {
+                  string_warning ("unbalanced braces: extra }");
+                  goto error;
+               }
+               cur = pop_treestack (&stack);
+               depth--;
+               i++;
+            }
+            i--;                        /* back onto the last } of the run */
+
+            if (i == len-1)             /* reached end of string? */
+            {
+               if (depth > 0)           /* but not at depth 0 */
+               {
+                  string_warning ("unbalanced braces: not enough }'s");
+                  goto error;
+               }
+
+               /*
+                * if we get here, do nothing -- we've reached the end of
+                * the string and are at depth 0, so will just fall out
+                * of the while loop at the end of this iteration
+                */
+            }
+            else                        /* still have characters left */
+            {                           /* to worry about */
+               new = new_tex_tree (string+i+1);
+               cur->next = new;
+               cur = new;
+            }
+
+            break;
+         }
+         default:
+         {
+            cur->len++;
+            break;
+         }
+
+      } /* switch */
+
+      i++;
+
+   } /* while i */
+
+   if (depth > 0)
+   {
+      string_warning ("unbalanced braces (not enough }'s)");
+      goto error;
+   }
+
+   return top;
+
+error:
+   /* one wrapper is still on the stack for every group left open */
+   while (stack != NULL)
+      (void) pop_treestack (&stack);
+   bt_free_tex_tree (&top);
+   return NULL;
+
+} /* bt_build_tex_tree() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_free_tex_tree
+@INPUT      : *top
+@OUTPUT     : *top (set to NULL after it's free()'d)
+@RETURNS    :
+@DESCRIPTION: Frees up an entire tree created by bt_build_tex_tree().
+              A NULL `top' or a NULL `*top' (which is what
+              bt_build_tex_tree() returns on error) is accepted and
+              ignored.  Siblings are walked in a loop, so recursion depth
+              is bounded by brace nesting rather than by the number of
+              sibling nodes.
+@GLOBALS    :
+@CALLS      : itself, free()
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+void
+bt_free_tex_tree (bt_tex_tree **top)
+{
+   bt_tex_tree * node;
+   bt_tex_tree * next;
+
+   if (top == NULL)
+      return;
+
+   for (node = *top; node != NULL; node = next)
+   {
+      next = node->next;                /* save link before freeing node */
+      if (node->child) bt_free_tex_tree (&node->child);
+      free (node);
+   }
+
+   *top = NULL;
+}
+
+
+
+/* ----------------------------------------------------------------------
+ * Tree traversal functions
+ */
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_dump_tex_tree
+@INPUT      : node
+              depth
+              stream
+@OUTPUT     :
+@RETURNS    :
+@DESCRIPTION: Dumps a TeX tree: one node per line, as "[text]", indented
+              by two spaces per level of depth.  Children are printed one
+              level deeper than their parent, siblings at the same level.
+              A NULL node prints nothing.
+
+              The node text is printed straight from the source string
+              with a "%.*s" precision, so there is no intermediate buffer
+              and no limit on node length (nodes longer than 255
+              characters used to trigger internal_error()).
+@GLOBALS    :
+@CALLS      : itself
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+void
+bt_dump_tex_tree (bt_tex_tree *node, int depth, FILE *stream)
+{
+   if (node == NULL)
+      return;
+
+   /* precision = node->len limits output to this node's own characters */
+   fprintf (stream, "%*s[%.*s]\n", depth*2, "", (int) node->len, node->start);
+
+   bt_dump_tex_tree (node->child, depth+1, stream);
+   bt_dump_tex_tree (node->next, depth, stream);
+
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : count_length
+@INPUT      : node
+@OUTPUT     :
+@RETURNS    : number of characters (0 for a NULL node)
+@DESCRIPTION: Adds up the characters needed to print the string
+              reconstructed from a TeX tree: this node's own length, two
+              more ({ and }) if it has a child, plus the totals for the
+              child subtree and for the following siblings.
+@GLOBALS    :
+@CALLS      : itself
+@CALLERS    : bt_flatten_tex_tree
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static int
+count_length (bt_tex_tree *node)
+{
+   int total;
+
+   if (node == NULL)
+      return 0;
+
+   total = node->len;
+   if (node->child)
+      total += 2;                       /* the enclosing { and } */
+   total += count_length (node->child);
+   total += count_length (node->next);
+
+   return total;
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : flatten_tree
+@INPUT      : node
+              *offset - position in buf at which to start writing
+@OUTPUT     : *buf
+              *offset - position just past the last character written
+@RETURNS    :
+@DESCRIPTION: Writes the "flat" (string) form of a tree into a
+              pre-allocated buffer, starting at a specified offset: each
+              node's own text, then its child subtree wrapped in { },
+              then the same for each following sibling.  No terminating
+              NUL is written here.
+@GLOBALS    :
+@CALLS      : itself
+@CALLERS    : bt_flatten_tex_tree
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+static void
+flatten_tree (bt_tex_tree *node, char *buf, int *offset)
+{
+   bt_tex_tree * cur = node;
+
+   /* siblings are handled by looping; only children recurse */
+   do
+   {
+      strncpy (buf + *offset, cur->start, cur->len);
+      *offset += cur->len;
+
+      if (cur->child)
+      {
+         buf[*offset] = '{';
+         (*offset)++;
+         flatten_tree (cur->child, buf, offset);
+         buf[*offset] = '}';
+         (*offset)++;
+      }
+
+      cur = cur->next;
+   } while (cur != NULL);
+}
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_flatten_tex_tree
+@INPUT      : top
+@OUTPUT     :
+@RETURNS    : flattened string representation of the tree (as a string
+              allocated with malloc(), so you should free() it when
+              you're done with it); NULL if the allocation fails.  A NULL
+              tree yields an empty string.
+@DESCRIPTION: Counts the number of characters needed for a "flat"
+              string representation of a tree, allocates a string of
+              that size, and generates the string.
+
+              The result is explicitly NUL-terminated here: flatten_tree()
+              copies node text with strncpy() using the exact node length,
+              which does not append a terminator, so the byte reserved by
+              the "+1" was previously left uninitialized.
+@GLOBALS    :
+@CALLS      : count_length, flatten_tree
+@CALLERS    :
+@CREATED    : 1997/05/29, GPW
+@MODIFIED   :
+-------------------------------------------------------------------------- */
+char *
+bt_flatten_tex_tree (bt_tex_tree *top)
+{
+   int    len;
+   int    offset;
+   char * buf;
+
+   len = count_length (top);            /* 0 for a NULL tree */
+   buf = (char *) malloc (sizeof (char) * (len+1));
+   if (buf == NULL)
+      return NULL;
+
+   offset = 0;
+   if (top != NULL)                     /* flatten_tree() requires a node */
+      flatten_tree (top, buf, &offset);
+   buf[offset] = '\0';
+
+   return buf;
+}
diff --git a/btparse/src/tokens.h b/btparse/src/tokens.h
new file mode 100644
index 0000000..dfb49eb
--- /dev/null
+++ b/btparse/src/tokens.h
@@ -0,0 +1,80 @@
+#ifndef tokens_h
+#define tokens_h
+/* tokens.h -- List of labelled tokens and stuff
+ *
+ * Generated from: bibtex.g
+ *
+ * Terence Parr, Will Cohen, and Hank Dietz: 1989-1994
+ * Purdue University Electrical Engineering
+ * ANTLR Version 1.33
+ */
+/* Numeric codes of the labelled tokens.  The numbering has gaps; the
+ * missing values are not defined in this header. */
+#define zzEOF_TOKEN 1
+#define AT 2
+#define COMMENT 4
+#define NUMBER 9
+#define NAME 10
+#define LBRACE 11
+#define RBRACE 12
+#define ENTRY_OPEN 13
+#define ENTRY_CLOSE 14
+#define EQUALS 15
+#define HASH 16
+#define COMMA 17
+#define STRING 25
+
+/* Declarations of the generated parser functions (presumably one per rule
+ * of bibtex.g -- confirm against the grammar).  Each is declared twice:
+ * with an ANSI prototype under __STDC__, old-style otherwise.  The AST and
+ * bt_metatype types are not declared in this header. */
+#ifdef __STDC__
+void bibfile(AST**_root);
+#else
+extern void bibfile();
+#endif
+
+#ifdef __STDC__
+void entry(AST**_root);
+#else
+extern void entry();
+#endif
+
+#ifdef __STDC__
+void body(AST**_root, bt_metatype metatype );
+#else
+extern void body();
+#endif
+
+#ifdef __STDC__
+void contents(AST**_root, bt_metatype metatype );
+#else
+extern void contents();
+#endif
+
+#ifdef __STDC__
+void fields(AST**_root);
+#else
+extern void fields();
+#endif
+
+#ifdef __STDC__
+void field(AST**_root);
+#else
+extern void field();
+#endif
+
+#ifdef __STDC__
+void value(AST**_root);
+#else
+extern void value();
+#endif
+
+#ifdef __STDC__
+void simple_value(AST**_root);
+#else
+extern void simple_value();
+#endif
+
+/* NOTE(review): the include guard opened at the top of the file closes
+ * here, so the extern declarations below sit outside it.  They are plain
+ * declarations, so repeating them on a second inclusion is harmless. */
+#endif
+/* Tables of SetWordType defined elsewhere; SetWordType is not declared in
+ * this header. */
+extern SetWordType zzerr1[];
+extern SetWordType zzerr2[];
+extern SetWordType zzerr3[];
+extern SetWordType zzerr4[];
+extern SetWordType setwd1[];
+extern SetWordType zzerr5[];
+extern SetWordType setwd2[];
diff --git a/btparse/src/traversal.c b/btparse/src/traversal.c
new file mode 100644
index 0000000..11fd03a
--- /dev/null
+++ b/btparse/src/traversal.c
@@ -0,0 +1,187 @@
+/* ------------------------------------------------------------------------
+@NAME : traversal.c
+@DESCRIPTION: Routines for traversing the AST for a single entry.
+@GLOBALS :
+@CALLS :
+@CREATED : 1997/01/21, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+#include "bt_config.h"
+#include <stdlib.h>
+#include "btparse.h"
+#include "parse_auxiliary.h"
+#include "prototypes.h"
+#include "my_dmalloc.h"
+
+
+/* Step through a list of entries.  With no previous entry the list head
+ * is returned; otherwise the node to the right of prev_entry.  Returns
+ * NULL if entry_list is NULL or not an entry node, or if prev_entry is
+ * given but is not an entry node. */
+AST *bt_next_entry (AST *entry_list, AST *prev_entry)
+{
+   if (entry_list == NULL || entry_list->nodetype != BTAST_ENTRY)
+      return NULL;
+
+   if (prev_entry == NULL)              /* first call: start of the list */
+      return entry_list;
+
+   return (prev_entry->nodetype == BTAST_ENTRY) ? prev_entry->right : NULL;
+}
+
+
+/* Return the metatype stored in an entry node, or BTE_UNKNOWN when entry
+ * is NULL or is not an entry node. */
+bt_metatype bt_entry_metatype (AST *entry)
+{
+   if (entry == NULL || entry->nodetype != BTAST_ENTRY)
+      return BTE_UNKNOWN;
+
+   return entry->metatype;
+}
+
+
+/* Return the text stored in an entry node, or NULL when entry is NULL or
+ * is not an entry node.  The pointer refers to the node's own storage. */
+char *bt_entry_type (AST *entry)
+{
+   if (entry == NULL || entry->nodetype != BTAST_ENTRY)
+      return NULL;
+
+   return entry->text;
+}
+
+
+/* Return the citation key of a regular entry: the text of the entry's
+ * first child when that child is a BTAST_KEY node.  Returns NULL for a
+ * NULL entry (previously a NULL dereference, unlike the other accessors
+ * in this file), for entries whose metatype is not BTE_REGULAR, and for
+ * entries without a key node.  The pointer refers to the key node's own
+ * storage. */
+char *bt_entry_key (AST *entry)
+{
+   if (!entry) return NULL;
+
+   if (entry->metatype == BTE_REGULAR &&
+       entry->down && entry->down->nodetype == BTAST_KEY)
+   {
+      return entry->down->text;
+   }
+   else
+   {
+      return NULL;
+   }
+}
+
+
+/* Step through the fields of a regular or macro-definition entry.
+ *
+ * entry - entry node whose children are walked
+ * prev  - node returned by the previous call, or NULL to get the first
+ *         field (a leading BTAST_KEY child of a regular entry is skipped)
+ * name  - optional; when non-NULL, *name receives the returned node's
+ *         text, or NULL when nothing is returned
+ *
+ * Returns the field node, or NULL when entry is NULL, has no children,
+ * has any other metatype, or has no further fields.
+ *
+ * `name' used to be dereferenced unconditionally on entry even though the
+ * final assignment treats it as optional; both are now guarded.
+ */
+AST *bt_next_field (AST *entry, AST *prev, char **name)
+{
+   AST *field;
+   bt_metatype metatype;
+
+   if (name) *name = NULL;
+   if (!entry || !entry->down) return NULL; /* protect against empty entry */
+
+   metatype = entry->metatype;
+   if (metatype != BTE_MACRODEF && metatype != BTE_REGULAR)
+      return NULL;
+
+   if (prev == NULL)                    /* no previous field -- they must */
+   {                                    /* want the first one */
+      field = entry->down;
+      if (metatype == BTE_REGULAR && field->nodetype == BTAST_KEY)
+         field = field->right;          /* skip over citation key if present */
+   }
+   else                                 /* they really do want the next one */
+   {
+      field = prev->right;
+   }
+
+   if (!field) return NULL;             /* protect against field-less entry */
+   if (name) *name = field->text;
+   return field;
+} /* bt_next_field() */
+
+
+/* Alias for bt_next_field(): forwards entry, prev and name unchanged and
+ * returns whatever bt_next_field() returns. */
+AST *bt_next_macro (AST *entry, AST *prev, char **name)
+{
+ return bt_next_field (entry, prev, name);
+}
+
+
+/* Step through the values hanging off a field node, or off a comment or
+ * preamble entry.
+ *
+ * top      - field node, or entry node of metatype BTE_COMMENT/BTE_PREAMBLE
+ * prev     - node returned by the previous call, or NULL for the first
+ * nodetype - optional; receives the value's node type (BTAST_BOGUS when
+ *            no value is returned)
+ * text     - optional; receives the value's text (NULL when no value is
+ *            returned)
+ *
+ * Returns the value node, or NULL when top is NULL, is of any other kind,
+ * or has no further values.  A comment/preamble value that is not a
+ * BTAST_STRING is reported through internal_error().
+ */
+AST *bt_next_value (AST *top, AST *prev, bt_nodetype *nodetype, char **text)
+{
+   bt_nodetype top_type;                /* type of `top' node (to check) */
+   bt_metatype top_meta;
+   AST *       value;
+
+   if (nodetype) *nodetype = BTAST_BOGUS;
+   if (text) *text = NULL;
+
+   if (!top) return NULL;
+
+   top_type = top->nodetype;
+   top_meta = top->metatype;
+
+   /* only fields and comment/preamble entries carry values */
+   if (top_type != BTAST_FIELD &&
+       !(top_type == BTAST_ENTRY &&
+         (top_meta == BTE_COMMENT || top_meta == BTE_PREAMBLE)))
+   {
+      return NULL;
+   }
+
+   value = (prev == NULL) ? top->down : prev->right;
+   if (!value) return NULL;
+   if (nodetype) *nodetype = value->nodetype;
+
+   if (top_type == BTAST_ENTRY && value->nodetype != BTAST_STRING)
+      internal_error ("found comment or preamble with non-string value");
+
+   if (text) *text = value->text;
+
+   return value;
+} /* bt_next_value() */
+
+
+/* Return the fully post-processed text of a node.
+ *
+ * For a field node this is the result of bt_postprocess_field(); for a
+ * comment or preamble entry it is the result of bt_postprocess_value() on
+ * the entry's first child.  Both are called with BTO_FULL and FALSE.
+ * Any other node, and a NULL node (previously a NULL dereference), yields
+ * NULL.
+ */
+char *bt_get_text (AST *node)
+{
+   btshort pp_options = BTO_FULL;       /* options for full processing: */
+                                        /* expand macros, paste strings, */
+                                        /* collapse whitespace */
+   bt_nodetype nt;
+   bt_metatype mt;
+
+   if (!node) return NULL;
+
+   nt = node->nodetype;
+   mt = node->metatype;
+
+   if (nt == BTAST_FIELD)
+   {
+#if DEBUG
+      char *value;
+
+      dump_ast ("bt_get_text (pre): node =\n", node);
+      value = bt_postprocess_field (node, pp_options, FALSE);
+      dump_ast ("bt_get_text (post): node =\n", node);
+      return value;
+#else
+      return bt_postprocess_field (node, pp_options, FALSE);
+#endif
+   }
+   else if (nt == BTAST_ENTRY && (mt == BTE_COMMENT || mt == BTE_PREAMBLE))
+   {
+      return bt_postprocess_value (node->down, pp_options, FALSE);
+   }
+   else
+   {
+      return NULL;
+   }
+}
diff --git a/btparse/src/util.c b/btparse/src/util.c
new file mode 100644
index 0000000..e568e76
--- /dev/null
+++ b/btparse/src/util.c
@@ -0,0 +1,1180 @@
+/*
+ * ------------------------------------------------------------------------
+ * @NAME       : util.c
+ * @INPUT      :
+ * @OUTPUT     :
+ * @RETURNS    :
+ * @DESCRIPTION: Miscellaneous utility functions, including strlwr(),
+ *               strupr(), get_uchar() and isulower().
+ * @CREATED    : Summer 1996, Greg Ward
+ * @MODIFIED   :
+ * @VERSION    : $Id$
+ * @COPYRIGHT  : Copyright (c) 1996-99 by Gregory P. Ward.  All rights
+ *               reserved.
+ *
+ *               This file is part of the btparse library.  This library
+ *               is free software; you can redistribute it and/or modify
+ *               it under the terms of the GNU Library General Public
+ *               License as published by the Free Software Foundation;
+ *               either version 2 of the License, or (at your option) any
+ *               later version.
+ * --------------------------------------------------------------------------
+ */
+
+#include "bt_config.h"
+#include <string.h>
+#include <ctype.h>
+#include "prototypes.h"
+#include "my_dmalloc.h"
+
+/*
+ * ------------------------------------------------------------------------
+ * @NAME       : strlwr()
+ * @INPUT      : s - NUL-terminated string (modified in place)
+ * @OUTPUT     : s
+ * @RETURNS    : s (NULL if s is NULL)
+ * @DESCRIPTION: Converts a string to lowercase in place.
+ * @GLOBALS    :
+ * @CALLS      : tolower()
+ * @CREATED    : 1996/01/06, GPW
+ * @MODIFIED   :
+ * @COMMENTS   : This should work the same as strlwr() in DOS compilers --
+ *               why this isn't mandated by ANSI is a mystery to me...
+ *
+ *               Each byte is converted to unsigned char before being
+ *               passed to tolower(); handing it a negative plain-char
+ *               value (any byte >= 0x80 where char is signed) is
+ *               undefined behaviour.
+ * --------------------------------------------------------------------------
+ */
+#if !HAVE_STRLWR
+char *
+strlwr(char *s)
+{
+    char *p;
+
+    if (s == NULL)
+        return NULL;
+
+    for (p = s; *p != '\0'; p++)
+        *p = (char) tolower((unsigned char) *p);
+
+    return s;
+}
+#endif
+
+
+
+/*
+ * ------------------------------------------------------------------------
+ * @NAME       : strupr()
+ * @INPUT      : s - NUL-terminated string (modified in place)
+ * @OUTPUT     : s
+ * @RETURNS    : s (NULL if s is NULL)
+ * @DESCRIPTION: Converts a string to uppercase in place.
+ * @GLOBALS    :
+ * @CALLS      : toupper()
+ * @CREATED    : 1996/01/06, GPW
+ * @MODIFIED   :
+ * @COMMENTS   : This should work the same as strupr() in DOS compilers --
+ *               why this isn't mandated by ANSI is a mystery to me...
+ *
+ *               Each byte is converted to unsigned char before being
+ *               passed to toupper(); handing it a negative plain-char
+ *               value (any byte >= 0x80 where char is signed) is
+ *               undefined behaviour.
+ * --------------------------------------------------------------------------
+ */
+#if !HAVE_STRUPR
+char *
+strupr(char *s)
+{
+    char *p;
+
+    if (s == NULL)
+        return NULL;
+
+    for (p = s; *p != '\0'; p++)
+        *p = (char) toupper((unsigned char) *p);
+
+    return s;
+}
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * @NAME       : get_uchar()
+ * @INPUT      : string
+ *               offset in string
+ * @OUTPUT     :
+ * @RETURNS    : number of bytes required to gobble the next unicode
+ *               character, including any combining marks.  0 if string
+ *               is NULL or offset is at the terminating NUL.
+ * @DESCRIPTION: In order to deal with unicode chars when calculating
+ *               abbreviations, we need to know how many bytes the next
+ *               character is.
+ *
+ *               A byte that starts neither printable ASCII/tab/LF/CR nor
+ *               a well-formed UTF-8 sequence (other control bytes, stray
+ *               continuation bytes, overlong or truncated sequences) is
+ *               counted as one byte, so a caller stepping through the
+ *               string always advances.  The length used to be left
+ *               uninitialized in that case.
+ *
+ *               Combining Diacritical Marks U+0300-U+036F are encoded as
+ *               CC 80..BF and CD 80..AF; the whole block is now
+ *               recognised (only CC 80..AF used to be).
+ * @CALLS      :
+ * @CALLERS    : count_virtual_char()
+ * @CREATED    : 2010/03/14, PK
+ * @MODIFIED   :
+ * --------------------------------------------------------------------------
+ */
+
+/* True if lo <= b <= hi. */
+static int
+get_uchar_between(unsigned char b, unsigned char lo, unsigned char hi)
+{
+    return lo <= b && b <= hi;
+}
+
+/* True if b is a UTF-8 continuation byte (0x80..0xBF). */
+static int
+get_uchar_cont(unsigned char b)
+{
+    return 0x80 <= b && b <= 0xBF;
+}
+
+int
+get_uchar(char *string, int offset)
+{
+    unsigned char  *bytes = (unsigned char *)string;
+    const unsigned char *p;
+    int             init;
+    unsigned int    c = 0;
+    //Without unsigned, for some reason Solaris coredumps
+
+    if (!string)
+        return 0;
+
+    p = bytes + offset;
+    if (p[0] == 0)              /* at the terminator: nothing to gobble */
+        return 0;
+
+    /*
+     * The && chains short-circuit, so a later byte is only read once every
+     * earlier byte of the sequence has been seen to be non-NUL.
+     */
+    if (p[0] == 0x09 || p[0] == 0x0A || p[0] == 0x0D ||
+        get_uchar_between(p[0], 0x20, 0x7E)) {
+        init = 1;               /* ASCII */
+    } else if (get_uchar_between(p[0], 0xC2, 0xDF) &&
+               get_uchar_cont(p[1])) {
+        init = 2;               /* non-overlong 2-byte */
+    } else if ((p[0] == 0xE0 &&         /* excluding overlongs */
+                get_uchar_between(p[1], 0xA0, 0xBF) &&
+                get_uchar_cont(p[2])) ||
+               ((get_uchar_between(p[0], 0xE1, 0xEC) ||  /* straight 3-byte */
+                 p[0] == 0xEE ||
+                 p[0] == 0xEF) &&
+                get_uchar_cont(p[1]) &&
+                get_uchar_cont(p[2])) ||
+               (p[0] == 0xED &&         /* excluding surrogates */
+                get_uchar_between(p[1], 0x80, 0x9F) &&
+                get_uchar_cont(p[2]))) {
+        init = 3;
+    } else if ((p[0] == 0xF0 &&         /* planes 1-3 */
+                get_uchar_between(p[1], 0x90, 0xBF) &&
+                get_uchar_cont(p[2]) &&
+                get_uchar_cont(p[3])) ||
+               (get_uchar_between(p[0], 0xF1, 0xF3) &&   /* planes 4-15 */
+                get_uchar_cont(p[1]) &&
+                get_uchar_cont(p[2]) &&
+                get_uchar_cont(p[3])) ||
+               (p[0] == 0xF4 &&         /* plane 16 */
+                get_uchar_between(p[1], 0x80, 0x8F) &&
+                get_uchar_cont(p[2]) &&
+                get_uchar_cont(p[3]))) {
+        init = 4;
+    } else {
+        init = 1;               /* unrecognised byte: step over it alone */
+    }
+
+    /* Now check for combining marks which are separate even in NFC */
+    for (;;) {
+        const unsigned char *m = p + init + c;
+
+        /* 0300-036F - Combining Diacritical Marks */
+        if ((m[0] == 0xCC && get_uchar_between(m[1], 0x80, 0xBF)) ||
+            (m[0] == 0xCD && get_uchar_between(m[1], 0x80, 0xAF))) {
+            c = c + 2;          /* Skip to next possible combining mark */
+        }
+        /* 1DC0-1DFF - Combining Diacritical Marks Supplement */
+        else if (m[0] == 0xE1 && m[1] == 0xB7 &&
+                 get_uchar_between(m[2], 0x80, 0xBF)) {
+            c = c + 3;          /* Skip to next possible combining mark */
+        }
+        /* FE20-FE2F - Combining Half Marks */
+        else if (m[0] == 0xEF && m[1] == 0xB8 &&
+                 get_uchar_between(m[2], 0xA0, 0xAF)) {
+            c = c + 3;          /* Skip to next possible combining mark */
+        } else {
+            break;              /* also taken at the terminating NUL */
+        }
+    }
+    return init + c;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * @NAME : isulower() @INPUT : some bytes @OUTPUT : @RETURNS
+ * : boolean 1 or 0 @DESCRIPTION: Passed some bytes, returns 1 of the first
+ * UTF-8 char is lowercase The code was autogenerated from a dump of perl's
+ * fabulous unichars -a '\p{Ll}', massaged into bytes and printed. This list
+ * of lowercased property glyphs is from Unicode 6.2.0 @CALLS : @CALLERS
+ * : find_lc_tokens() @CREATED : 2014/02/27, PK @MODIFIED :
+ * --------------------------------------------------------------------------
+ */
+int
+isulower(char *string)
+{
+ unsigned char *bytes = (unsigned char *)string;
+
+ if (!string)
+ return 0;
+
+ if (
+ (0x61 <= bytes[0] && bytes[0] <= 0x7A)
+ ) {
+ return 1;
+ }
+ if (
+ (
+ bytes[0] == 0xC2 &&
+ (
+ bytes[1] == 0xB5
+ )
+ ) ||
+ (
+ bytes[0] == 0xC3 &&
+ (
+ (0x9F <= bytes[1] && bytes[1] <= 0xB6) ||
+ (0xB8 <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xC4 &&
+ (
+ bytes[1] == 0x81 ||
+ bytes[1] == 0x83 ||
+ bytes[1] == 0x85 ||
+ bytes[1] == 0x87 ||
+ bytes[1] == 0x89 ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ bytes[1] == 0x8F ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ (0xB7 <= bytes[1] && bytes[1] <= 0xB8) ||
+ bytes[1] == 0xBA ||
+ bytes[1] == 0xBC ||
+ bytes[1] == 0xBE
+ )
+ ) ||
+ (
+ bytes[0] == 0xC5 &&
+ (
+ bytes[1] == 0x80 ||
+ bytes[1] == 0x82 ||
+ bytes[1] == 0x84 ||
+ bytes[1] == 0x86 ||
+ (0x88 <= bytes[1] && bytes[1] <= 0x89) ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ bytes[1] == 0x8F ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB7 ||
+ bytes[1] == 0xBA ||
+ bytes[1] == 0xBC ||
+ (0xBE <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xC6 &&
+ (
+ bytes[1] == 0x80 ||
+ bytes[1] == 0x83 ||
+ bytes[1] == 0x85 ||
+ bytes[1] == 0x88 ||
+ (0x8C <= bytes[1] && bytes[1] <= 0x8D) ||
+ bytes[1] == 0x92 ||
+ bytes[1] == 0x95 ||
+ (0x99 <= bytes[1] && bytes[1] <= 0x9B) ||
+ bytes[1] == 0x9E ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA8 ||
+ (0xAA <= bytes[1] && bytes[1] <= 0xAB) ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xB0 ||
+ bytes[1] == 0xB4 ||
+ bytes[1] == 0xB6 ||
+ (0xB9 <= bytes[1] && bytes[1] <= 0xBA) ||
+ (0xBD <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xC7 &&
+ (
+ bytes[1] == 0x86 ||
+ bytes[1] == 0x89 ||
+ bytes[1] == 0x8C ||
+ bytes[1] == 0x8E ||
+ bytes[1] == 0x90 ||
+ bytes[1] == 0x92 ||
+ bytes[1] == 0x94 ||
+ bytes[1] == 0x96 ||
+ bytes[1] == 0x98 ||
+ bytes[1] == 0x9A ||
+ (0x9C <= bytes[1] && bytes[1] <= 0x9D) ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ (0xAF <= bytes[1] && bytes[1] <= 0xB0) ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB9 ||
+ bytes[1] == 0xBB ||
+ bytes[1] == 0xBD ||
+ bytes[1] == 0xBF
+ )
+ ) ||
+ (
+ bytes[0] == 0xC8 &&
+ (
+ bytes[1] == 0x81 ||
+ bytes[1] == 0x83 ||
+ bytes[1] == 0x85 ||
+ bytes[1] == 0x87 ||
+ bytes[1] == 0x89 ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ bytes[1] == 0x8F ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ (0xB3 <= bytes[1] && bytes[1] <= 0xB9) ||
+ bytes[1] == 0xBC ||
+ bytes[1] == 0xBF
+ )
+ ) ||
+ (
+ bytes[0] == 0xC9 &&
+ (
+ bytes[1] == 0x80 ||
+ bytes[1] == 0x82 ||
+ bytes[1] == 0x87 ||
+ bytes[1] == 0x89 ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ (0x8F <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xCA &&
+ (
+ (0x80 <= bytes[1] && bytes[1] <= 0x93) ||
+ (0x95 <= bytes[1] && bytes[1] <= 0xAF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xCD &&
+ (
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB7 ||
+ (0xBB <= bytes[1] && bytes[1] <= 0xBD)
+ )
+ ) ||
+ (
+ bytes[0] == 0xCE &&
+ (
+ bytes[1] == 0x90 ||
+ (0xAC <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xCF &&
+ (
+ (0x80 <= bytes[1] && bytes[1] <= 0x8E) ||
+ (0x90 <= bytes[1] && bytes[1] <= 0x91) ||
+ (0x95 <= bytes[1] && bytes[1] <= 0x97) ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ (0xAF <= bytes[1] && bytes[1] <= 0xB3) ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB8 ||
+ (0xBB <= bytes[1] && bytes[1] <= 0xBC)
+ )
+ ) ||
+ (
+ bytes[0] == 0xD0 &&
+ (
+ (0xB0 <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xD1 &&
+ (
+ (0x80 <= bytes[1] && bytes[1] <= 0x9F) ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB7 ||
+ bytes[1] == 0xB9 ||
+ bytes[1] == 0xBB ||
+ bytes[1] == 0xBD ||
+ bytes[1] == 0xBF
+ )
+ ) ||
+ (
+ bytes[0] == 0xD2 &&
+ (
+ bytes[1] == 0x81 ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ bytes[1] == 0x8F ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB7 ||
+ bytes[1] == 0xB9 ||
+ bytes[1] == 0xBB ||
+ bytes[1] == 0xBD ||
+ bytes[1] == 0xBF
+ )
+ ) ||
+ (
+ bytes[0] == 0xD3 &&
+ (
+ bytes[1] == 0x82 ||
+ bytes[1] == 0x84 ||
+ bytes[1] == 0x86 ||
+ bytes[1] == 0x88 ||
+ bytes[1] == 0x8A ||
+ bytes[1] == 0x8C ||
+ (0x8E <= bytes[1] && bytes[1] <= 0x8F) ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7 ||
+ bytes[1] == 0xA9 ||
+ bytes[1] == 0xAB ||
+ bytes[1] == 0xAD ||
+ bytes[1] == 0xAF ||
+ bytes[1] == 0xB1 ||
+ bytes[1] == 0xB3 ||
+ bytes[1] == 0xB5 ||
+ bytes[1] == 0xB7 ||
+ bytes[1] == 0xB9 ||
+ bytes[1] == 0xBB ||
+ bytes[1] == 0xBD ||
+ bytes[1] == 0xBF
+ )
+ ) ||
+ (
+ bytes[0] == 0xD4 &&
+ (
+ bytes[1] == 0x81 ||
+ bytes[1] == 0x83 ||
+ bytes[1] == 0x85 ||
+ bytes[1] == 0x87 ||
+ bytes[1] == 0x89 ||
+ bytes[1] == 0x8B ||
+ bytes[1] == 0x8D ||
+ bytes[1] == 0x8F ||
+ bytes[1] == 0x91 ||
+ bytes[1] == 0x93 ||
+ bytes[1] == 0x95 ||
+ bytes[1] == 0x97 ||
+ bytes[1] == 0x99 ||
+ bytes[1] == 0x9B ||
+ bytes[1] == 0x9D ||
+ bytes[1] == 0x9F ||
+ bytes[1] == 0xA1 ||
+ bytes[1] == 0xA3 ||
+ bytes[1] == 0xA5 ||
+ bytes[1] == 0xA7
+ )
+ ) ||
+ (
+ bytes[0] == 0xD5 &&
+ (
+ (0xA1 <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[0] == 0xD6 &&
+ (
+ (0x80 <= bytes[1] && bytes[1] <= 0x87)
+ )
+ )
+ ) {
+ return 1;
+ }
+ if (
+ (
+ bytes[0] == 0xE1 && (
+ (bytes[1] == 0xB4 && 0x80 <= bytes[2] && bytes[2] <= 0xAB) ||
+ (
+ bytes[1] == 0xB5 &&
+ (
+ (0xAB <= bytes[2] && bytes[2] <= 0xB7) ||
+ (0xB9 <= bytes[2] && bytes[2] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[1] == 0xB6 &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x9A)
+ )
+ ) ||
+ (
+ bytes[1] == 0xB8 &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB1 ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0xB9 &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB1 ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0xBA &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ (0x95 <= bytes[2] && bytes[2] <= 0x9D) ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB1 ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0xBB &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB1 ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0xBC &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x87) ||
+ (0x90 <= bytes[2] && bytes[2] <= 0x95) ||
+ (0xA0 <= bytes[2] && bytes[2] <= 0xA7) ||
+ (0xB0 <= bytes[2] && bytes[2] <= 0xB7)
+ )
+ ) ||
+ (
+ bytes[1] == 0xBD &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x85) ||
+ (0x90 <= bytes[2] && bytes[2] <= 0x97) ||
+ (0xA0 <= bytes[2] && bytes[2] <= 0xA7) ||
+ (0xB0 <= bytes[2] && bytes[2] <= 0xBD)
+ )
+ ) ||
+ (
+ bytes[1] == 0xBE &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x87) ||
+ (0x90 <= bytes[2] && bytes[2] <= 0x97) ||
+ (0xA0 <= bytes[2] && bytes[2] <= 0xA7) ||
+ (0xB0 <= bytes[2] && bytes[2] <= 0xB4) ||
+ (0xB6 <= bytes[2] && bytes[2] <= 0xB7) ||
+ bytes[2] == 0xBE
+ )
+ ) ||
+ (
+ bytes[1] == 0xBF &&
+ (
+ (0x82 <= bytes[2] && bytes[2] <= 0x84) ||
+ (0x86 <= bytes[2] && bytes[2] <= 0x87) ||
+ (0x90 <= bytes[2] && bytes[2] <= 0x93) ||
+ (0x96 <= bytes[2] && bytes[2] <= 0x97) ||
+ (0xA0 <= bytes[2] && bytes[2] <= 0xA7) ||
+ (0xB2 <= bytes[2] && bytes[2] <= 0xB4) ||
+ (0xB6 <= bytes[2] && bytes[2] <= 0xB7)
+ )
+ )
+ )
+ ) ||
+ (
+ bytes[0] == 0xE2 &&
+ ((
+ bytes[1] == 0x84 &&
+ (
+ bytes[2] == 0x8A ||
+ (0x8E <= bytes[2] && bytes[2] <= 0x8F) ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB4 ||
+ bytes[2] == 0xB9 ||
+ (0xBC <= bytes[2] && bytes[2] <= 0xBD)
+ )
+ ) ||
+ (
+ bytes[1] == 0x85 &&
+ (
+ (0x86 <= bytes[2] && bytes[2] <= 0x89) ||
+ bytes[2] == 0x8E
+ )
+ ) ||
+ (
+ bytes[1] == 0x86 &&
+ (
+ bytes[2] == 0x84
+ )
+ ) ||
+ (
+ bytes[1] == 0xB0 &&
+ (
+ (0xB0 <= bytes[2] && bytes[2] <= 0xBF)
+ )
+ ) ||
+ (
+ bytes[1] == 0xB1 &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x9E) ||
+ bytes[2] == 0xA1 ||
+ (0xA5 <= bytes[2] && bytes[2] <= 0xA6) ||
+ bytes[2] == 0xA8 ||
+ bytes[2] == 0xAA ||
+ bytes[2] == 0xAC ||
+ bytes[2] == 0xB1 ||
+ (0xB3 <= bytes[2] && bytes[2] <= 0xB4) ||
+ (0xB6 <= bytes[2] && bytes[2] <= 0xBB)
+ )
+ ) ||
+ (
+ bytes[1] == 0xB2 &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ bytes[2] == 0xB1 ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0xB3 &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ (0xA3 <= bytes[2] && bytes[2] <= 0xA4) ||
+ bytes[2] == 0xAC ||
+ bytes[2] == 0xAE ||
+ bytes[2] == 0xB3
+ )
+ ) ||
+ (
+ bytes[1] == 0xB4 &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0xA5) ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xAD
+ )
+ )
+ )) ||
+ (
+ bytes[0] == 0xEA &&
+ ((
+ bytes[1] == 0x99 &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD
+ )
+ ) ||
+ (
+ bytes[1] == 0x9A &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97
+ )
+ ) ||
+ (
+ bytes[1] == 0x9C &&
+ (
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ (0xAF <= bytes[2] && bytes[2] <= 0xB1) ||
+ bytes[2] == 0xB3 ||
+ bytes[2] == 0xB5 ||
+ bytes[2] == 0xB7 ||
+ bytes[2] == 0xB9 ||
+ bytes[2] == 0xBB ||
+ bytes[2] == 0xBD ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0x9D &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x89 ||
+ bytes[2] == 0x8B ||
+ bytes[2] == 0x8D ||
+ bytes[2] == 0x8F ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0x95 ||
+ bytes[2] == 0x97 ||
+ bytes[2] == 0x99 ||
+ bytes[2] == 0x9B ||
+ bytes[2] == 0x9D ||
+ bytes[2] == 0x9F ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9 ||
+ bytes[2] == 0xAB ||
+ bytes[2] == 0xAD ||
+ bytes[2] == 0xAF ||
+ (0xB1 <= bytes[2] && bytes[2] <= 0xB8) ||
+ bytes[2] == 0xBA ||
+ bytes[2] == 0xBC ||
+ bytes[2] == 0xBF
+ )
+ ) ||
+ (
+ bytes[1] == 0x9E &&
+ (
+ bytes[2] == 0x81 ||
+ bytes[2] == 0x83 ||
+ bytes[2] == 0x85 ||
+ bytes[2] == 0x87 ||
+ bytes[2] == 0x8C ||
+ bytes[2] == 0x8E ||
+ bytes[2] == 0x91 ||
+ bytes[2] == 0x93 ||
+ bytes[2] == 0xA1 ||
+ bytes[2] == 0xA3 ||
+ bytes[2] == 0xA5 ||
+ bytes[2] == 0xA7 ||
+ bytes[2] == 0xA9
+ )
+ ) ||
+ (
+ bytes[1] == 0x9F &&
+ (
+ bytes[2] == 0xBA
+ )
+ )
+ )) ||
+ (
+ bytes[0] == 0xEF &&
+ ((
+ bytes[1] == 0xAC &&
+ (
+ (0x80 <= bytes[2] && bytes[2] <= 0x86) ||
+ (0x93 <= bytes[2] && bytes[2] <= 0x97)
+ )
+ ) ||
+ (
+ bytes[1] == 0xBD &&
+ (
+ (0x81 <= bytes[2] && bytes[2] <= 0x9A)
+ )
+ ))
+ )) {
+ return 1;
+ }
+ if (
+ (
+ bytes[0] == 0xF0
+ &&
+ (
+ (
+ bytes[1] == 0x90
+ &&
+ (
+ (bytes[2] == 0x90 && 0xA8 <= bytes[3] && bytes[3] <= 0xBF)
+ ||
+ (bytes[2] == 0x91 && 0x80 <= bytes[3] && bytes[3] <= 0x8F)
+ )
+ )
+ ||
+ (
+ bytes[1] == 0x9D
+ && (
+ (bytes[2] == 0x90 && 0x9A <= bytes[3] && bytes[3] <= 0xB3)
+ ||
+ (
+ bytes[2] == 0x91 &&
+ (
+ (0x8E <= bytes[3] && bytes[3] <= 0x94)
+ ||
+ (0x96 <= bytes[3] && bytes[3] <= 0xA7)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x92 &&
+ (
+ (0x82 <= bytes[3] && bytes[3] <= 0x9B)
+ ||
+ (0xB6 <= bytes[3] && bytes[3] <= 0xB9)
+ ||
+ bytes[3] == 0xBB
+ ||
+ (0xBD <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x93 &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x83) ||
+ (0x85 <= bytes[3] && bytes[3] <= 0x8F) ||
+ (0xAA <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x94 &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x83) ||
+ (0x9E <= bytes[3] && bytes[3] <= 0xB7)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x95 && 0x92 <= bytes[3] && bytes[3] <= 0xAB
+ )
+ ||
+ (
+ bytes[2] == 0x96 &&
+ (
+ (0x86 <= bytes[3] && bytes[3] <= 0x9F) ||
+ (0xBA <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x97 &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x93) ||
+ (0xAE <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x98 &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x87) ||
+ (0xA2 <= bytes[3] && bytes[3] <= 0xBB)
+ )
+ )
+ ||
+ (bytes[2] == 0x99 && 0x96 <= bytes[3] && bytes[3] <= 0xAF)
+ ||
+ (bytes[2] == 0x9A && 0x8A <= bytes[3] && bytes[3] <= 0xA5)
+ ||
+ (
+ bytes[2] == 0x9B &&
+ (
+ (0x82 <= bytes[3] && bytes[3] <= 0x9A) ||
+ (0x9C <= bytes[3] && bytes[3] <= 0xA1) ||
+ (0xBC <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x9C &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x94) ||
+ (0x96 <= bytes[3] && bytes[3] <= 0x9B) ||
+ (0xB6 <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x9D &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x8E) ||
+ (0x90 <= bytes[3] && bytes[3] <= 0x95) ||
+ (0xB0 <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x9E &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x88) ||
+ (0x8A <= bytes[3] && bytes[3] <= 0x8F) ||
+ (0xAA <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ )
+ ||
+ (
+ bytes[2] == 0x9F &&
+ (
+ (0x80 <= bytes[3] && bytes[3] <= 0x82) ||
+ (0x84 <= bytes[3] && bytes[3] <= 0x89) ||
+ bytes[3] == 0x8B
+ )
+ )
+ )
+ ))
+ )
+ ) {
+ return 1;
+ }
+ return 0;
+ //no lowercase character
+} /* isulower */
diff --git a/btparse/src/util.h b/btparse/src/util.h
new file mode 100644
index 0000000..fc0a665
--- /dev/null
+++ b/btparse/src/util.h
@@ -0,0 +1,7 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+/*
+ * String case helpers.
+ * NOTE(review): presumably these convert `s` in place and return it, like
+ * the non-standard strlwr()/strupr() shipped by some C libraries -- confirm
+ * against their definitions before relying on that.
+ */
+char *strlwr (char *s);
+char *strupr (char *s);
+
+#endif /* UTIL_H */
diff --git a/btparse/tests/case_test.c b/btparse/tests/case_test.c
new file mode 100644
index 0000000..bb7143a
--- /dev/null
+++ b/btparse/tests/case_test.c
@@ -0,0 +1,50 @@
+/*
+ * case_test.c
+ *
+ * GPW 1997/11/25
+ *
+ * $Id$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "btparse.h"
+
+
+/*
+ * show_case_changed()
+ *
+ * Prints `msg` followed by a case-converted copy of `string`.
+ * bt_change_case() is given a private duplicate, so the caller's string is
+ * left untouched and can be reused for the next transformation.
+ *
+ *   transform  transformation code forwarded to bt_change_case()
+ *              (main() passes 'l', 'u' and 't')
+ *   msg        label printed in front of the converted text
+ *   string     text to convert; not modified
+ *
+ * If the duplicate cannot be allocated, an error is written to stderr and
+ * nothing is printed on stdout.
+ */
+static void
+show_case_changed (char transform, char * msg, char * string)
+{
+   char * dup;
+
+   dup = strdup (string);
+   if (dup == NULL)                     /* don't hand NULL to the library */
+   {
+      fprintf (stderr, "case_test: out of memory\n");
+      return;
+   }
+
+   bt_change_case (transform, dup, 0);
+   printf ("%s%s\n", msg, dup);
+   free (dup);
+}
+
+
+/*
+ * main()
+ *
+ * Reads lines from stdin and, for each one, prints the original text
+ * followed by the three variants produced by show_case_changed() with the
+ * 'l', 'u' and 't' transformations.  Always returns 0.
+ */
+int
+main (void)
+{
+   char   line[1024];
+   size_t len;
+
+   /* fgets() returns NULL at end of file or on error, so it is the only
+    * loop condition needed (no separate feof() test). */
+   while (fgets (line, sizeof line, stdin) != NULL)
+   {
+      /* Drop the trailing newline, if any.  The length check keeps us
+       * from indexing line[-1] when the input line starts with a NUL. */
+      len = strlen (line);
+      if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';
+
+      printf ("original_string = %s\n", line);
+
+      show_case_changed ('l', " lowercase = ", line);
+      show_case_changed ('u', " uppercase = ", line);
+      show_case_changed ('t', " title caps = ", line);
+   }
+   return 0;
+}
diff --git a/btparse/tests/data/TESTS b/btparse/tests/data/TESTS
new file mode 100644
index 0000000..391c03f
--- /dev/null
+++ b/btparse/tests/data/TESTS
@@ -0,0 +1,7 @@
+The test data files are as follows:
+
+ regular.bib a simple regular entry (no errors)
+ macro.bib a simple @string (macro definition) entry (no errors)
+ comment.bib a simple @comment entry (no errors)
+ preamble.bib a simple @preamble entry (no errors)
+ simple.bib all of the above concatenated
diff --git a/btparse/tests/data/commas.bib b/btparse/tests/data/commas.bib
new file mode 100644
index 0000000..38feb89
--- /dev/null
+++ b/btparse/tests/data/commas.bib
@@ -0,0 +1,21 @@
+@misc{usingPAPILinux,
+ Author = {Jack Dongarra, Kevin London, Shirley Moore, Philip
+Mucci and Dan
+iel Terpstra},
+ Keywords = {PAPI},
+ Local-Url =
+{/Users/mike/Documents/hpcl/Papers/linux-rev2001.pdf},
+ Month = {July},
+ Title = {Using PAPI for hardware performance monitoring on
+Linux systems},
+ Url =
+{http://icl.cs.utk.edu/projects/papi/documents/pub-papers/2001/linux
+-rev2001.pdf},
+ Year = {2001}}
+
+@Article{wall,
+ author = {Wall,, Larry},
+ title = {Bugs in Perl},
+ journal = {J. Gnats \&\ Gnus},
+ year = 2003
+}
diff --git a/btparse/tests/data/comment.bib b/btparse/tests/data/comment.bib
new file mode 100644
index 0000000..2052513
--- /dev/null
+++ b/btparse/tests/data/comment.bib
@@ -0,0 +1,6 @@
+
+% this is a lexical comment, which is never seen by the grammar and
+% thus can't be tested (well, not yet at least)
+
+@comment(this is a comment entry, anything at all can go in it (as long
+ as parentheses are balanced), even {braces})
diff --git a/btparse/tests/data/empty.bib b/btparse/tests/data/empty.bib
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/btparse/tests/data/empty.bib
diff --git a/btparse/tests/data/foreign.bib b/btparse/tests/data/foreign.bib
new file mode 100644
index 0000000..1cb7671
--- /dev/null
+++ b/btparse/tests/data/foreign.bib
@@ -0,0 +1,12 @@
+% an entry with non-English characters
+
+@article{a001,
+title = "My Article",
+author = "{\AA}se, Stra{\ss}e, and {\o}thers"
+}
+
+% and one with a misplaced bit of TeX markup that might fool
+% the foreign letter detection code
+@article{a002,
+title = "{\tt}foo"
+}
diff --git a/btparse/tests/data/macro.bib b/btparse/tests/data/macro.bib
new file mode 100644
index 0000000..fce1f85
--- /dev/null
+++ b/btparse/tests/data/macro.bib
@@ -0,0 +1,2 @@
+@string(macro = "macro text ",
+ foo = {blah blah } # " ding dong ")
diff --git a/btparse/tests/data/names b/btparse/tests/data/names
new file mode 100644
index 0000000..41afed5
--- /dev/null
+++ b/btparse/tests/data/names
@@ -0,0 +1,17 @@
+John Smith
+Smith, John
+Smith, Jr., John
+J. Smith and Blow, Joe,,
+Harry S Truman
+Truman, Harry S
+Elizabeth {\"U}
+{\"U}, Elizabeth
+Jos{\'e} von Blow
+von Blow, Jos{\'e}
+{\'E}duardo de la Garbanzo
+Charles Louis Xavier Joseph de la Vall{\'e}e Poussin
+U Than
+van der foo, Joe
+van der foo, Jr, Joe
+van der foo, jr, Joe
+Wall,, Larry
diff --git a/btparse/tests/data/preamble.bib b/btparse/tests/data/preamble.bib
new file mode 100644
index 0000000..57d922e
--- /dev/null
+++ b/btparse/tests/data/preamble.bib
@@ -0,0 +1,2 @@
+@preamble{" This is a preamble" #
+ {---the concatenation of several strings}}
diff --git a/btparse/tests/data/purify.strings b/btparse/tests/data/purify.strings
new file mode 100644
index 0000000..0507ea0
--- /dev/null
+++ b/btparse/tests/data/purify.strings
@@ -0,0 +1,50 @@
+Bl{\"o}w, Jo{\'{e}} Q. and J.~R. R. Tolk{\u e}in and {Fo{\'o} Bar ~ {\aa}nd {\SS}on{\v{s}}, Ltd.}
+
+G{\"o}del
+G{\" o}del
+G{\" o }del
+G{\"o }del
+G{\"{o}}del
+G{\" {o}}del
+G{\" { o}}del
+G{\" {o }}del
+G{\" { o }}del
+G{\" { o } }del
+G{\"{o} }del
+G{\" {o} }del
+G{\"o foo}del
+G{\"foo}del
+G{\"{foo}}del
+{G\"odel}
+G{\"o}del
+G{\"{o}}del
+{\ss}uper-duper
+{\ss }uper-duper
+{ \ss}uper-duper
+{\ss{}}uper-duper
+{\ss foo}uper-duper
+{\ss { }}uper-duper
+{\ss {foo}}uper-duper
+{\ss{foo}}uper-duper
+Tom{\`a}{\v s}
+Tom{\`a}{\v{s}}
+Tom{\`a}{{\v s}}
+{Tom{\`a}{\v s}}
+{Tom{\`a}{\v{s}}}
+{Tom{\`a}{\v{ s}}}
+{Tom{\`a}{\v{ s }}}
+{\v s}
+{\x s}
+{\r s}
+{\foo s}
+{\oe}
+{\ae}
+{\aa}
+{\o}
+{\l}
+{\ss}
+{\ae s}
+\TeX
+{\TeX}
+{{\TeX}}
+{\foobar}
diff --git a/btparse/tests/data/regular.bib b/btparse/tests/data/regular.bib
new file mode 100644
index 0000000..9f6c706
--- /dev/null
+++ b/btparse/tests/data/regular.bib
@@ -0,0 +1,8 @@
+% a sample "regular" entry (ie. not a @comment, @preamble, or @string)
+
+@book{abook,
+title = {A } # "Book", % an in-entry comment
+editor = { John Q. Random} # junk,
+publisher = {Foo Bar \& Sons},
+year = 1922
+}
diff --git a/btparse/tests/data/simple.bib b/btparse/tests/data/simple.bib
new file mode 100644
index 0000000..3790ae1
--- /dev/null
+++ b/btparse/tests/data/simple.bib
@@ -0,0 +1,18 @@
+% a sample "regular" entry (ie. not a @comment, @preamble, or @string)
+
+@book{abook,
+title = {A } # "Book", % an in-entry comment
+editor = { John Q. Random} # junk,
+publisher = {Foo Bar \& Sons},
+year = 1922
+}
+@string(macro = "macro text ",
+ foo = {blah blah } # " ding dong ")
+
+% this is a lexical comment, which is never seen by the grammar and
+% thus can't be tested (well, not yet at least)
+
+@comment(this is a comment entry, anything at all can go in it (as long
+ as parentheses are balanced), even {braces})
+@preamble{" This is a preamble" #
+ {---the concatenation of several strings}}
diff --git a/btparse/tests/macro_test.c b/btparse/tests/macro_test.c
new file mode 100644
index 0000000..5126c9c
--- /dev/null
+++ b/btparse/tests/macro_test.c
@@ -0,0 +1,90 @@
+/*
+ * macro_test.c
+ *
+ * Test driver for the btparse macro table. Reads simple one-line commands
+ * from stdin; each one consists of a one-letter action code and possibly
+ * some arguments. The allowed actions are:
+ * a <macro> <text> - add macro
+ * p <macro> - print expansion of macro
+ * d <macro> - delete macro
+ * l - delete all macros
+ *
+ * There must be exactly one space between the action and <macro>, and
+ * between <macro> and <text> (where appropriate).
+ *
+ * GPW 1998/03/01
+ *
+ * $Id$
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include "btparse.h"
+
+
+/*
+ * main()
+ *
+ * Command loop of the macro-table test driver.  Each line read from stdin
+ * is one command: an action letter, then (for every action except 'l')
+ * one separator character, a macro name, and optionally one whitespace
+ * character followed by the macro text.
+ *
+ * Blank lines are skipped.  An 'a', 'p' or 'd' command without a macro
+ * name, or an unknown action letter, is reported on stderr and the loop
+ * carries on with the next line.  Always returns 0.
+ */
+int
+main (void)
+{
+   char   line[1024];
+   int    line_num;
+   char   action;
+   char * macro;
+   char * text;
+   char * end;
+
+   bt_initialize();
+
+   /*
+    * Read lines from stdin.  Each one starts with a single-letter command,
+    * which may be one of 'a' (add macro), 'p' (print expansion),
+    * 'd' (delete macro) or 'l' (delete all macros).
+    */
+
+   line_num = 0;
+   while (fgets (line, sizeof line, stdin) != NULL)
+   {
+      line_num++;
+
+      /* Wipe the newline -- but only a real one, so that a final line
+       * without '\n' (or an over-long line) does not lose a character. */
+      line[strcspn (line, "\n")] = (char) 0;
+
+      action = line[0];
+      if (action == (char) 0)           /* blank line: nothing to do */
+         continue;
+
+      /* Split "<action> <macro> <text>" in place.  The scan stops at the
+       * terminating NUL, so short lines can no longer run off the end of
+       * the string; when there is no <text>, `text` is the empty string. */
+      macro = NULL;
+      text = NULL;
+      if (action != 'l' && line[1] != (char) 0 && line[2] != (char) 0)
+      {
+         macro = line+2;
+         end = macro;
+         while (*end != (char) 0 && ! isspace ((unsigned char) *end))
+            end++;
+         if (*end != (char) 0)
+            *end++ = (char) 0;
+         text = end;
+      }
+
+      if (macro == NULL && (action == 'a' || action == 'p' || action == 'd'))
+      {
+         fprintf (stderr, "command '%c' requires a <macro> argument\n",
+                  action);
+         continue;
+      }
+
+      switch (action)
+      {
+         case 'a':
+            bt_add_macro_text (macro, text, "stdin", line_num);
+            break;
+         case 'p':
+            text = bt_macro_text (macro, "stdin", line_num);
+            if (text)
+               printf ("%s\n", text);
+            break;
+         case 'd':
+            bt_delete_macro (macro);
+            break;
+         case 'l':
+            bt_delete_all_macros ();
+            break;
+         default:
+            fprintf (stderr, "unknown command '%c'\n", action);
+      }
+
+      /* zzs_stat(); */
+
+   } /* while lines remain */
+
+   bt_cleanup();
+   return 0;
+}
diff --git a/btparse/tests/name_test.c b/btparse/tests/name_test.c
new file mode 100644
index 0000000..3577707
--- /dev/null
+++ b/btparse/tests/name_test.c
@@ -0,0 +1,155 @@
+/*
+ * name_test.c
+ *
+ * GPW 1997/11/03
+ *
+ * $Id$
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "btparse.h"
+
+
+/*
+ * print_tokens()
+ *
+ * Writes one line of the form "<partname> tokens = (t1|t2|...)" to stdout
+ * for a single name part.  Prints nothing at all when `tokens` is NULL.
+ *
+ *   partname    label for the part ("first", "von", ...)
+ *   tokens      array of token strings, or NULL if the part is absent
+ *   num_tokens  number of entries in `tokens`
+ */
+static void
+print_tokens (char *partname, char **tokens, int num_tokens)
+{
+   int  idx;
+   char closer;
+
+   if (tokens == NULL)
+      return;
+
+   printf ("%s tokens = (", partname);
+
+   idx = 0;
+   while (idx < num_tokens)
+   {
+      /* every token is followed by '|', except the last which gets ')' */
+      if (idx == num_tokens-1)
+         closer = ')';
+      else
+         closer = '|';
+
+      printf ("%s%c", tokens[idx], closer);
+      idx++;
+   }
+
+   putchar ('\n');
+}
+
+
+/*
+ * dump_name()
+ *
+ * Prints the total token count of a split name, then one print_tokens()
+ * line for each of its four parts in the order first, von, last, jr.
+ */
+static void
+dump_name (bt_name * name)
+{
+   /* labels[] and parts[] are parallel: label i describes part parts[i] */
+   static char * const labels[] = { "first", "von", "last", "jr" };
+   static const int    parts[]  = { BTN_FIRST, BTN_VON, BTN_LAST, BTN_JR };
+   int i;
+
+   printf ("total number of tokens = %d\n", name->tokens->num_items);
+
+   for (i = 0; i < 4; i++)
+   {
+      print_tokens (labels[i],
+                    name->parts[parts[i]],
+                    name->part_len[parts[i]]);
+   }
+}
+
+
+/*
+ * show_formatted_name()
+ *
+ * Formats `name` according to `format` and prints "<msg> = (<result>)".
+ * The string returned by bt_format_name() is released with free() once
+ * it has been printed.
+ */
+static void
+show_formatted_name (char * msg, bt_name_format * format, bt_name * name)
+{
+   char * formatted = bt_format_name (name, format);
+
+   printf ("%s = (%s)\n", msg, formatted);
+   free (formatted);
+}
+
+
+/*
+ * process_name()
+ *
+ * Splits a single name string, dumps its parts with dump_name(), and then
+ * prints it under eight successive format set-ups labelled "fname 1"
+ * through "fname 8".  Format settings are changed step by step on the same
+ * format object, so the order of the calls below matters.
+ *
+ *   name_string  the name to split
+ *   line_num     input line number, forwarded to bt_split_name()
+ *   name_num     position of this name within its line (main() passes
+ *                a 1-based index)
+ */
+static void
+process_name (char * name_string, int line_num, int name_num)
+{
+   bt_name *        name;
+   bt_name_format * fmt;
+
+   printf ("original name = %s\n", name_string);
+   name = bt_split_name (name_string, "stdin", line_num, name_num);
+   if (name == NULL || name->tokens == NULL)
+   {
+      /* NOTE(review): a non-NULL name without tokens is not handed to
+       * bt_free_name() on this path -- confirm whether that leaks. */
+      fprintf (stderr, "empty name\n");
+      return;
+   }
+
+   dump_name (name);
+
+   /* fname 1: "vljf" part order, first name not abbreviated. */
+   fmt = bt_create_name_format ("vljf", FALSE);
+   show_formatted_name ("fname 1", fmt, name);
+
+   /* fname 2: naive abbreviation of the first name -- no post-token text
+    * yet. */
+   bt_set_format_options (fmt, BTN_FIRST, TRUE, BTJ_MAYTIE, BTJ_SPACE);
+   show_formatted_name ("fname 2", fmt, name);
+
+   /* fname 3: supply the missing period after each first-name token. */
+   bt_set_format_text (fmt, BTN_FIRST, NULL, NULL, NULL, ".");
+   show_formatted_name ("fname 3", fmt, name);
+
+   /* fname 4: periods removed again, and nothing between first-name
+    * tokens. */
+   bt_set_format_text (fmt, BTN_FIRST, NULL, NULL, NULL, "");
+   bt_set_format_options (fmt, BTN_FIRST, TRUE, BTJ_NOTHING, BTJ_SPACE);
+   show_formatted_name ("fname 4", fmt, name);
+
+   /* fname 5: throw that format away and start over with "fvlj",
+    * abbreviated. */
+   bt_free_name_format (fmt);
+   fmt = bt_create_name_format ("fvlj", TRUE);
+   show_formatted_name ("fname 5", fmt, name);
+
+   /* fname 6: back to the "no periods, no spaces" abbreviation. */
+   bt_set_format_text (fmt, BTN_FIRST, NULL, NULL, NULL, "");
+   bt_set_format_options (fmt, BTN_FIRST, TRUE, BTJ_NOTHING, BTJ_SPACE);
+   show_formatted_name ("fname 6", fmt, name);
+
+   /* fname 7: something more "custom" -- kindergarten-style names: full
+    * first name, abbreviated last name, 'von' and 'jr' left out.
+    */
+   bt_free_name_format (fmt);
+   fmt = bt_create_name_format ("fl", FALSE);
+   bt_set_format_text (fmt, BTN_LAST, NULL, NULL, NULL, ".");
+   bt_set_format_options (fmt, BTN_LAST, TRUE, BTJ_MAYTIE, BTJ_SPACE);
+   show_formatted_name ("fname 7", fmt, name);
+
+   /* fname 8: only 'von' and 'last', abbreviated, without periods or
+    * spaces. */
+   bt_free_name_format (fmt);
+   fmt = bt_create_name_format ("vl", FALSE);
+   bt_set_format_options (fmt, BTN_VON, TRUE, BTJ_NOTHING, BTJ_NOTHING);
+   bt_set_format_options (fmt, BTN_LAST, TRUE, BTJ_NOTHING, BTJ_NOTHING);
+   show_formatted_name ("fname 8", fmt, name);
+
+   bt_free_name_format (fmt);
+   bt_free_name (name);
+
+} /* process_name () */
+
+
+/*
+ * main()
+ *
+ * Reads lines from stdin, splits each one into individual names on the
+ * delimiter "and" with bt_split_list(), and runs process_name() on every
+ * non-NULL item.  Always returns 0.
+ */
+int
+main (void)
+{
+   char            line[1024];
+   int             line_num = 0;       /* was read uninitialised before */
+   size_t          len;
+   bt_stringlist * names;
+   int             i;
+
+   /* fgets() returns NULL at end of file or on error, so no separate
+    * feof() test is needed. */
+   while (fgets (line, sizeof line, stdin) != NULL)
+   {
+      /* Drop the trailing newline, if any; the length check avoids
+       * indexing line[-1] when the line starts with a NUL byte. */
+      len = strlen (line);
+      if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';
+      line_num++;
+
+      names = bt_split_list (line, "and", "stdin", line_num, "name");
+      if (names == NULL)
+         printf ("empty or invalid string\n");
+      else
+      {
+         if (names->num_items > 1)
+            printf ("%d names in string\n", names->num_items);
+
+         for (i = 0; i < names->num_items; i++)
+         {
+            if (names->items[i])
+               process_name (names->items[i], line_num, i+1);
+         }
+         bt_free_list (names);
+      }
+   }
+   return 0;
+}
diff --git a/btparse/tests/namebug.c b/btparse/tests/namebug.c
new file mode 100644
index 0000000..15263c9
--- /dev/null
+++ b/btparse/tests/namebug.c
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <string.h>
+#include "btparse.h"
+
+void dump_name(bt_name*);
+
+/*
+ * Splits a fixed set of name strings one after another, dumping each
+ * result immediately ("split as we go"), then dumps all of the results a
+ * second time ("pre-split") after every split has been done.
+ *
+ * Neither the strdup()'d copies nor the bt_name results are released
+ * before the program returns.
+ */
+int main (void)
+{
+   char * snames[] = { "Joe Blow", "John Smith", "Fred Rogers", "" };
+   enum { NUM_NAMES = sizeof snames / sizeof snames[0] };
+   bt_name * names[NUM_NAMES];
+   int n;
+
+   printf ("split as we go:\n");
+   n = 0;
+   while (n < NUM_NAMES)
+   {
+      names[n] = bt_split_name (strdup (snames[n]), NULL, 0, 0);
+      dump_name (names[n]);
+      n++;
+   }
+
+   printf ("pre-split:\n");
+   for (n = 0; n < NUM_NAMES; n++)
+      dump_name (names[n]);
+
+   return 0;
+}
+
diff --git a/btparse/tests/postprocess_test.c b/btparse/tests/postprocess_test.c
new file mode 100644
index 0000000..06debde
--- /dev/null
+++ b/btparse/tests/postprocess_test.c
@@ -0,0 +1,40 @@
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "my_dmalloc.h"
+#include "btparse.h"
+
+void postprocess (char *);
+
+/*
+ * postprocess()
+ *
+ * Copies `string` into a scratch buffer, runs bt_postprocess_string() on
+ * the copy with no options and then again with BTO_COLLAPSE, and prints
+ * the buffer after each call next to the untouched original.
+ *
+ * The buffer is not reset between the two calls, so the BTO_COLLAPSE pass
+ * works on the output of the first pass.
+ * NOTE(review): confirm that this is intended, since the second message
+ * reads as if it started again from `string`.
+ *
+ * If the scratch buffer cannot be allocated, an error is written to
+ * stderr and nothing else is printed.
+ */
+void postprocess (char * string)
+{
+   char * buf;
+
+   buf = (char *) malloc (strlen(string) + 1);
+   if (buf == NULL)                     /* don't strcpy() into NULL */
+   {
+      fprintf (stderr, "postprocess: out of memory\n");
+      return;
+   }
+
+   strcpy (buf, string);
+   bt_postprocess_string (buf, 0);
+   printf ("[%s] -> [%s] (no collapse)\n", string, buf);
+   bt_postprocess_string (buf, BTO_COLLAPSE);
+   printf ("[%s] -> [%s] (collapse)\n", string, buf);
+   free (buf);
+}
+
+/*
+ * Runs postprocess() over a fixed list of sample strings, in order, and
+ * returns 0.
+ */
+int main (void)
+{
+   static char * const inputs[] =
+   {
+      "vanilla string",
+      "nospace",
+      "inner space",
+      " leading",
+      " leading",
+      "trailing ",
+      "trailing ",
+      "",
+      " leading&trailing ",
+      " leading & trailing ",
+      " leading and internal",
+      "internal and trailing ",
+      " everything at once "
+   };
+   size_t i;
+
+   for (i = 0; i < sizeof inputs / sizeof inputs[0]; i++)
+      postprocess (inputs[i]);
+
+   return 0;
+}
+
diff --git a/btparse/tests/purify_test.c b/btparse/tests/purify_test.c
new file mode 100644
index 0000000..9b3ac42
--- /dev/null
+++ b/btparse/tests/purify_test.c
@@ -0,0 +1,37 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "btparse.h"
+
+/*
+ * main()
+ *
+ * Reads lines from stdin; for each one prints the original text, runs
+ * bt_purify_string() on it in place, and prints the purified text (with
+ * trailing spaces removed) together with the purified length.  The length
+ * reported is the one measured before the trailing spaces are stripped.
+ * Always returns 0.
+ */
+int
+main (void)
+{
+   char line[1024];
+   int  len, i;
+
+   /* fgets() returns NULL at end of file or on error, so no separate
+    * feof() test is needed. */
+   while (fgets (line, sizeof line, stdin) != NULL)
+   {
+      /* Drop the trailing newline, if any; the length check avoids
+       * indexing line[-1] when the line starts with a NUL byte. */
+      len = (int) strlen (line);
+      if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';
+      printf ("original string = %s\n", line);
+      bt_purify_string (line, 0);
+      len = (int) strlen (line);
+
+      /* strip trailing spaces so our output looks like BibTeX's; the
+       * i >= 0 test stops at the start of the buffer when the purified
+       * string is empty or consists only of spaces */
+      for (i = len-1; i >= 0 && line[i] == ' '; i--)
+         line[i] = (char) 0;
+
+      if (len > 0)
+         printf ("purified string = %s\n", line);
+      else                              /* more imitating BibTeX's output */
+         printf ("purified string =\n");
+      printf ("purified length = %d\n", len);
+   }
+   return 0;
+}
+
diff --git a/btparse/tests/read_test.c b/btparse/tests/read_test.c
new file mode 100644
index 0000000..7cec276
--- /dev/null
+++ b/btparse/tests/read_test.c
@@ -0,0 +1,84 @@
+/*
+ * read_test.c
+ *
+ * finding bugs related to reading from an empty file, reading twice
+ * at eof, parsing an empty string, etc.
+ */
+
+#include "bt_config.h" /* for dmalloc() stuff */
+#include <stdlib.h>
+
+#include "testlib.h"
+#include "my_dmalloc.h"
+
+
+/*
+ * Exercises bt_parse_entry() on an empty file (first read, read at eof,
+ * and one excess read) and bt_parse_entry_s() on an empty string and on a
+ * junk string, checking the returned entry and the entry_ok flag each
+ * time.  Prints a summary and exits with status 0 if `ok` is still true
+ * at the end, 1 otherwise.
+ *
+ * NOTE(review): `ok` is never assigned in this function after its
+ * initialisation, so presumably the CHECK() macro (not defined in this
+ * file) clears it when a check fails -- confirm in testlib.h.
+ * NOTE(review): `infile` is not closed anywhere in this function.
+ */
+int main (void)
+{
+ char filename[256];
+ FILE * infile;
+ AST * entry;
+ boolean entry_ok,
+ ok = TRUE;;
+
+ bt_initialize ();
+
+ /*
+ * First test -- try to read an entry from an empty file. This
+ * triggers an "unexpected eof" syntax error, and puts the file
+ * at eof -- but doesn't do the eof processing (that's for the next
+ * call).
+ */
+ /* NOTE(review): the result of open_file() is used without a NULL test
+ * here -- confirm how open_file() reports a missing file. */
+ infile = open_file ("empty.bib", DATA_DIR, filename, 255);
+ CHECK (!feof (infile))
+ entry = bt_parse_entry (infile, filename, 0, &entry_ok);
+ CHECK (feof (infile))
+ CHECK (entry == NULL); /* because no entry found */
+ CHECK (!entry_ok); /* and this causes a syntax error */
+
+ /* Now that we're at eof, read again -- this does the normal eof cleanup */
+ entry = bt_parse_entry (infile, filename, 0, &entry_ok);
+ CHECK (entry == NULL); /* because at eof */
+ CHECK (entry_ok); /* ditto */
+
+ /*
+ * And now do an excess read -- this used to crash the library; now it
+ * just triggers a "usage warning".
+ */
+ entry = bt_parse_entry (infile, filename, 0, &entry_ok);
+ CHECK (entry == NULL);
+ CHECK (entry_ok);
+
+ /*
+ * Try to parse an empty string; should trigger a syntax error (eof
+ * when expected an entry), so entry_ok will be false.
+ */
+ entry = bt_parse_entry_s ("", NULL, 1, 0, &entry_ok);
+ CHECK (entry == NULL);
+ CHECK (! entry_ok);
+
+ /*
+ * Try to parse a string with just junk (nothing entry-like) in it --
+ * should cause syntax error just like the empty string.
+ */
+ entry = bt_parse_entry_s ("this is junk", NULL, 1, 0, &entry_ok);
+ CHECK (entry == NULL);
+ CHECK (! entry_ok);
+
+ /* Tell bt_parse_entry_s() to cleanup after itself */
+ entry = bt_parse_entry_s (NULL, NULL, 1, 0, NULL);
+ CHECK (entry == NULL);
+
+ bt_cleanup ();
+
+ /* Report the overall result and turn it into the process exit status. */
+ if (! ok)
+ {
+ printf ("Some tests failed\n");
+ exit (1);
+ }
+ else
+ {
+ printf ("All tests successful\n");
+ exit (0);
+ }
+
+} /* main() */
diff --git a/btparse/tests/simple_test.c b/btparse/tests/simple_test.c
new file mode 100644
index 0000000..ef27612
--- /dev/null
+++ b/btparse/tests/simple_test.c
@@ -0,0 +1,596 @@
+/* ------------------------------------------------------------------------
+@NAME : simple_test.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Run some basic tests on some simple data. The .bib files
+ processed here are all free of errors, and use just the basic
+ BibTeX syntax. This is just to make sure the parser and
+ library are working in the crudest sense; more elaborate
+ tests will someday be performed elsewhere (I hope).
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/07/29, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1996-97 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse distribution (but not part
+ of the library itself). This is free software; you can
+ redistribute it and/or modify it under the terms of the GNU
+ General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your
+ option) any later version.
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <string.h>
+#include "btparse.h"
+#include "testlib.h"
+#include "my_dmalloc.h"
+
+int test_num = 0;
+
+
+/* Ways of exercising the parser.  NOTE(review): this type is not referenced
+ * anywhere in the visible part of this file -- confirm before relying on it. */
+typedef enum { single, multiple, wholefile } test_mode;
+
+/*
+ * Expected contents of one entry: the field (or macro) names in order, the
+ * value texts in order across all fields, and the expected node type of
+ * each value.  'ntypes' is NULL in the comment and preamble data sets.
+ */
+typedef struct
+{
+ int num_fields;
+ char **fields;
+ int num_values;
+ char **values;
+ bt_nodetype * ntypes;
+} test_data;
+
+/* A checker function: compares a parsed entry against expected data and
+ * reports whether every check passed. */
+typedef boolean (*tester) (AST *, test_data *);
+
+/*
+ * One row of the test table: a description printed with the result, the
+ * .bib file to open under DATA_DIR, the string-processing options applied
+ * to every metatype, and the checker plus the data it compares against.
+ */
+typedef struct
+{
+ char * desc;
+ char * filename;
+ btshort options;
+ tester test_func;
+ test_data * data;
+} test;
+
+/* prototypes needed for defining the tests[] array */
+boolean eviltest_regular (AST *entry, test_data *data);
+boolean eviltest_macro (AST *entry, test_data *data);
+boolean eviltest_comment (AST *entry, test_data *data);
+boolean eviltest_preamble (AST *entry, test_data *data);
+boolean goodtest_regular (AST *entry, test_data *data);
+boolean goodtest_macro (AST *entry, test_data *data);
+boolean goodtest_comment (AST *entry, test_data *data);
+boolean goodtest_preamble (AST *entry, test_data *data);
+
+/* and prototypes to keep "gcc -Wall" from whining */
+boolean test_multiple (FILE *, char *, btshort, btshort, int, test *);
+boolean test_wholefile (char *, btshort, btshort, int, test *);
+
+
+/* a priori knowledge about the entry in "regular.bib" (used for both tests) */
+char * regular_fields[] = { "title", "editor", "publisher", "year" };
+char * regular_values[] =
+ { "A ", "Book", " John Q. Random", "junk", "Foo Bar \\& Sons", "1922" };
+char * regular_values_proc[] =
+ { "A Book", "John Q. Random", "Foo Bar \\& Sons", "1922" };
+bt_nodetype regular_ntypes[] =
+ { BTAST_STRING, BTAST_STRING, BTAST_STRING, BTAST_MACRO, BTAST_STRING, BTAST_NUMBER };
+bt_nodetype regular_ntypes_proc[] =
+ { BTAST_STRING, BTAST_STRING, BTAST_STRING, BTAST_STRING };
+
+/* likewise for "macro.bib" */
+char * macro_macros[] = { "macro", "foo" };
+char * macro_values[] =
+ { "macro text ", "blah blah ", " ding dong " };
+char * macro_values_proc[] =
+ { "macro text", "blah blah ding dong" };
+bt_nodetype macro_ntypes[] = { BTAST_STRING, BTAST_STRING, BTAST_STRING };
+
+/* and for "comment.bib" */
+/* (the unprocessed and processed expectations are the same text) */
+char * comment_value = "this is a comment entry, anything at all can go in it (as long as parentheses are balanced), even {braces}";
+char * comment_value_proc = "this is a comment entry, anything at all can go in it (as long as parentheses are balanced), even {braces}";
+
+/* and for "preamble.bib" */
+char * preamble_values[] =
+ { " This is a preamble",
+ "---the concatenation of several strings" };
+char * preamble_value_proc =
+ "This is a preamble---the concatenation of several strings";
+
+/*
+ * Expected-data bundles, in test_data member order:
+ * { num_fields, fields, num_values, values, ntypes }.
+ * In tests[] the "unproc" bundles are paired with BTO_MINIMAL and the "proc"
+ * bundles with BTO_FULL.  macro_proc_data reuses the three-element
+ * macro_ntypes; the checkers only index ntypes below num_values, so just
+ * its first two elements are consulted.  The comment and preamble bundles
+ * carry no field names and no node types.
+ */
+test_data regular_unproc_data =
+ { 4, regular_fields, 6, regular_values, regular_ntypes };
+test_data regular_proc_data =
+ { 4, regular_fields, 4, regular_values_proc, regular_ntypes_proc };
+test_data macro_unproc_data =
+ { 2, macro_macros, 3, macro_values, macro_ntypes };
+test_data macro_proc_data =
+ { 2, macro_macros, 2, macro_values_proc, macro_ntypes };
+test_data comment_unproc_data =
+ { 0, NULL, 1, &comment_value, NULL };
+test_data comment_proc_data =
+ { 0, NULL, 1, &comment_value_proc, NULL };
+test_data preamble_unproc_data =
+ { 0, NULL, 2, preamble_values, NULL };
+test_data preamble_proc_data =
+ { 0, NULL, 1, &preamble_value_proc, NULL };
+
+
+/*
+ * The test table.  Rows come in four groups of four, each group ordered
+ * regular / macro / comment / preamble:
+ *    0- 3  unprocessed (BTO_MINIMAL), low-level scan  (eviltest_*)
+ *    4- 7  unprocessed (BTO_MINIMAL), high-level scan (goodtest_*)
+ *    8-11  processed   (BTO_FULL),    low-level scan  (eviltest_*)
+ *   12-15  processed   (BTO_FULL),    high-level scan (goodtest_*)
+ * main() hands tests+4 and tests+12 to the multi-entry tests, so this
+ * ordering must be kept when rows are added or moved.
+ */
+test tests[] =
+{
+ { "regular entry (unprocessed, low-level scan)",
+ "regular.bib", BTO_MINIMAL,
+ eviltest_regular, &regular_unproc_data
+ },
+ { "macro entry (unprocessed, low-level scan)",
+ "macro.bib", BTO_MINIMAL,
+ eviltest_macro, &macro_unproc_data
+ },
+ { "comment entry (unprocessed, low-level scan)",
+ "comment.bib", BTO_MINIMAL,
+ eviltest_comment, &comment_unproc_data
+ },
+ { "preamble entry (unprocessed, low-level scan)",
+ "preamble.bib", BTO_MINIMAL,
+ eviltest_preamble, &preamble_unproc_data
+ },
+ { "regular entry (unprocessed, high-level scan)",
+ "regular.bib", BTO_MINIMAL,
+ goodtest_regular, &regular_unproc_data
+ },
+ { "macro entry (unprocessed, high-level scan)",
+ "macro.bib", BTO_MINIMAL,
+ goodtest_macro, &macro_unproc_data
+ },
+ { "comment entry (unprocessed, high-level scan)",
+ "comment.bib", BTO_MINIMAL,
+ goodtest_comment, &comment_unproc_data
+ },
+ { "preamble entry (unprocessed, high-level scan)",
+ "preamble.bib", BTO_MINIMAL,
+ goodtest_preamble, &preamble_unproc_data
+ },
+ { "regular entry (processed, low-level scan)",
+ "regular.bib", BTO_FULL,
+ eviltest_regular, &regular_proc_data
+ },
+ { "macro entry (processed, low-level scan)",
+ "macro.bib", BTO_FULL,
+ eviltest_macro, &macro_proc_data
+ },
+ { "comment entry (processed, low-level scan)",
+ "comment.bib", BTO_FULL,
+ eviltest_comment, &comment_proc_data
+ },
+ { "preamble entry (processed, low-level scan)",
+ "preamble.bib", BTO_FULL,
+ eviltest_preamble, &preamble_proc_data
+ },
+ { "regular entry (processed, high-level scan)",
+ "regular.bib", BTO_FULL,
+ goodtest_regular, &regular_proc_data
+ },
+ { "macro entry (processed, high-level scan)",
+ "macro.bib", BTO_FULL,
+ goodtest_macro, &macro_proc_data
+ },
+ { "comment entry (processed, high-level scan)",
+ "comment.bib", BTO_FULL,
+ goodtest_comment, &comment_proc_data
+ },
+ { "preamble entry (processed, high-level scan)",
+ "preamble.bib", BTO_FULL,
+ goodtest_preamble, &preamble_proc_data
+ },
+};
+
+
+/* Number of rows in tests[].  Parenthesized so the expansion remains a
+ * single operand whatever expression the macro is used in. */
+#define NUM_TESTS (sizeof (tests) / sizeof (tests[0]))
+
+
+/*
+ * Low-level check of a regular entry: walks the AST directly through its
+ * 'down' and 'right' links and compares the entry type, key, field names,
+ * value node types and value texts with 'data'.
+ *
+ * Returns TRUE only if every check passed.  A NULL entry or a missing key
+ * node returns FALSE immediately; running out of expected fields or values
+ * abandons the corresponding loop.
+ */
+boolean eviltest_regular (AST * entry, test_data * data)
+{
+   boolean  ok = TRUE;
+   int      field_num = 0;
+   int      value_num = 0;
+   AST *    key;
+   AST *    field;
+   AST *    value;
+
+   CHECK_ESCAPE (entry != NULL, return FALSE, "entry");
+   CHECK (entry->nodetype == BTAST_ENTRY);
+   CHECK (entry->metatype == BTE_REGULAR);
+   CHECK (strcmp (entry->text, "book") == 0);
+
+   /* The key is the entry's first child; the fields are its siblings. */
+   key = entry->down;
+   CHECK_ESCAPE (key != NULL, return FALSE, "entry");
+   CHECK (key->nodetype == BTAST_KEY);
+   CHECK (key->metatype == BTE_UNKNOWN);
+   CHECK (strcmp (key->text, "abook") == 0);
+
+   for (field = key->right; field != NULL; field = field->right)
+   {
+      CHECK_ESCAPE (field_num < data->num_fields, break, "entry");
+      CHECK (field->nodetype == BTAST_FIELD);
+      CHECK (field->metatype == BTE_UNKNOWN);
+      CHECK (strcmp (field->text, data->fields[field_num++]) == 0);
+
+      /* value_num keeps counting across fields: data->values is flat. */
+      for (value = field->down; value != NULL; value = value->right)
+      {
+         CHECK_ESCAPE (value_num < data->num_values, break, "field");
+         CHECK (value->nodetype == data->ntypes[value_num]);
+         CHECK (strcmp (value->text, data->values[value_num]) == 0);
+         value_num++;
+      }
+   }
+
+   CHECK (field_num == data->num_fields);
+   CHECK (value_num == data->num_values);
+
+   return ok;
+
+} /* eviltest_regular () */
+
+
+/*
+ * Low-level check of a macro-definition ("@string") entry: walks the AST
+ * directly and compares each macro name, plus the node type and text of
+ * each of its values, with 'data' (macro names are stored in data->fields).
+ *
+ * Returns TRUE only if every check passed.  A NULL entry returns FALSE
+ * immediately instead of being dereferenced, matching the other low-level
+ * checkers in this file.
+ */
+boolean eviltest_macro (AST * entry, test_data * data)
+{
+   boolean  ok = TRUE;
+   AST *    macro;
+   AST *    value;
+   int      macro_num;
+   int      value_num;
+
+   /* Bail out here: a plain CHECK would fall through to entry->nodetype. */
+   CHECK_ESCAPE (entry != NULL, return FALSE, "entry")
+   CHECK (entry->nodetype == BTAST_ENTRY)
+   CHECK (entry->metatype == BTE_MACRODEF)
+   CHECK (strcmp (entry->text, "string") == 0)
+
+   macro_num = 0;
+   value_num = 0;
+   macro = entry->down;
+
+   while (macro)
+   {
+      CHECK_ESCAPE (macro_num < data->num_fields, break, "entry")
+      CHECK (macro->nodetype == BTAST_FIELD)
+      CHECK (macro->metatype == BTE_UNKNOWN)
+      CHECK (strcmp (macro->text, data->fields[macro_num++]) == 0)
+
+      /* value_num keeps counting across macros: data->values is flat. */
+      value = macro->down;
+      while (value)
+      {
+         CHECK_ESCAPE (value_num < data->num_values, break, "macro")
+         CHECK (value->nodetype == data->ntypes[value_num])
+         CHECK (strcmp (value->text, data->values[value_num]) == 0)
+         value = value->right;
+         value_num++;
+      }
+      macro = macro->right;
+   }
+   CHECK (macro_num == data->num_fields)
+   CHECK (value_num == data->num_values)
+
+   return ok;
+
+} /* eviltest_macro () */
+
+
+/*
+ * Low-level check of a comment entry: the entry text must be "comment" and
+ * it must have exactly one child node, with no siblings or children of its
+ * own, whose text equals data->values[0].
+ *
+ * Returns TRUE only if every check passed; a NULL entry or a missing value
+ * node returns FALSE immediately.
+ */
+boolean eviltest_comment (AST * entry, test_data * data)
+{
+ boolean ok = TRUE;
+ AST * value;
+
+ CHECK_ESCAPE (entry != NULL, return FALSE, "entry");
+ CHECK (strcmp (entry->text, "comment") == 0);
+
+ /* the comment text hangs directly below the entry node */
+ value = entry->down;
+ CHECK_ESCAPE (value != NULL, return FALSE, "entry");
+ CHECK (strcmp (value->text, data->values[0]) == 0);
+ CHECK (value->right == NULL);
+ CHECK (value->down == NULL);
+
+ return ok;
+} /* eviltest_comment () */
+
+
+/*
+ * Low-level check of a preamble entry: the entry text must be "preamble"
+ * and its children, followed through their 'right' links, must all be
+ * string nodes whose texts match data->values in order.
+ *
+ * Returns TRUE only if every check passed; a NULL entry returns FALSE
+ * immediately.
+ */
+boolean eviltest_preamble (AST * entry, test_data * data)
+{
+   boolean  ok = TRUE;
+   int      value_num = 0;
+   AST *    value;
+
+   CHECK_ESCAPE (entry != NULL, return FALSE, "entry");
+   CHECK (strcmp (entry->text, "preamble") == 0);
+
+   for (value = entry->down; value != NULL; value = value->right)
+   {
+      CHECK_ESCAPE (value_num < data->num_values, break, "entry");
+      CHECK (value->nodetype == BTAST_STRING);
+      CHECK (strcmp (value->text, data->values[value_num]) == 0);
+
+      value_num++;
+   }
+
+   CHECK (value_num == data->num_values);
+   return ok;
+
+} /* eviltest_preamble () */
+
+
+/*
+ * High-level check of a regular entry: uses the library's traversal
+ * functions (bt_entry_*, bt_next_field, bt_next_value) rather than touching
+ * AST members, and compares entry type, key, field names, value node types
+ * and value texts with 'data'.
+ *
+ * Returns TRUE only if every check passed.
+ */
+boolean goodtest_regular (AST * entry, test_data * data)
+{
+   boolean      ok = TRUE;
+   int          field_num = 0;
+   int          value_num = 0;
+   AST *        field;
+   AST *        value;
+   char *       field_name;
+   char *       value_text;
+   bt_nodetype  value_nodetype;
+
+   CHECK (bt_entry_metatype (entry) == BTE_REGULAR);
+   CHECK (strcmp (bt_entry_type (entry), "book") == 0);
+   CHECK (strcmp (bt_entry_key (entry), "abook") == 0);
+
+   /* Passing NULL as the previous node starts each iteration. */
+   for (field = bt_next_field (entry, NULL, &field_name);
+        field != NULL;
+        field = bt_next_field (entry, field, &field_name))
+   {
+      CHECK_ESCAPE (field_num < data->num_fields, break, "entry");
+      CHECK (strcmp (field_name, data->fields[field_num++]) == 0);
+
+      for (value = bt_next_value (field, NULL, &value_nodetype, &value_text);
+           value != NULL;
+           value = bt_next_value (field, value, &value_nodetype, &value_text))
+      {
+         CHECK_ESCAPE (value_num < data->num_values, break, "field");
+         CHECK (value_nodetype == data->ntypes[value_num]);
+         CHECK (strcmp (value_text, data->values[value_num++]) == 0);
+      }
+   }
+
+   CHECK (field_num == data->num_fields);
+   CHECK (value_num == data->num_values);
+
+   return ok;
+
+}
+
+
+/*
+ * High-level check of a macro-definition entry: uses bt_entry_*,
+ * bt_next_macro and bt_next_value to compare each macro name and each of
+ * its values (node type and text) with 'data'.  Such an entry is expected
+ * to have no key.
+ *
+ * Returns TRUE only if every check passed.
+ */
+boolean goodtest_macro (AST * entry, test_data * data)
+{
+   boolean      ok = TRUE;
+   int          macro_num = 0;
+   int          value_num = 0;
+   AST *        macro;
+   AST *        value;
+   char *       macro_name;
+   char *       value_text;
+   bt_nodetype  value_nodetype;
+
+   CHECK (bt_entry_metatype (entry) == BTE_MACRODEF);
+   CHECK (strcmp (bt_entry_type (entry), "string") == 0);
+   CHECK (bt_entry_key (entry) == NULL);
+
+   /* Passing NULL as the previous node starts each iteration. */
+   for (macro = bt_next_macro (entry, NULL, &macro_name);
+        macro != NULL;
+        macro = bt_next_macro (entry, macro, &macro_name))
+   {
+      CHECK_ESCAPE (macro_num < data->num_fields, break, "entry");
+      CHECK (strcmp (macro_name, data->fields[macro_num++]) == 0);
+
+      for (value = bt_next_value (macro, NULL, &value_nodetype, &value_text);
+           value != NULL;
+           value = bt_next_value (macro, value, &value_nodetype, &value_text))
+      {
+         CHECK_ESCAPE (value_num < data->num_values, break, "macro");
+         CHECK (value_nodetype == data->ntypes[value_num]);
+         CHECK (strcmp (value_text, data->values[value_num++]) == 0);
+      }
+   }
+
+   CHECK (macro_num == data->num_fields);
+   CHECK (value_num == data->num_values);
+
+   return ok;
+
+}
+
+
+/*
+ * High-level check of a comment entry: the metatype and type must identify
+ * a comment, and the first value returned by bt_next_value() must carry the
+ * text data->values[0].
+ *
+ * Returns TRUE only if every check passed.  If the entry yields no value at
+ * all the function returns FALSE immediately rather than comparing an
+ * unset text pointer.
+ */
+boolean goodtest_comment (AST * entry, test_data * data)
+{
+   boolean  ok = TRUE;
+   AST *    value;
+   char *   text = NULL;
+
+   CHECK (bt_entry_metatype (entry) == BTE_COMMENT);
+   CHECK (strcmp (bt_entry_type (entry), "comment") == 0);
+
+   value = bt_next_value (entry, NULL, NULL, &text);
+   CHECK_ESCAPE (value != NULL, return FALSE, "entry");
+   CHECK (strcmp (text, data->values[0]) == 0);
+
+   return ok;
+}
+
+
+/*
+ * High-level check of a preamble entry: iterates over the entry's values
+ * with bt_next_value() and requires each to be a string node whose text
+ * matches data->values in order.
+ *
+ * Returns TRUE only if every check passed.
+ */
+boolean goodtest_preamble (AST * entry, test_data * data)
+{
+   boolean      ok = TRUE;
+   int          value_num = 0;
+   AST *        value;
+   char *       value_text;
+   bt_nodetype  value_nodetype;
+
+   CHECK (bt_entry_metatype (entry) == BTE_PREAMBLE);
+   CHECK (strcmp (bt_entry_type (entry), "preamble") == 0);
+
+   /* Passing NULL as the previous node starts the iteration. */
+   for (value = bt_next_value (entry, NULL, &value_nodetype, &value_text);
+        value != NULL;
+        value = bt_next_value (entry, value, &value_nodetype, &value_text))
+   {
+      CHECK_ESCAPE (value_num < data->num_values, break, "entry");
+      CHECK (value_nodetype == BTAST_STRING);
+      CHECK (strcmp (value_text, data->values[value_num++]) == 0);
+   }
+
+   CHECK (value_num == data->num_values);
+   return ok;
+}
+
+
+/*
+ * Read entries one at a time from an already-open file and run the
+ * matching checker from 'tests' on each.
+ *
+ *   file, filename  open stream and the name passed to bt_parse_entry()
+ *   string_opts     string-processing options applied to every metatype
+ *   other_opts      options passed to bt_parse_entry()
+ *   num_entries     number of entries (and rows of 'tests') expected
+ *   tests           first table row to use; row N checks entry N
+ *
+ * Prints one result line per entry and a summary line.  Returns TRUE only
+ * if every entry parsed and checked cleanly and exactly num_entries
+ * entries were found.
+ */
+boolean test_multiple (FILE * file,
+                       char * filename,
+                       btshort string_opts,
+                       btshort other_opts,
+                       int num_entries,
+                       test * tests)
+{
+   boolean  entry_ok;
+   boolean  ok;
+   int      entry_num;
+   AST *    entry;
+
+   ok = TRUE;
+   entry_num = 0;
+
+   printf ("multiple entries in one file, read individually:\n");
+   set_all_stringopts (string_opts);
+
+   while (1)
+   {
+      entry = bt_parse_entry (file, filename, other_opts, &entry_ok);
+      if (!entry) break;                /* at eof? */
+
+      /* A surplus entry is about to make us bail out of the loop:
+       * release it first so that it is not leaked. */
+      if (entry_num >= num_entries)
+         bt_free_ast (entry);
+      CHECK_ESCAPE (entry_num < num_entries, break, "file");
+
+      entry_ok &= tests[entry_num].test_func (entry, tests[entry_num].data);
+      printf (" %s: %s\n",
+              tests[entry_num].desc,
+              entry_ok ? "ok" : "not ok");
+      entry_num++;
+      bt_free_ast (entry);
+
+      ok &= entry_ok;
+   }
+
+   CHECK (entry_num == num_entries);
+   printf ("...%s\n", ok ? "all ok" : "not all ok");
+   return ok;
+
+}
+
+
+/*
+ * Parse a whole file in one call and run the matching checker from 'tests'
+ * on each entry of the resulting list.
+ *
+ *   filename     path handed to bt_parse_file()
+ *   string_opts  string-processing options applied to every metatype
+ *   other_opts   options passed to bt_parse_file()
+ *   num_entries  number of entries (and rows of 'tests') expected
+ *   tests        first table row to use; row N checks entry N
+ *
+ * Prints one result line per entry and a summary line.  Returns TRUE only
+ * if the parse and every per-entry check succeeded and exactly num_entries
+ * entries were found.
+ */
+boolean test_wholefile (char * filename,
+                        btshort string_opts,
+                        btshort other_opts,
+                        int num_entries,
+                        test * tests)
+{
+   boolean  ok = TRUE;
+   boolean  entry_ok;
+   int      entry_num = 0;
+   AST *    entries;
+   AST *    entry;
+
+   printf ("multiple entries in one file, read together:\n");
+   set_all_stringopts (string_opts);
+   entries = bt_parse_file (filename, other_opts, &entry_ok);
+   CHECK (entry_ok);
+
+   /* Passing NULL as the previous entry starts the iteration. */
+   for (entry = bt_next_entry (entries, NULL);
+        entry != NULL;
+        entry = bt_next_entry (entries, entry))
+   {
+      CHECK_ESCAPE (entry_num < num_entries, break, "file");
+
+      entry_ok = tests[entry_num].test_func (entry, tests[entry_num].data);
+      printf (" %s: %s\n",
+              tests[entry_num].desc,
+              entry_ok ? "ok" : "not ok");
+      entry_num++;
+
+      ok &= entry_ok;
+   }
+
+   CHECK (entry_num == num_entries);
+   bt_free_ast (entries);               /* the list is released as a whole */
+   printf ("...%s\n", ok ? "all ok" : "not all ok");
+   return ok;
+
+}
+
+
+/*
+ * Test driver.  First runs every row of tests[] against its own
+ * single-entry file, then checks "simple.bib" -- which holds all four
+ * entries -- both entry-by-entry and as a whole file, once unprocessed and
+ * once fully processed.  Exit status is 0 when nothing failed, 1 otherwise.
+ */
+int main (void)
+{
+   char      filename[256];
+   btshort   options = 0;               /* use default non-string options */
+   int       failures = 0;
+   unsigned  i;
+   FILE *    infile;
+   AST *     entry;
+   boolean   ok;
+   test *    cur;
+
+   bt_initialize ();
+
+   for (i = 0; i < NUM_TESTS; i++)
+   {
+      cur = &tests[i];
+      infile = open_file (cur->filename, DATA_DIR, filename, 255);
+
+      /* Override string-processing options for all entry metatypes */
+      set_all_stringopts (cur->options);
+
+      entry = bt_parse_entry (infile, filename, options, &ok);
+      ok &= cur->test_func (entry, cur->data);
+      bt_free_ast (entry);
+
+      /* A second read must find nothing more and leave the file at eof. */
+      entry = bt_parse_entry (infile, filename, options, NULL);
+      CHECK ((entry == NULL));
+      CHECK (feof (infile));
+      fclose (infile);
+
+      printf ("%s: %s\n", cur->desc, ok ? "ok" : "not ok");
+      if (!ok)
+         failures++;
+   } /* for i */
+
+   /* Same stream read twice: rows 4-7 unprocessed, then rows 12-15. */
+   infile = open_file ("simple.bib", DATA_DIR, filename, 255);
+   failures += test_multiple (infile, filename, BTO_MINIMAL, options, 4, tests+4)
+               ? 0 : 1;
+   rewind (infile);
+   failures += test_multiple (infile, filename, BTO_FULL, options, 4, tests+12)
+               ? 0 : 1;
+   fclose (infile);
+
+   failures += test_wholefile (DATA_DIR "/" "simple.bib",
+                               BTO_MINIMAL, options, 4, tests+4)
+               ? 0 : 1;
+   failures += test_wholefile (DATA_DIR "/" "simple.bib",
+                               BTO_FULL, options, 4, tests+12)
+               ? 0 : 1;
+
+   bt_cleanup ();
+
+   if (failures != 0)
+      printf ("%d failed tests\n", failures);
+   else
+      printf ("All tests successful\n");
+
+   return failures > 0 ? 1 : 0;
+}
diff --git a/btparse/tests/testlib.c b/btparse/tests/testlib.c
new file mode 100644
index 0000000..e101a9a
--- /dev/null
+++ b/btparse/tests/testlib.c
@@ -0,0 +1,43 @@
+/* ------------------------------------------------------------------------
+@NAME : testlib.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Code common to all btparse test programs.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/09/26, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+-------------------------------------------------------------------------- */
+
+#include "bt_config.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include "testlib.h"
+#include "my_dmalloc.h"
+
+
+/*
+ * Build "<dirname>/<basename>" in the caller's buffer and open that file
+ * for reading.
+ *
+ *   basename, dirname  path components, joined with a '/'
+ *   filename           output buffer that receives the joined path
+ *   len                size limit for 'filename'; at most len-2 characters
+ *                      plus the terminating NUL are written
+ *
+ * Returns the open stream.  Never returns NULL: if the joined path does not
+ * fit in the buffer, or the file cannot be opened, a message is printed on
+ * stderr and the program exits with status 1.
+ */
+FILE *open_file (char *basename, char *dirname, char *filename, int len)
+{
+   FILE *  file;
+   int     needed;
+
+   /* snprintf() reports the length the full path needs; anything that does
+    * not fit was truncated and must not be opened under a clipped name. */
+   needed = (len > 1)
+      ? snprintf (filename, len-1, "%s/%s", dirname, basename)
+      : -1;
+   if (needed < 0 || needed >= len-1)
+   {
+      fprintf (stderr, "%s/%s: file name too long\n", dirname, basename);
+      exit (1);
+   }
+
+   file = fopen (filename, "r");
+   if (file == NULL)
+   {
+      perror (filename);
+      exit (1);
+   }
+   return file;
+}
+
+
+/*
+ * Apply the same string-processing options to the regular, macro
+ * definition, comment and preamble metatypes, in that order.
+ */
+void set_all_stringopts (btshort options)
+{
+   static const bt_metatype metatypes[] =
+      { BTE_REGULAR, BTE_MACRODEF, BTE_COMMENT, BTE_PREAMBLE };
+   size_t  i;
+
+   for (i = 0; i < sizeof (metatypes) / sizeof (metatypes[0]); i++)
+      bt_set_stringopts (metatypes[i], options);
+}
diff --git a/btparse/tests/testlib.h b/btparse/tests/testlib.h
new file mode 100644
index 0000000..3df1999
--- /dev/null
+++ b/btparse/tests/testlib.h
@@ -0,0 +1,46 @@
+/* ------------------------------------------------------------------------
+@NAME : testlib.h
+@DESCRIPTION: Macros and prototypes common to all the btparse test programs.
+@CREATED : 1997/09/26, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+-------------------------------------------------------------------------- */
+
+#ifndef TESTLIB_H
+#define TESTLIB_H
+
+#include "btparse.h"
+
+#ifndef DATA_DIR
+# define DATA_DIR "btparse/tests/data"
+#endif
+
+
+/*
+ * CHECK(cond): if 'cond' is false, report the failed condition (its source
+ * text, file and line) on stderr and set the variable 'ok' to FALSE.
+ * A variable named 'ok' must therefore exist in the scope where the macro
+ * is used.
+ *
+ * The expansion is a bare if-block, not a do { } while (0) wrapper: the
+ * test programs invoke it both with and without a trailing semicolon, so
+ * it cannot be changed to the wrapped form.  Do not use it as the
+ * unbraced body of an if that has an else.
+ */
+#define CHECK(cond) \
+if (! (cond)) \
+{ \
+ fprintf (stderr, "failed check: %s, at %s line %d\n", \
+ #cond, __FILE__, __LINE__); \
+ ok = FALSE; \
+}
+
+/*
+ * CHECK_ESCAPE(cond,escape,what): like CHECK, but after reporting the
+ * failure and setting 'ok' to FALSE it also executes 'escape' -- a
+ * statement such as "break" or "return FALSE" supplied without its
+ * semicolon -- to abandon the enclosing construct.  If 'what' is non-null
+ * it names the thing being skipped in an extra message.
+ *
+ * As with CHECK, a variable named 'ok' must be in scope and the expansion
+ * is a bare if-block.
+ */
+#define CHECK_ESCAPE(cond,escape,what) \
+if (! (cond)) \
+{ \
+ fprintf (stderr, "failed check: %s, at %s line %d\n", \
+ #cond, __FILE__, __LINE__); \
+ if (what) \
+ { \
+ fprintf (stderr, "(skipping the rest of this %s)\n", \
+ what); \
+ } \
+ ok = FALSE; \
+ escape; \
+}
+
+
+FILE *open_file (char *basename, char *dirname, char *filename, int len);
+void set_all_stringopts (btshort options);
+
+
+#endif /* TESTLIB_H */
diff --git a/btparse/tests/tex_test.c b/btparse/tests/tex_test.c
new file mode 100644
index 0000000..e157264
--- /dev/null
+++ b/btparse/tests/tex_test.c
@@ -0,0 +1,42 @@
+/* $Id$ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "btparse.h"
+
+/*
+ * Read lines from stdin; for each one build a TeX tree, dump it, flatten
+ * it back to a string and report when the round trip does not reproduce
+ * the input line.  Always returns 0.
+ */
+int main (void)
+{
+   char           line[1024];
+   size_t         len;
+   bt_tex_tree *  tree;
+   char *         str;
+
+   /* Loop on fgets() itself: it returns NULL on end of file and on a read
+    * error, whereas testing feof() alone never terminates after an error. */
+   while (fgets (line, sizeof (line), stdin) != NULL)
+   {
+      /* Strip the newline; guard against an empty string so that
+       * line[len-1] is never read out of bounds. */
+      len = strlen (line);
+      if (len > 0 && line[len-1] == '\n') line[len-1] = '\0';
+
+      tree = bt_build_tex_tree (line);
+
+      /* NOTE(review): the tree is never released here; if btparse.h
+       * offers a matching free function it should be called -- confirm. */
+      if (tree)
+      {
+         printf ("tree =\n");
+         bt_dump_tex_tree (tree, 0, stdout);
+
+         str = bt_flatten_tex_tree (tree);
+         printf ("flattened tree = [%s]\n", str);
+         if (strcmp (line, str) != 0)
+            printf ("uh-oh! line and str don't match!\n");
+         free (str);
+      }
+   }
+   return 0;
+}
diff --git a/examples/append_entries b/examples/append_entries
new file mode 100755
index 0000000..b12ee29
--- /dev/null
+++ b/examples/append_entries
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+
+#
+# append_entries
+#
+# Appends entries from a source file to a destination file. Only regular
+# entries are copied; macro definitions, preambles, and comments are
+# dropped. User may supply a regex which the entry keys must match to be
+# appended.
+#
+# Note that a "real" entry appender/database merger would be a lot more
+# complicated than this! Some things that would have to be handled:
+# * enforcing the structure of input entries (eg. making sure they
+# conform to the rules of some database structure such as 'Bib')
+# * doing any other checks particular to your database, such as ensuring
+# that journal or conference names come from an approved list of
+# "known" journals and conferences (to enforce consistent naming
+# across a large database)
+# * detecting and resolving key collisions
+# * adding any preambles in the source file to the destination file
+# * checking for duplicate macro definitions
+# * ensuring that macros used in the source file are defined in
+# the destination file
+#
+# by Greg Ward, 1998/04/04
+#
+# $Id$
+#
+
+# Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+# is part of the Text::BibTeX library. This library is free software; you
+# may redistribute it and/or modify it under the same terms as Perl itself.
+
+use strict;
+use Text::BibTeX;
+
+my $usage = <<USAGE;
+usage: $0 dest_file source_file [key_pattern]
+ appends regular entries from <source_file> whose keys match
+ <key_pattern> to <dest_file>; if <key_pattern> not supplied, all
+ entries from <source_file> are taken
+USAGE
+
+die $usage unless @ARGV == 2 || @ARGV == 3;
+my ($dest_filename, $source_filename, $key_pattern) = @ARGV;
+
+# Open the two files: dest_file in append mode (ultimately just using
+# perl's builtin 'open'), and source_file in regular read-only mode.
+my $dest_file = Text::BibTeX::File->new(">>$dest_filename")
+    or die "couldn't open $dest_filename for appending: $!\n";
+my $source_file = Text::BibTeX::File->new($source_filename)
+    or die "couldn't open $source_filename: $!\n";
+
+# Turn on 'value preservation' mode for the input file.  This is mainly so
+# we don't lose the fact that macros are macros and numbers are numbers,
+# but it also frees us from having to worry about predefined macros
+# (such as the month names).
+$source_file->preserve_values (1);
+
+# And loop over all entries in the source file, optionally appending
+# each one to the destination file.
+
+while (my $entry = Text::BibTeX::Entry->new($source_file))
+{
+    # Skip anything the parser could not make sense of: appending an entry
+    # that had syntax errors would put damaged data into the destination.
+    unless ($entry->parse_ok)
+    {
+        warn "skipping an entry with syntax errors in $source_filename\n";
+        next;
+    }
+
+    # Skip this entry if it's not a regular entry -- that is, we just
+    # drop '@string', '@comment', and '@preamble' entries, probably
+    # unacceptable in the real world.
+    next unless $entry->metatype == BTE_REGULAR;
+
+    # Skip this entry if the user supplied a regex that keys must match
+    # and this entry's key doesn't match.
+    next if defined $key_pattern && $entry->key !~ /$key_pattern/o;
+
+    # Otherwise, write this entry to the destination file.  Since $dest_file
+    # was opened in append mode, $entry will be appended to the end of
+    # $dest_file.
+    $entry->write ($dest_file);
+} # while $source_file
diff --git a/inc/MyBuilder.pm b/inc/MyBuilder.pm
new file mode 100644
index 0000000..d6d2db1
--- /dev/null
+++ b/inc/MyBuilder.pm
@@ -0,0 +1,403 @@
+package MyBuilder;
+use base 'Module::Build';
+
+use warnings;
+use strict;
+
+use Config;
+use Carp;
+
+use Config::AutoConf;
+use ExtUtils::LibBuilder;
+
+use ExtUtils::ParseXS;
+use ExtUtils::Mkbootstrap;
+
+use File::Spec::Functions qw.catdir catfile.;
+use File::Path qw.mkpath.;
+use Cwd 'abs_path';
+
+my @EXTRA_FLAGS = ();
+my @BINARIES = qw(biblex bibparse dumpnames);
+
+## debug
+## @EXTRA_FLAGS = ('-g', "-DDEBUG=2");
+
+# Install action.  Before delegating to Module::Build's own install step it
+# fixes up platform details for the bundled libbtparse:
+#   * darwin: rewrites the install name recorded in the library, in each
+#     program listed in @BINARIES and in the XS bundle with
+#     install_name_tool;
+#   * cygwin: installs the library into /usr/local/bin;
+#   * otherwise, when an install_base is set: installs it into
+#     <install_base>/lib.
+# Afterwards it runs ldconfig on Linux when invoked as root, and warns when
+# the library directory is not /usr/lib or /usr/local/lib.
+sub ACTION_install {
+    my $self = shift;
+
+    my $usrlib = $self->install_path( 'usrlib' );
+
+    if ($^O =~ /darwin/i) {
+        # NOTE(review): $LIBEXT and $EXEEXT are not declared in this sub or
+        # at file scope in the visible code; presumably they are imported
+        # from ExtUtils::LibBuilder -- confirm.
+        my $libpath = $self->notes('lib_path');
+        $libpath = catfile($libpath, "libbtparse$LIBEXT");
+
+        # do it for binaries as well.
+        `install_name_tool -id "$libpath" ./blib/usrlib/libbtparse.dylib`;
+        # binaries
+        my $libfile = "btparse/src/libbtparse$LIBEXT";
+        my $abs_path = abs_path($libfile);
+        foreach my $bin (@BINARIES) {
+            `install_name_tool -change "$abs_path" "$libpath" ./blib/bin/$bin$EXEEXT`;
+        }
+        my $bundle = $self->notes("bundle");
+        `install_name_tool -change "$abs_path" "$libpath" $bundle`;
+    }
+
+    if ($^O =~ /cygwin/i) { # cygwin uses windows lib searching (PATH instead of LD_LIBRARY_PATH)
+        $self->install_path( 'usrlib' => '/usr/local/bin' );
+    }
+    elsif (defined $self->{properties}{install_base}) {
+        $usrlib = catdir($self->{properties}{install_base} => 'lib');
+        $self->install_path( 'usrlib' => $usrlib );
+    }
+    $self->SUPER::ACTION_install;
+
+    # $ENV{USER} is not guaranteed to be set (cron, containers, sudo
+    # setups), and check_prog() may find nothing: test for definedness so
+    # neither comparison operates on undef.
+    if ($^O =~ /linux/ && defined $ENV{USER} && $ENV{USER} eq 'root') {
+        my $linux = Config::AutoConf->check_prog("ldconfig");
+        system $linux if defined $linux && -x $linux;
+    }
+    if ($^O =~ /(?:linux|bsd|sun|sol|dragonfly|hpux|irix|darwin|gnu)/
+        &&
+        $usrlib !~ m!^/usr(/local)?/lib/?$!)
+    {
+        warn "\n** WARNING **\n"
+           . "It seems you are installing in a non standard path.\n"
+           . "You might need to add $usrlib to your library search path.\n";
+    }
+}
+
+# Build action: prepares the blib directories, probes the C environment,
+# generates btparse/src/bt_config.h from its template, then dispatches the
+# individual build steps before handing over to Module::Build.
+sub ACTION_code {
+    my $self = shift;
+
+    foreach my $dir (catdir("blib","bindoc"), catdir("blib","bin")) {
+        next if -d $dir;
+        mkpath $dir;
+    }
+
+    # Kept in the notes so the later steps can reuse the same builder.
+    my $libbuilder = ExtUtils::LibBuilder->new;
+    $self->notes('libbuilder', $libbuilder);
+
+    my $version = $self->notes('btparse_version');
+
+    # Each probe yields the text that follows '#' in the generated header.
+    my $alloca_h  = Config::AutoConf->check_header("alloca.h")
+                  ? 'define HAVE_ALLOCA_H 1'
+                  : 'undef HAVE_ALLOCA_H';
+    my $vsnprintf = Config::AutoConf->check_func('vsnprintf')
+                  ? 'define HAVE_VSNPRINTF 1'
+                  : 'undef HAVE_VSNPRINTF';
+    my $strlcat   = Config::AutoConf->check_func('strlcat')
+                  ? 'define HAVE_STRLCAT 1'
+                  : 'undef HAVE_STRLCAT';
+
+    _interpolate("btparse/src/bt_config.h.in",
+                 "btparse/src/bt_config.h",
+                 PACKAGE   => "\"libbtparse\"",
+                 FPACKAGE  => "\"libbtparse $version\"",
+                 VERSION   => "\"$version\"",
+                 ALLOCA_H  => $alloca_h,
+                 VSNPRINTF => $vsnprintf,
+                 STRLCAT   => $strlcat
+                );
+
+    # Order matters: objects before the library, the library before
+    # anything that links against it.
+    my @steps = qw(create_manpages create_objects create_library
+                   create_binaries create_tests compile_xscode);
+    foreach my $step (@steps) {
+        $self->dispatch($step);
+    }
+
+    $self->SUPER::ACTION_code;
+}
+
+# Turns xscode/BibTeX.xs into the loadable Text::BibTeX extension:
+# XS -> C -> object file, then the .bs bootstrap file, then the final link
+# against libbtparse.  Each stage is skipped when its output is up to date.
+sub ACTION_compile_xscode {
+    my $self     = shift;
+    my $cbuilder = $self->cbuilder;
+
+    my $archdir = catdir( $self->blib, 'arch', 'auto', 'Text', 'BibTeX');
+    mkpath( $archdir, 0, 0777 ) unless -d $archdir;
+
+    print STDERR "\n** Preparing XS code\n";
+    my $c_source  = catfile("xscode","BibTeX.c");
+    my $xs_source = catfile("xscode","BibTeX.xs");
+
+    $self->add_to_cleanup($c_source); ## FIXME
+    unless ($self->up_to_date($xs_source, $c_source)) {
+        ExtUtils::ParseXS::process_file( filename   => $xs_source,
+                                         prototypes => 0,
+                                         output     => $c_source);
+    }
+
+    my $object = catfile("xscode","BibTeX.o");
+    $self->add_to_cleanup($object); ## FIXME
+    unless ($self->up_to_date($c_source, $object)) {
+        $cbuilder->compile( source               => $c_source,
+                            extra_compiler_flags => [@EXTRA_FLAGS],
+                            include_dirs         => [ catdir("btparse","src") ],
+                            object_file          => $object);
+    }
+
+    # Create .bs bootstrap file, needed by Dynaloader.
+    my $bootstrap = catfile( $archdir, "BibTeX.bs" );
+    unless ( $self->up_to_date( $object, $bootstrap ) ) {
+        ExtUtils::Mkbootstrap::Mkbootstrap($bootstrap);
+        unless ( -f $bootstrap ) {
+            # Create file in case Mkbootstrap didn't do anything.
+            open( my $fh, '>', $bootstrap ) or confess "Can't open $bootstrap: $!";
+        }
+        utime( (time) x 2, $bootstrap ); # touch
+    }
+
+    my $objects = $self->rscan_dir("xscode",qr/\.o$/);
+    # .o => .(a|bundle)
+    my $bundle = catfile( $archdir, "BibTeX.$Config{dlext}" );
+    $self->notes("bundle", $bundle); # useful for darwin
+    unless ( $self->up_to_date( [ @$objects ], $bundle ) ) {
+        my $btparselibdir = $self->install_path('usrlib');
+        $cbuilder->link(
+                        module_name        => 'Text::BibTeX',
+                        extra_linker_flags => "-Lbtparse/src -lbtparse ",
+                        objects            => $objects,
+                        lib_file           => $bundle,
+                       );
+    }
+}
+
+# Generates section-1 man pages under blib/bindoc from every .pod file in
+# btparse/doc and from btool_faq.pod, skipping pages that are up to date.
+# A failing pod2man run is reported with a warning; the build carries on.
+sub ACTION_create_manpages {
+    my $self = shift;
+
+    print STDERR "\n** Creating Manpages\n";
+
+    my $pods = $self->rscan_dir(catdir("btparse","doc"), qr/\.pod$/);
+
+    my $version = $self->notes('btparse_version');
+
+    # Argument list instead of a shell command line: no quoting problems
+    # with spaces or metacharacters in the version string or the paths.
+    my @pod2man = ('pod2man', '--section=1', '--center=btparse',
+                   "--release=btparse, version $version");
+
+    for my $pod (@$pods) {
+        my $man = $pod;
+        # Replace only a literal ".pod" suffix; an unescaped, unanchored
+        # pattern would also match e.g. "xpod" in the middle of a path.
+        $man =~ s!\.pod\z!.1!;
+        $man =~ s!btparse/doc!blib/bindoc!; ## FIXME - path
+        next if $self->up_to_date($pod, $man);
+        system(@pod2man, $pod, $man) == 0
+            or warn "pod2man failed for $pod\n";
+    }
+
+    my $pod = 'btool_faq.pod';
+    my $man = catfile('blib','bindoc','btool_faq.1');
+    unless ($self->up_to_date($pod, $man)) {
+        system(@pod2man, $pod, $man) == 0
+            or warn "pod2man failed for $pod\n";
+    }
+}
+
+# Compiles every .c file found under btparse/progs, btparse/src,
+# btparse/tests and xscode/ into an object file next to its source,
+# skipping files whose object is already up to date.
+sub ACTION_create_objects {
+    my $self = shift;
+    my $cbuilder = $self->cbuilder;
+
+    print STDERR "\n** Compiling C files\n";
+    my $c_progs = $self->rscan_dir('btparse/progs', qr/\.c$/);
+    my $c_src   = $self->rscan_dir('btparse/src', qr/\.c$/);
+    my $c_tests = $self->rscan_dir('btparse/tests', qr/\.c$/);
+    my $c_xs    = $self->rscan_dir('xscode/', qr/\.c$/);
+
+    my @c_files = (@$c_progs, @$c_src, @$c_tests, @$c_xs);
+    for my $file (@c_files) {
+        my $object = $file;
+        # Anchor at the end: only the extension may change, never a ".c"
+        # that happens to occur earlier in the path.
+        $object =~ s/\.c\z/.o/;
+        next if $self->up_to_date($file, $object);
+        $cbuilder->compile(object_file => $object,
+                           extra_compiler_flags=>["-D_FORTIFY_SOURCE=1",@EXTRA_FLAGS],
+                           source => $file,
+                           include_dirs => ["btparse/src"]);
+    }
+}
+
+
+# Links the command-line programs listed in @BINARIES against libbtparse
+# and copies them into blib/bin.  Except on darwin, the library install
+# directory is passed to the linker as a run-time search path.
+sub ACTION_create_binaries {
+    my $self          = shift;
+    my $cbuilder      = $self->cbuilder;
+    my $libbuilder    = $self->notes('libbuilder');
+    my $EXEEXT        = $libbuilder->{exeext};
+    my $btparselibdir = $self->install_path('usrlib');
+
+    print STDERR "\n** Creating binaries (",join(", ", map { $_.$EXEEXT } @BINARIES), ")\n";
+
+    my $rpath_flag = "";
+    $rpath_flag = "-Wl,-R${btparselibdir}" if $^O !~ /darwin/;
+    my $extra_linker_flags = "-Lbtparse/src $rpath_flag -lbtparse ";
+
+    my @built;
+
+    foreach my $bin (@BINARIES) {
+        my $exe_file = catfile("btparse","progs","$bin$EXEEXT");
+        push @built, $exe_file;
+
+        # bibparse is made of several objects; the others of just one.
+        my @parts = $bin eq "bibparse"      # hack for now
+                  ? qw(bibparse args getopt getopt1)
+                  : ($bin);
+        my $objects = [ map { catfile("btparse","progs","$_.o") } @parts ];
+
+        next if $self->up_to_date($objects, $exe_file);
+        $libbuilder->link_executable(exe_file           => $exe_file,
+                                     objects            => $objects,
+                                     extra_linker_flags => $extra_linker_flags);
+    }
+
+    foreach my $file (@built) {
+        $self->copy_if_modified( from    => $file,
+                                 to_dir  => "blib/bin",
+                                 flatten => 1);
+    }
+
+}
+
+# Links the C test programs in btparse/tests against libbtparse.  Every
+# program is built from its own object file; simple_test and read_test
+# additionally need the shared testlib object.
+sub ACTION_create_tests {
+    my $self = shift;
+    my $cbuilder = $self->cbuilder;
+
+    my $libbuilder = $self->notes('libbuilder');
+    my $EXEEXT = $libbuilder->{exeext};
+
+    print STDERR "\n** Creating test binaries\n";
+
+    # [ program name, object basenames... ], built in this order.
+    my @programs = (
+        [ 'simple_test',      qw(simple_test testlib) ],
+        [ 'read_test',        qw(read_test testlib)   ],
+        [ 'postprocess_test', qw(postprocess_test)    ],
+        [ 'tex_test',         qw(tex_test)            ],
+        [ 'macro_test',       qw(macro_test)          ],
+        [ 'name_test',        qw(name_test)           ],
+        [ 'purify_test',      qw(purify_test)         ],
+    );
+
+    foreach my $program (@programs) {
+        my ($name, @parts) = @$program;
+
+        my $exe_file = catfile("btparse","tests","$name$EXEEXT");
+        my $objects  = [ map { catfile("btparse","tests","$_.o") } @parts ];
+
+        next if $self->up_to_date($objects, $exe_file);
+        $libbuilder->link_executable(exe_file           => $exe_file,
+                                     extra_linker_flags => '-Lbtparse/src -lbtparse ',
+                                     objects            => $objects);
+    }
+}
+
+# Links the btparse object files into the libbtparse shared library inside
+# btparse/src and copies the result into blib/usrlib.
+sub ACTION_create_library {
+    my $self = shift;
+    my $cbuilder = $self->cbuilder;
+
+    my $libbuilder = $self->notes('libbuilder');
+    my $LIBEXT = $libbuilder->{libext};
+
+    print STDERR "\n** Creating libbtparse$LIBEXT\n";
+
+    my @modules = qw:init input bibtex err scan error
+                     lex_auxiliary parse_auxiliary bibtex_ast sym
+                     util postprocess macros traversal modify
+                     names tex_tree string_util format_name:;
+
+    my @objects = map { "btparse/src/$_.o" } @modules;
+
+    my $libpath = $self->notes('lib_path');
+    $libpath = catfile($libpath, "libbtparse$LIBEXT");
+    my $libfile = "btparse/src/libbtparse$LIBEXT";
+
+    # darwin records the build-tree location as the install name; ".so"
+    # platforms get a soname; everything else needs no extra flag.
+    my $extra_linker_flags
+        = $^O =~ /darwin/  ? "-install_name " . abs_path($libfile)
+        : $LIBEXT eq ".so" ? "-Wl,-soname,libbtparse$LIBEXT"
+        :                    "";
+
+    unless ($self->up_to_date(\@objects, $libfile)) {
+        $libbuilder->link(module_name        => 'btparse',
+                          objects            => \@objects,
+                          lib_file           => $libfile,
+                          extra_linker_flags => $extra_linker_flags);
+    }
+
+    my $libdir = catdir($self->blib, 'usrlib');
+    mkpath( $libdir, 0, 0777 ) unless -d $libdir;
+
+    $self->copy_if_modified( from    => $libfile,
+                             to_dir  => $libdir,
+                             flatten => 1 );
+}
+
+# Test action: points the platform's dynamic-library search variable at
+# blib/usrlib, where the freshly built libbtparse lives, then runs the
+# standard Module::Build tests.
+sub ACTION_test {
+    my $self = shift;
+
+    my $usrlib = catdir($self->blib, "usrlib");
+
+    if ($^O =~ /darwin/i) {
+        $ENV{DYLD_LIBRARY_PATH} = $usrlib;
+    }
+    elsif ($^O =~ /(?:linux|bsd|sun|sol|dragonfly|hpux|irix|gnu)/i) {
+        $ENV{LD_LIBRARY_PATH} = $usrlib;
+    }
+    elsif ($^O =~ /aix/i) {
+        my $oldlibpath = $ENV{LIBPATH} || '/lib:/usr/lib';
+        $ENV{LIBPATH} = $usrlib . ":$oldlibpath";
+    }
+    elsif ($^O =~ /cygwin/i) {
+        # cygwin uses windows lib searching (PATH instead of LD_LIBRARY_PATH)
+        my $oldpath = $ENV{PATH};
+        $ENV{PATH} = $usrlib . ":$oldpath";
+    }
+    elsif ($^O =~ /mswin32/i) {
+        my $oldpath = $ENV{PATH};
+        $ENV{PATH} = $usrlib . ";$oldpath";
+    }
+
+    return $self->SUPER::ACTION_test;
+}
+
+
+# Copies the template file $from to $to, replacing every "[% NAME %]"
+# placeholder with $config{NAME}.
+#
+#   $from    path of the template to read
+#   $to      path of the file to (over)write
+#   %config  placeholder name => replacement text
+#
+# Dies if either file cannot be opened or if the output cannot be written
+# completely.  A placeholder without an entry in %config is replaced by
+# nothing.
+sub _interpolate {
+    my ($from, $to, %config) = @_;
+
+    print "Creating new '$to' from '$from'.\n";
+
+    # Three-argument open with lexical handles: the mode cannot be
+    # influenced by the file name and the handles cannot clash with
+    # package globals.
+    open my $in,  '<', $from or die "Cannot open file '$from' for reading: $!\n";
+    open my $out, '>', $to   or die "Cannot open file '$to' for writing: $!\n";
+
+    while (my $line = <$in>) {
+        $line =~ s/\[%\s*(\S+)\s*%\]/$config{$1}/ge;
+        print {$out} $line;
+    }
+
+    # Buffered write errors only surface when the handle is closed.
+    close $out or die "Cannot finish writing file '$to': $!\n";
+    close $in;
+}
+
+
+1;
diff --git a/lib/Text/BibTeX.pm b/lib/Text/BibTeX.pm
new file mode 100644
index 0000000..41cc2c1
--- /dev/null
+++ b/lib/Text/BibTeX.pm
@@ -0,0 +1,809 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX.pm
+# DESCRIPTION: Code for the Text::BibTeX module; loads up everything
+# needed for parsing BibTeX files (both Perl and C code).
+# CREATED : February 1997, Greg Ward
+# MODIFIED :
+# VERSION : $Id: BibTeX.pm 7274 2009-05-03 17:18:14Z ambs $
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX;
+use Text::BibTeX::Name;
+use Text::BibTeX::NameFormat;
+
+use 5.008001; # needed for Text::BibTeX::Entry
+
+use strict;
+use Carp;
+use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $AUTOLOAD);
+
+require Exporter;
+require DynaLoader;
+
+our $VERSION='0.85';
+
+@ISA = qw(Exporter DynaLoader);
+%EXPORT_TAGS = (nodetypes => [qw(BTAST_STRING BTAST_MACRO BTAST_NUMBER)],
+ metatypes => [qw(BTE_UNKNOWN BTE_REGULAR BTE_COMMENT
+ BTE_PREAMBLE BTE_MACRODEF)],
+ nameparts => [qw(BTN_FIRST BTN_VON BTN_LAST BTN_JR BTN_NONE)],
+ joinmethods => [qw(BTJ_MAYTIE BTJ_SPACE
+ BTJ_FORCETIE BTJ_NOTHING)],
+ subs => [qw(bibloop split_list
+ purify_string change_case)],
+ macrosubs => [qw(add_macro_text
+ delete_macro
+ delete_all_macros
+ macro_length
+ macro_text)]);
+@EXPORT_OK = (@{$EXPORT_TAGS{'subs'}},
+ @{$EXPORT_TAGS{'macrosubs'}},
+ @{$EXPORT_TAGS{'nodetypes'}},
+ @{$EXPORT_TAGS{'nameparts'}},
+ @{$EXPORT_TAGS{'joinmethods'}},
+ 'check_class', 'display_list' );
+@EXPORT = @{$EXPORT_TAGS{'metatypes'}};
+
+use Encode 'encode', 'decode';
+use Unicode::Normalize;
+
+
+# Post-process a string that came back from the C library.
+#
+#   $self     - invocant (unused)
+#   $result   - the string to process
+#   $encoding - binmode in effect; only the exact value "utf-8" triggers
+#               any processing
+#   $norm     - Unicode normalization form: NFC (default), NFD, NFKC
+#               or NFKD
+#
+# For "utf-8", $result is decoded to characters (unless it already is a
+# character string) and normalized.  For any other encoding it is
+# returned unchanged.  Croaks on an unknown normalization form.
+sub _process_result {
+    my ( $self, $result, $encoding, $norm ) = @_;
+
+    return $result
+        unless defined $encoding && $encoding eq "utf-8";
+
+    $norm ||= "NFC";    # best to force it here.
+
+    # Explicit table instead of a symbolic code reference, so that a
+    # caller-supplied form name can only ever reach one of the
+    # Unicode::Normalize functions.
+    my %normalizer_for = (
+        NFC  => \&NFC,
+        NFD  => \&NFD,
+        NFKC => \&NFKC,
+        NFKD => \&NFKD,
+    );
+    my $normsub = $normalizer_for{$norm}
+        or croak("Unknown Unicode normalization form \"$norm\"");
+
+    $result = decode( $encoding, $result )
+        unless utf8::is_utf8($result);
+
+    return $normsub->($result);
+}
+
+# Prepare a Perl string for handing to the C library.
+#
+# When the binmode is "utf-8" and $value is a character string (its UTF-8
+# flag is set), the encoded byte string is returned.  In every other case
+# $value is returned as it was given.
+sub _process_argument {
+    my ( $self, $value, $encoding ) = @_;
+
+    my $needs_encoding = $encoding eq "utf-8" && utf8::is_utf8($value);
+    return $value unless $needs_encoding;
+
+    return encode( $encoding, $value );
+}
+
+# Split $field on $delim via the XS function isplit_list(), then run each
+# resulting piece through _process_result().
+#
+#   $field, $delim, $filename, $line, $desc
+#         - passed straight through to isplit_list()
+#   $opts - optional hash ref; recognised keys are
+#             binmode       (default 'bytes')
+#             normalization (default 'NFC')
+#
+# Returns the list of processed pieces.  The caller's $opts hash is only
+# read, never modified.
+sub split_list {
+    my ( $field, $delim, $filename, $line, $desc, $opts ) = @_;
+
+    # Resolve the defaults into locals rather than writing them back into
+    # $opts, which belongs to the caller and may be reused for other calls.
+    my $binmode       = ( $opts && $opts->{binmode} )       || 'bytes';
+    my $normalization = ( $opts && $opts->{normalization} ) || 'NFC';
+
+    return
+        map { Text::BibTeX->_process_result( $_, $binmode, $normalization ) }
+        Text::BibTeX::isplit_list( $field, $delim, $filename, $line, $desc );
+}
+
+=encoding UTF-8
+
+=head1 NAME
+
+Text::BibTeX - interface to read and parse BibTeX files
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX;
+
+ my $bibfile = Text::BibTeX::File->new("foo.bib");
+ my $newfile = Text::BibTeX::File->new(">newfoo.bib");
+
+ while ($entry = Text::BibTeX::Entry->new($bibfile))
+ {
+ next unless $entry->parse_ok;
+
+ . # hack on $entry contents, using various
+ . # Text::BibTeX::Entry methods
+ .
+
+ $entry->write ($newfile);
+ }
+
+=head1 DESCRIPTION
+
+The C<Text::BibTeX> module serves mainly as a high-level introduction to
+the C<Text::BibTeX> library, for both code and documentation purposes.
+The code loads the two fundamental modules for processing BibTeX files
+(C<Text::BibTeX::File> and C<Text::BibTeX::Entry>), and this
+documentation gives a broad overview of the whole library that isn't
+available in the documentation for the individual modules that comprise
+it.
+
+In addition, the C<Text::BibTeX> module provides a number of
+miscellaneous functions that are useful in processing BibTeX data
+(especially the kind that comes from bibliographies as defined by BibTeX
+0.99, rather than generic database files). These functions don't
+generally fit in the object-oriented class hierarchy centred around the
+C<Text::BibTeX::Entry> class, mainly because they are specific to
+bibliographic data and operate on generic strings (rather than being
+tied to a particular BibTeX entry). These are also documented here, in
+L<"MISCELLANEOUS FUNCTIONS">.
+
+Note that every module described here begins with the C<Text::BibTeX>
+prefix. For brevity, I have dropped this prefix from most class and
+module names in the rest of this manual page (and in most of the other
+manual pages in the library).
+
+=head1 MODULES AND CLASSES
+
+The C<Text::BibTeX> library includes a number of modules, many of which
+provide classes. Usually, the relationship is simple and obvious: a
+module provides a class of the same name---for instance, the
+C<Text::BibTeX::Entry> module provides the C<Text::BibTeX::Entry> class.
+There are a few exceptions, though: most obviously, the C<Text::BibTeX>
+module doesn't provide any classes itself, it merely loads two modules
+(C<Text::BibTeX::Entry> and C<Text::BibTeX::File>) that do. The other
+exceptions are mentioned in the descriptions below, and discussed in
+detail in the documentation for the respective modules.
+
+The modules are presented roughly in order of increasing specialization:
+the first three are essential for any program that processes BibTeX data
+files, regardless of what kind of data they hold. The later modules are
+specialized for use with bibliographic databases, and serve both to
+emulate BibTeX 0.99's standard styles and to provide an example of how
+to define a database structure through such specialized modules. Each
+module is fully documented in its respective manual page.
+
+=over 4
+
+=item C<Text::BibTeX>
+
+Loads the two fundamental modules (C<Entry> and C<File>), and provides a
+number of miscellaneous functions that don't fit anywhere in the class
+hierarchy.
+
+=item C<Text::BibTeX::File>
+
+Provides an object-oriented interface to BibTeX database files. In
+addition to the obvious attributes of filename and filehandle, the
+"file" abstraction manages properties such as the database structure and
+options for it.
+
+=item C<Text::BibTeX::Entry>
+
+Provides an object-oriented interface to BibTeX entries, which can be
+parsed from C<File> objects, arbitrary filehandles, or strings. Manages
+all the properties of a single entry: type, key, fields, and values.
+Also serves as the base class for the I<structured entry classes>
+(described in detail in L<Text::BibTeX::Structure>).
+
+=item C<Text::BibTeX::Value>
+
+Provides an object-oriented interface to I<values> and I<simple values>,
+high-level constructs that can be used to represent the strings
+associated with each field in an entry. Normally, field values are
+returned simply as Perl strings, with macros expanded and multiple
+strings "pasted" together. If desired, you can instruct C<Text::BibTeX>
+to return C<Text::BibTeX::Value> objects, which give you access to the
+original form of the data.
+
+=item C<Text::BibTeX::Structure>
+
+Provides the C<Structure> and C<StructuredEntry> classes, which serve
+primarily as base classes for the two kinds of classes that define
+database structures. Read this man page for a comprehensive description
+of the mechanism for implementing Perl classes analogous to BibTeX
+"style files".
+
+=item C<Text::BibTeX::Bib>
+
+Provides the C<BibStructure> and C<BibEntry> classes, which serve two
+purposes: they fulfill the same role as the standard style files of
+BibTeX 0.99, and they give an example of how to write new database
+structures. These ultimately derive from, respectively, the
+C<Structure> and C<StructuredEntry> classes provided by the C<Structure>
+module.
+
+=item C<Text::BibTeX::BibSort>
+
+One of the C<BibEntry> class's base classes: handles the generation of
+sort keys for sorting prior to output formatting.
+
+=item C<Text::BibTeX::BibFormat>
+
+One of the C<BibEntry> class's base classes: handles the formatting of
+bibliographic data for output in a markup language such as LaTeX.
+
+=item C<Text::BibTeX::Name>
+
+A class used by the C<Bib> structure and specific to bibliographic data
+as defined by BibTeX itself: parses individual author names into
+"first", "von", "last", and "jr" parts.
+
+=item C<Text::BibTeX::NameFormat>
+
+Also specific to bibliographic data: puts split-up names (as parsed by
+the C<Name> class) back together in a custom way.
+
+=back
+
+For a first time through the library, you'll probably want to confine
+your reading to L<Text::BibTeX::File> and L<Text::BibTeX::Entry>. The
+other modules will come in handy eventually, especially if you need to
+emulate BibTeX in a fairly fine grained way (e.g. parsing names,
+generating sort keys). But for the simple database hacks that are the
+bread and butter of the C<Text::BibTeX> library, the C<File> and
+C<Entry> classes are the bulk of what you'll need. You may also find
+some of the material in this manual page useful, namely L<"CONSTANT
+VALUES"> and L<"UTILITY FUNCTIONS">.
+
+=cut
+
+# Fallback for calls to subs that are not (yet) defined in this package.
+# Names beginning with "BT" are resolved through constant(); on success a
+# sub of that name is defined so it can be called directly afterwards,
+# and the value is returned.  Any other name croaks.
+sub AUTOLOAD
+{
+ # This AUTOLOAD is used to 'autoload' constants from the constant()
+ # XS function.
+
+# print "AUTOLOAD: \$AUTOLOAD=$AUTOLOAD\n";
+
+ # NOTE(review): $ok is declared but never used in this sub.
+ my ($constname, $ok, $val);
+ # Strip the package qualifier to get the bare name that was called.
+ ($constname = $AUTOLOAD) =~ s/.*:://;
+ # Landing here for 'constant' itself means constant() is not defined;
+ # warn and return rather than calling it (and thus AUTOLOAD) again.
+ carp ("Recursive AUTOLOAD--probable compilation error"), return
+ if $constname eq 'constant';
+ # Only BT* names are looked up; for anything else $val stays undefined
+ # and the croak below fires.
+ $val = constant ($constname)
+ if $constname =~ /^BT/;
+ croak ("Unknown Text::BibTeX function: \"$constname\"")
+ unless (defined $val);
+
+# print " constant ($constname) returned \"$val\"\n";
+
+ # Define a sub under the fully qualified name that was called.  $val is
+ # interpolated into the source text of that sub.
+ # NOTE(review): this presumes constant() only returns values that are
+ # valid Perl expressions as-is (e.g. plain numbers) -- confirm.
+ eval "sub $AUTOLOAD { $val }";
+ $val;
+}
+
+# Load the two fundamental classes in the Text::BibTeX hierarchy
+require Text::BibTeX::File;
+require Text::BibTeX::Entry;
+
+# Load the XSUB code that's needed to parse BibTeX entries and
+# the strings in them
+bootstrap Text::BibTeX;
+
+# For the curious: I don't put the call to &initialize into a BEGIN block,
+# because then it would come before the bootstrap above, and &initialize is
+# XS code -- bad! (The manifestation of this error is rather interesting:
+# Perl calls my AUTOLOAD routine, which then tries to call `constant', but
+# that's also an as-yet-unloaded XS routine, so it falls back to AUTOLOAD,
+# which tries to call `constant' again, ad infinitum. The moral of the
+# story: beware of what you put in BEGIN blocks in XS-dependent modules!)
+
+# initialize() runs once, when this module is loaded; the END block
+# arranges for cleanup() to run when the program finishes.
+initialize(); # these are both XS functions
+END { &cleanup; }
+
+# Predefine the month macros (see _define_months below).
+# This can't go in a BEGIN because of the .XS bootstrapping mechanism
+_define_months();
+
+# Register one macro per month: the macro name is the first three letters
+# of the lower-case English month name (jan, feb, ...) and its text is
+# the capitalised full name (January, February, ...).
+sub _define_months {
+    my @month_names = qw(january february march april may june
+                         july august september october november december);
+
+    foreach my $name (@month_names) {
+        my $abbrev = substr($name, 0, 3);
+        add_macro_text($abbrev, ucfirst($name));
+    }
+}
+
+
+=head1 EXPORTS
+
+The C<Text::BibTeX> module has a number of optional exports, most of
+them constant values described in L<"CONSTANT VALUES"> below. The
+default exports are a subset of these constant values that are used
+particularly often, the "entry metatypes" (also accessible via the
+export tag C<metatypes>). Thus, the following two lines are equivalent:
+
+ use Text::BibTeX;
+ use Text::BibTeX qw(:metatypes);
+
+Some of the various subroutines provided by the module are also
+exportable. C<bibloop>, C<split_list>, C<purify_string>, and
+C<change_case> are all useful in everyday processing of BibTeX data, but
+don't really fit anywhere in the class hierarchy. They may be imported
+from C<Text::BibTeX> using the C<subs> export tag. C<check_class> and
+C<display_list> are also exportable, but only by name; they are not
+included in any export tag. (These two mainly exist for use by other
+modules in the library.) For instance, to use C<Text::BibTeX> and
+import the entry metatype constants and the common subroutines:
+
+ use Text::BibTeX qw(:metatypes :subs);
+
+Another group of subroutines exists for direct manipulation of the macro
+table maintained by the underlying C library. These functions (see
+L<"Macro table functions">, below) allow you to define, delete, and
+query the value of BibTeX macros (or "abbreviations"). They may be
+imported I<en masse> using the C<macrosubs> export tag:
+
+ use Text::BibTeX qw(:macrosubs);
+
+=head1 CONSTANT VALUES
+
+The C<Text::BibTeX> module makes a number of constant values available.
+These correspond to the values of various enumerated types in the
+underlying C library, B<btparse>, and their meanings are more fully
+explained in the B<btparse> documentation.
+
+Each group of constants is optionally exportable using an export tag
+given in the descriptions below.
+
+=over 4
+
+=item Entry metatypes
+
+C<BTE_UNKNOWN>, C<BTE_REGULAR>, C<BTE_COMMENT>, C<BTE_PREAMBLE>,
+C<BTE_MACRODEF>. The C<metatype> method in the C<Entry> class always
+returns one of these values. The latter three describe, respectively,
+C<comment>, C<preamble>, and C<string> entries; C<BTE_REGULAR> describes
+all other entry types. C<BTE_UNKNOWN> should never be seen (it's mainly
+useful for C code that might have to detect half-baked data structures).
+See also L<btparse>. Export tag: C<metatypes>.
+
+=item AST node types
+
+C<BTAST_STRING>, C<BTAST_MACRO>, C<BTAST_NUMBER>. Used to distinguish
+the three kinds of simple values---strings, macros, and numbers. The
+C<SimpleValue> class' C<type> method always returns one of these three
+values. See also L<Text::BibTeX::Value>, L<btparse>. Export tag:
+C<nodetypes>.
+
+=item Name parts
+
+C<BTN_FIRST>, C<BTN_VON>, C<BTN_LAST>, C<BTN_JR>, C<BTN_NONE>. Used to
+specify the various parts of a name after it has been split up. These
+are mainly useful when using the C<NameFormat> class. See also
+L<bt_split_names> and L<bt_format_names>. Export tag: C<nameparts>.
+
+=item Join methods
+
+C<BTJ_MAYTIE>, C<BTJ_SPACE>, C<BTJ_FORCETIE>, C<BTJ_NOTHING>. Used to
+tell the C<NameFormat> class how to join adjacent tokens together; see
+L<Text::BibTeX::NameFormat> and L<bt_format_names>. Export tag:
+C<joinmethods>.
+
+=back
+
+=head1 UTILITY FUNCTIONS
+
+C<Text::BibTeX> provides several functions that operate outside of the
+normal class hierarchy. Of these, only C<bibloop> is likely to be of
+much use to you in writing everyday BibTeX-hacking programs; the other
+two (C<check_class> and C<display_list>) are mainly provided for the use
+of other modules in the library. They are documented here mainly for
+completeness, but also because they might conceivably be useful in other
+circumstances.
+
+=over 4
+
+=item bibloop (ACTION, FILES [, DEST])
+
+Loops over all entries in a set of BibTeX files, performing some
+caller-supplied action on each entry. FILES should be a reference to
+the list of filenames to process, and ACTION a reference to a subroutine
+that will be called on each entry. DEST, if given, should be a
+C<Text::BibTeX::File> object (opened for output) to which entries might
+be printed.
+
+The subroutine referenced by ACTION is called with exactly one argument:
+the C<Text::BibTeX::Entry> object representing the entry currently being
+processed. Information about both the entry itself and the file where
+it originated is available through this object; see
+L<Text::BibTeX::Entry>. The ACTION subroutine is only called if the
+entry was successfully parsed; any syntax errors will result in a
+warning message being printed, and that entry being skipped. Note that
+I<all> successfully parsed entries are passed to the ACTION subroutine,
+even C<preamble>, C<string>, and C<comment> entries. To skip these
+pseudo-entries and only process "regular" entries, your action
+subroutine should look something like this:
+
+ sub action {
+ my $entry = shift;
+ return unless $entry->metatype == BTE_REGULAR;
+ # process $entry ...
+ }
+
+If your action subroutine needs any more arguments, you can just create
+a closure (anonymous subroutine) as a wrapper, and pass it to
+C<bibloop>:
+
+ sub action {
+ my ($entry, $extra_stuff) = @_;
+ # ...
+ }
+
+ my $extra = ...;
+ Text::BibTeX::bibloop (sub { &action ($_[0], $extra) }, \@files);
+
+If the ACTION subroutine returns a true value and DEST was given, then
+the processed entry will be written to DEST.
+
+=cut
+
+# ----------------------------------------------------------------------
+# NAME : bibloop
+# INPUT : $action
+# $files
+# $dest
+# OUTPUT :
+# RETURNS :
+# DESCRIPTION: Loops over all entries in a set of files, calling
+# &$action on each one.
+# CREATED : summer 1996 (in original Bibtex.pm module)
+# MODIFIED : May 1997 (added to Text::BibTeX with revisions)
+# Feb 1998 (simplified and documented)
+# ----------------------------------------------------------------------
+# Loop over every entry of every file named in @$files and call $action
+# on each entry that parsed successfully.
+#
+#   $action - code ref, called with the Text::BibTeX::Entry object
+#   $files  - ref to a list of file names; the list is only read and is
+#             left untouched for the caller
+#   $dest   - optional output file object; when given, each entry for
+#             which $action returned true is written to it
+sub bibloop (&$;$)
+{
+    my ($action, $files, $dest) = @_;
+
+    # Iterate instead of shifting: the caller's list survives the call,
+    # and a false-but-valid file name such as "0" no longer silently ends
+    # the whole loop.
+    for my $file (@$files)
+    {
+        next unless defined $file && length $file;
+
+        my $bib = Text::BibTeX::File->new($file);
+
+        while (! $bib->eof())
+        {
+            my $entry = Text::BibTeX::Entry->new($bib);
+
+            # The SYNOPSIS loop relies on Entry->new returning false once
+            # nothing more can be read; stop with this file rather than
+            # calling a method on a non-object.
+            last unless $entry;
+            next unless $entry->parse_ok;
+
+            my $result = $action->($entry);
+            $entry->write ($dest, 1)
+                if ($result && $dest);
+        }
+    }
+}
+
+=item check_class (PACKAGE, DESCRIPTION, SUPERCLASS, METHODS)
+
+Ensures that a PACKAGE implements a class meeting certain requirements.
+First, it inspects Perl's symbol tables to ensure that a package named
+PACKAGE actually exists. Then, it ensures that the class named by
+PACKAGE derives from SUPERCLASS (using the universal method C<isa>).
+This derivation might be through multiple inheritance, or through
+several generations of a class hierarchy; the only requirement is that
+SUPERCLASS is somewhere in PACKAGE's tree of base classes. Finally, it
+checks that PACKAGE provides each method listed in METHODS (a reference
+to a list of method names). This is done with the universal method
+C<can>, so the methods might actually come from one of PACKAGE's base
+classes.
+
+DESCRIPTION should be a brief string describing the class that was
+expected to be provided by PACKAGE. It is used for generating warning
+messages if any of the class requirements are not met.
+
+This is mainly used by the supervisory code in
+C<Text::BibTeX::Structure>, to ensure that user-supplied structure
+modules meet the rules required of them.
+
+=cut
+
+# ----------------------------------------------------------------------
+# NAME : check_class
+# INPUT : $package - the name of a package that is expected to exist
+# $description
+# - string describing what the package is
+# $superclass
+# - a package name from which $package is expected
+# to inherit
+# $methods - ref to list of method names expected to be
+# available via $package (possibly through
+# inheritance)
+# OUTPUT :
+# RETURNS :
+# DESCRIPTION: Makes sure that a package named by $package exists
+# (by following the chain of symbol tables starting
+# at %::) Dies if not.
+# CALLERS : Text::BibTeX::Structure::new
+# CREATED : 1997/09/09, GPW
+# MODIFIED :
+# ----------------------------------------------------------------------
+# Verify that $package exists, inherits from $superclass (when one is
+# given) and can perform every method named in @$methods.  Dies with a
+# message built from $description and $package as soon as one of these
+# requirements is not met.
+sub check_class
+{
+    my ($package, $description, $superclass, $methods) = @_;
+
+    # Walk down the symbol tables from %:: one package component at a
+    # time; a missing link means the package was never defined.
+    my $symtab = \%::;
+    foreach my $part (split ('::', $package))
+    {
+        $symtab = $symtab->{$part . '::'};
+        next if defined $symtab;
+
+        die "Text::BibTeX::Structure: $description " .
+            "\"$package\" apparently not supplied\n";
+    }
+
+    # The inheritance check is skipped when no superclass was requested.
+    if ($superclass)
+    {
+        die "Text::BibTeX::Structure: $description \"$package\" " .
+            "improperly defined: ! isa ($superclass)\n"
+            unless $package->isa($superclass);
+    }
+
+    # Methods may be inherited; can() looks through the base classes too.
+    foreach my $wanted (@$methods)
+    {
+        next if $package->can($wanted);
+
+        die "Text::BibTeX::Structure: $description \"$package\" " .
+            "improperly defined: no method \"$wanted\"\n";
+    }
+} # &check_class
+
+
+=item display_list (LIST, QUOTE)
+
+Converts a list of strings to the grammatical conventions of a human
+language (currently, only English rules are supported). LIST must be a
+reference to a list of strings. If this list is empty, the empty string
+is returned. If it has one element, then just that element is
+returned. If it has two elements, then they are joined with the string
+C<" and "> and the resulting string is returned. Otherwise, the list
+has I<N> elements for I<N> E<gt>= 3; elements 1..I<N>-1 are joined with
+commas, and the final element is tacked on with an intervening
+C<", and ">.
+
+If QUOTE is true, then each string is encased in single quotes before
+anything else is done.
+
+This is used elsewhere in the library for two very distinct purposes:
+for generating warning messages describing lists of fields that should
+be present or are conflicting in an entry, and for generating lists of
+author names in formatted bibliographies.
+
+=cut
+
+# ----------------------------------------------------------------------
+# NAME : display_list
+# INPUT : $list - reference to list of strings to join
+# $quote - if true, they will be single-quoted before join
+# OUTPUT :
+# RETURNS : elements of @$list, joined together into a single string
+# with commas and 'and' as appropriate
+# DESCRIPTION: Formats a list of strings for display as English text.
+# CALLERS : Text::BibTeX::Structure::check_interacting_fields
+# CALLS :
+# CREATED : 1997/09/23, GPW
+# MODIFIED :
+# ----------------------------------------------------------------------
+# Join the strings in @$list into one English phrase:
+#   no elements    -> ''
+#   one element    -> that element
+#   two elements   -> "A and B"
+#   three or more  -> "A, B, and C"
+# When $quote is true, every element is wrapped in single quotes first.
+sub display_list
+{
+    my ($list, $quote) = @_;
+
+    my $count = @$list;
+    return '' unless $count;
+
+    my @items = @$list;
+    @items = map { "'$_'" } @items if $quote;
+
+    return $items[0]              if $count == 1;
+    return join (' and ', @items) if $count == 2;
+
+    # Three or more: commas between all elements, with "and" before the
+    # final one.
+    my $final = pop @items;
+    return join (', ', @items) . ', and ' . $final;
+}
+
+
+=back
+
+=head1 MISCELLANEOUS FUNCTIONS
+
+In addition to loading the C<File> and C<Entry> modules, C<Text::BibTeX>
+loads the XSUB code which bridges the Perl modules to the underlying C
+library, B<btparse>. This XSUB code provides a number of miscellaneous
+utility functions, most of which are put into other packages in the
+C<Text::BibTeX> family for use by the corresponding classes. (For
+instance, the XSUB code loaded by C<Text::BibTeX> provides a function
+C<Text::BibTeX::Entry::parse>, which is actually documented as the
+C<parse> method of the C<Text::BibTeX::Entry> class---see
+L<Text::BibTeX::Entry>.) However, for completeness this function---and
+all the other functions that become available when you C<use
+Text::BibTeX>---are at least mentioned here. The only functions from
+this group that you're ever likely to use are described in L<"Generic
+string-processing functions">.
+
+=head2 Startup/shutdown functions
+
+These just initialize and shut down the underlying C library. Don't call
+either one of them; the C<Text::BibTeX> startup/shutdown code takes care
+of it as appropriate. They're just mentioned here for completeness.
+
+=over 4
+
+=item initialize ()
+
+=item cleanup ()
+
+=back
+
+=head2 Generic string-processing functions
+
+=over 4
+
+=item split_list (STRING, DELIM [, FILENAME [, LINE [, DESCRIPTION [, OPTS]]]])
+
+Splits a string on a fixed delimiter according to the BibTeX rules for
+splitting up lists of names. With BibTeX, the delimiter is hard-coded
+as C<"and">; here, you can supply any string. Instances of DELIM in
+STRING are considered delimiters if they are at brace-depth zero,
+surrounded by whitespace, and not at the beginning or end of STRING; the
+comparison is case-insensitive. See L<bt_split_names> for full details
+of how splitting is done (it's I<not> the same as Perl's C<split>
+function). OPTS is a hash ref of the same binmode and normalization
+arguments as with, e.g. Text::BibTeX::File->open(). split_list calls isplit_list()
+internally but handles UTF-8 conversion and normalization, if requested.
+
+Returns the list of strings resulting from splitting STRING on DELIM.
+
+=item isplit_list (STRING, DELIM [, FILENAME [, LINE [, DESCRIPTION]]])
+
+Splits a string on a fixed delimiter according to the BibTeX rules for
+splitting up lists of names. With BibTeX, the delimiter is hard-coded
+as C<"and">; here, you can supply any string. Instances of DELIM in
+STRING are considered delimiters if they are at brace-depth zero,
+surrounded by whitespace, and not at the beginning or end of STRING; the
+comparison is case-insensitive. See L<bt_split_names> for full details
+of how splitting is done (it's I<not> the same as Perl's C<split>
+function). This function returns bytes. Use C<Text::BibTeX::split_list> to specify
+the same binmode and normalization arguments as with, e.g., C<< Text::BibTeX::File->open() >>.
+
+Returns the list of strings resulting from splitting STRING on DELIM.
+
+=item purify_string (STRING [, OPTIONS])
+
+"Purifies" STRING in the BibTeX way (usually for generation of sort
+keys). See L<bt_misc> for details; note that, unlike the C interface,
+C<purify_string> does I<not> modify STRING in-place. A purified copy of
+the input string is returned.
+
+OPTIONS is currently unused.
+
+=item change_case (TRANSFORM, STRING [, OPTIONS])
+
+Transforms the case of STRING according to TRANSFORM (a single
+character, one of C<'u'>, C<'l'>, or C<'t'>). See L<bt_misc> for
+details; again, C<change_case> differs from the C interface in that
+STRING is not modified in-place---the input string is copied, and the
+transformed copy is returned.
+
+=back
+
+=head2 Entry-parsing functions
+
+Although these functions are provided by the C<Text::BibTeX> module,
+they are actually in the C<Text::BibTeX::Entry> package. That's because
+they are implemented in C, and thus loaded with the XSUB code that
+C<Text::BibTeX> loads; however, they are actually methods in the
+C<Text::BibTeX::Entry> class. Thus, they are documented as methods in
+L<Text::BibTeX::Entry>.
+
+=over 4
+
+=item parse (ENTRY_STRUCT, FILENAME, FILEHANDLE)
+
+=item parse_s (ENTRY_STRUCT, TEXT)
+
+=back
+
+=head2 Macro table functions
+
+These functions allow direct access to the macro table maintained by
+B<btparse>, the C library underlying C<Text::BibTeX>. In the normal
+course of events, macro definitions always accumulate, and are only
+defined as a result of parsing a macro definition (C<@string>) entry.
+B<btparse> never deletes old macro definitions for you, and doesn't have
+any built-in default macros. If, for example, you wish to start fresh
+with new macros for every file, use C<delete_all_macros>. If you wish
+to pre-define certain macros, use C<add_macro_text>. (But note that the
+C<Bib> structure, as part of its mission to emulate BibTeX 0.99, defines
+the standard "month name" macros for you.)
+
+See also L<bt_macros> in the B<btparse> documentation for a description
+of the C interface to these functions.
+
+=over 4
+
+=item add_macro_text (MACRO, TEXT [, FILENAME [, LINE]])
+
+Defines a new macro, or redefines an old one. MACRO is the name of the
+macro, and TEXT is the text it should expand to. FILENAME and LINE are
+just used to generate any warnings about the macro definition. The only
+such warning occurs when you redefine an old macro: its value is
+overridden, and C<add_macro_text()> issues a warning saying so.
+
+=item delete_macro (MACRO)
+
+Deletes a macro from the macro table. If MACRO isn't defined,
+takes no action.
+
+=item delete_all_macros ()
+
+Deletes all macros from the macro table, even the predefined month
+names.
+
+=item macro_length (MACRO)
+
+Returns the length of a macro's expansion text. If the macro is
+undefined, returns 0; no warning is issued.
+
+=item macro_text (MACRO [, FILENAME [, LINE]])
+
+Returns the expansion text of a macro. If the macro is not defined,
+issues a warning and returns C<undef>. FILENAME and LINE, if supplied,
+are used for generating this warning; they should be supplied if you're
+looking up the macro as a result of finding it in a file.
+
+=back
+
+=head2 Name-parsing functions
+
+These are both private functions for the use of the C<Name> class, and
+therefore are put in the C<Text::BibTeX::Name> package. You should use
+the interface provided by that class for parsing names in the BibTeX
+style.
+
+=over 4
+
+=item _split (NAME_STRUCT, NAME, FILENAME, LINE, NAME_NUM, KEEP_CSTRUCT)
+
+=item free (NAME_STRUCT)
+
+=back
+
+=head2 Name-formatting functions
+
+These are private functions for the use of the C<NameFormat> class, and
+therefore are put in the C<Text::BibTeX::NameFormat> package. You
+should use the interface provided by that class for formatting names in
+the BibTeX style.
+
+=over 4
+
+=item create ([PARTS [, ABBREV_FIRST]])
+
+=item free (FORMAT_STRUCT)
+
+=item _set_text (FORMAT_STRUCT, PART, PRE_PART, POST_PART, PRE_TOKEN, POST_TOKEN)
+
+=item _set_options (FORMAT_STRUCT, PART, ABBREV, JOIN_TOKENS, JOIN_PART)
+
+=item format_name (NAME_STRUCT, FORMAT_STRUCT)
+
+=back
+
+=head1 BUGS AND LIMITATIONS
+
+C<Text::BibTeX> inherits several limitations from its base C library,
+B<btparse>; see L<btparse/BUGS AND LIMITATIONS> for details. In addition,
+C<Text::BibTeX> will not work with a Perl binary built using the C<sfio>
+library. This is because Perl's I/O abstraction layer does not extend to
+third-party C libraries that use stdio, and B<btparse> most certainly does
+use stdio.
+
+=head1 SEE ALSO
+
+L<btool_faq>, L<Text::BibTeX::File>, L<Text::BibTeX::Entry>,
+L<Text::BibTeX::Value>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
+
+=cut
+
+1;
diff --git a/lib/Text/BibTeX/Bib.pm b/lib/Text/BibTeX/Bib.pm
new file mode 100644
index 0000000..4d8428d
--- /dev/null
+++ b/lib/Text/BibTeX/Bib.pm
@@ -0,0 +1,476 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/Bib.pm
+# CLASSES : Text::BibTeX::BibStructure, Text::BibTeX::BibEntry;
+# loads Text::BibTeX::BibSort and Text::BibTeX::BibFormat
+# for use by BibEntry
+# RELATIONS : BibStructure inherits from Structure
+# BibEntry inherits from BibSort and BibFormat, which
+# both inherit from StructuredEntry
+# DESCRIPTION: Implements the "Bib" structure, which provides the
+# same functionality -- though in a completely different
+# context, and much more customizably -- as the standard
+# style files of BibTeX 0.99.
+# CREATED : 1997/09/21, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+=head1 NAME
+
+Text::BibTeX::Bib - defines the "Bib" database structure
+
+=head1 SYNOPSIS
+
+ $bibfile = Text::BibTeX::File->new($filename);
+ $bibfile->set_structure ('Bib',
+ # Default option values:
+ sortby => 'name',
+ namestyle => 'full',
+ nameorder => 'first',
+ atitle_lower => 1,
+ labels => 'numeric');
+
+ # Alternate option values:
+ $bibfile->set_option (sortby => 'year');
+ $bibfile->set_option (namestyle => 'nopunct');
+ $bibfile->set_option (namestyle => 'nospace');
+ $bibfile->set_option (nameorder => 'last');
+ $bibfile->set_option (atitle_lower => 0);
+ $bibfile->set_option (labels => 'alpha'); # not implemented yet!
+
+ # parse entry from $bibfile and automatically make it a BibEntry
+ $entry = Text::BibTeX::Entry->new($bibfile);
+
+ # or get an entry from somewhere else which is hard-coded to be
+ # a BibEntry
+ $entry = Text::BibTeX::BibEntry->new(...);
+
+ $sortkey = $entry->sort_key;
+ @blocks = $entry->format;
+
+=head1 DESCRIPTION
+
+(B<NOTE!> Do not believe everything you read in this document. The
+classes described here are unfinished and only lightly tested. The
+current implementation is a proof-of-principle, to convince myself (and
+anyone who might be interested) that it really is possible to
+reimplement BibTeX 0.99 in Perl using the core C<Text::BibTeX> classes;
+this principle is vaguely demonstrated by the current C<Bib*> modules,
+but not really proved. Many important features needed to reimplement
+the standard styles of BibTeX 0.99 are missing, even though this
+document may brashly assert otherwise. If you are interested in using
+these classes, you should start by reading and grokking the code, and
+contributing the missing bits and pieces that you need.)
+
+C<Text::BibTeX::Bib> implements the database structure for
+bibliographies as defined by the standard styles of BibTeX 0.99. It
+does this by providing two classes, C<BibStructure> and C<BibEntry> (the
+leading C<Text::BibTeX> is implied, and will be omitted for the rest of
+this document). These two classes, being specific to bibliographic
+data, are outside of the core C<Text::BibTeX> class hierarchy, but are
+distributed along with it as they provide a canonical example of a
+specific database structure using classes derived from the core
+hierarchy.
+
+C<BibStructure>, which derives from the C<Structure> class, deals with
+the structure as a whole: it handles structure options and describes all
+the types and fields that make up the database structure. If you're
+interested in writing your own database structure modules, the standard
+interface for both of these is described in L<Text::BibTeX::Structure>;
+if you're just interested in finding out the exact database structure or
+the options supported by the C<Bib> structure, you've come to the right
+place. (However, you may have to wade through a bit of excess verbiage
+due to this module's dual purpose: first, to reimplement the standard
+styles of BibTeX 0.99, and second, to provide an example for other
+programmers wishing to implement new or derived database structure
+modules.)
+
+C<BibEntry> derives from the C<StructuredEntry> class and provides
+methods that operate on individual entries presumed to come from a
+database conforming to the structure defined by the C<BibStructure>
+class. (Actually, to be completely accurate, C<BibEntry> inherits from
+two intermediate classes, C<BibSort> and C<BibFormat>. These two
+classes just exist to reduce the amount of code in the C<Bib> module,
+and thanks to the magic of inheritance, their existence is usually
+irrelevant. But you might want to consult those two classes if you're
+interested in the gory details of sorting and formatting entries from
+BibTeX 0.99-style bibliography databases.)
+
+=cut
+
+
+# first, the "structure class" (inherits from Text::BibTeX::Structure)
+
+package Text::BibTeX::BibStructure;
+use strict;
+use vars qw(@ISA $VERSION);
+@ISA = qw(Text::BibTeX::Structure);
+$VERSION = '0.85';
+
+=head1 STRUCTURE OPTIONS
+
+C<BibStructure> handles several user-supplied "structure options" and
+methods for dealing with them. The options currently supported by the
+C<Bib> database structure, and the values allowed for them, are:
+
+=over 4
+
+=item C<sortby>
+
+How to sort entries. Valid values: C<name> (sort on author names, year,
+and title), C<year> (sort on year, author names, and title). Sorting on
+"author names" is a bit more complicated than just using the C<author>
+field; see L<Text::BibTeX::BibSort> for details. Default value: C<name>.
+
+=item C<namestyle>
+
+How to print author (and editor) names: C<full> for unabbreviated first
+names, C<abbrev> for first names abbreviated with periods, C<nopunct>
+for first names abbreviated with space but no periods, and C<nospace> to
+abbreviate without space or periods. Default value: C<full>.
+
+=item C<nameorder>
+
+The order in which to print names: C<first> for "first von last jr"
+order, and C<last> for "von last jr first" order. Default value:
+C<first>.
+
+=item C<atitle_lower>
+
+A boolean option: if true, non-book titles will be changed to "sentence
+capitalization": words following colons and sentence-ending punctuation
+will be capitalized, and everything else at brace-depth zero will be
+changed to lowercase. Default value: true.
+
+=item C<labels>
+
+The type of bibliographic labels to generate: C<numeric> or C<alpha>.
+(Alphabetic labels are not yet implemented, so this option is currently
+ignored.) Default value: C<numeric>.
+
+=back
+
+Also, several "markup options" are supported. Markup options are
+distinct because they don't change how text is extracted from the
+database entries and subsequently mangled; rather, they supply bits of
+markup that go around the database-derived text. Markup options are
+always two-element lists: the first to "turn on" some feature of the
+markup language, and the second to turn it off. For example, if your
+target language is LaTeX2e and you want journal names emphasized, you
+would supply a list reference C<['\emph{','}']> for the C<journal_mkup>
+option. If you were instead generating HTML, you might supply
+C<['E<lt>emE<gt>','E<lt>/emE<gt>']>. To keep the structure module
+general with respect to markup languages, all markup options are empty
+by default. (Or, rather, they are all references to lists consisting of
+two empty strings.)
+
+=over 4
+
+=item C<name_mkup>
+
+Markup to add around the list of author names.
+
+=item C<atitle_mkup>
+
+Markup to add around non-book (article) titles.
+
+=item C<btitle_mkup>
+
+Markup to add around book titles.
+
+=item C<journal_mkup>
+
+Markup to add around journal names.
+
+=back
+
+=cut
+
+# Default value of every option understood by the Bib structure. The
+# keys of this hash are also what known_option and default_option
+# consult to decide whether an option is supported at all.
+my %default_options =
+ (sortby => 'name', # or 'year', 'none'
+ namestyle => 'full', # or 'abbrev', 'nopunct', 'nospace'
+ nameorder => 'first', # or 'last'
+ atitle_lower=> 1, # mangle case of non-book titles?
+ labels => 'numeric', # or 'alpha' (not yet supported!)
+ # markup options: [turn-on, turn-off] pairs, empty by default
+ name_mkup => ['', ''],
+ atitle_mkup => ['', ''],
+ btitle_mkup => ['', ''],
+ journal_mkup=> ['', ''],
+ );
+
+
+=head2 Option methods
+
+As required by the C<Text::BibTeX::Structure> module,
+C<Text::BibTeX::Bib> provides two methods for handling options:
+C<known_option> and C<default_option>. (The other two option methods,
+C<set_options> and C<get_options>, are just inherited from
+C<Text::BibTeX::Structure>.)
+
+=over 4
+
+=item known_option (OPTION)
+
+Returns true if OPTION is one of the options on the above list.
+
+=item default_option (OPTION)
+
+Returns the default value of OPTION, or C<croak>s if OPTION is not a
+supported option.
+
+=back
+
+=cut
+
+# known_option (OPTION)
+#   Tells whether OPTION is supported by the Bib structure, i.e. whether
+#   it has an entry in %default_options.
+sub known_option
+{
+   my $self = shift;
+   my $name = shift;
+
+   # an option is "known" precisely when a default is recorded for it
+   return exists $default_options{$name};
+}
+
+
+# default_option (OPTION)
+#   Returns the Bib structure's default for OPTION. Options without an
+#   entry in %default_options are handed to the parent class's
+#   default_option method.
+sub default_option
+{
+   my ($self, $name) = @_;
+
+   return $default_options{$name}
+      if exists $default_options{$name};
+
+   # not one of ours: let the superclass decide what to do
+   return $self->SUPER::default_option ($name);
+}
+
+
+# The field lists in the following documentation are automatically
+# generated by my `doc_structure' program -- I run it and read the
+# output into this file. Wouldn't it be cool if the module could just
+# document itself? Ah well, dreaming again...
+
+=head1 DATABASE STRUCTURE
+
+The other purpose of a structure class is to provide a method,
+C<describe_entry>, that lists the allowed entry types and the known
+fields for the structure. Programmers wishing to write their own
+database structure module should consult L<Text::BibTeX::Structure> for
+the conventions and requirements of this method; the purpose of the
+present document is to describe the C<Bib> database structure.
+
+The allowed entry types, and the fields recognized for them, are:
+
+=over 4
+
+=item C<article>
+
+Required fields: C<author>, C<title>, C<journal>, C<year>.
+Optional fields: C<volume>, C<number>, C<pages>, C<month>, C<note>.
+
+=item C<book>
+
+Required fields: C<title>, C<publisher>, C<year>.
+Optional fields: C<series>, C<address>, C<edition>, C<month>, C<note>.
+Constrained fields: exactly one of C<author>, C<editor>; at most one of C<volume>, C<number>.
+
+=item C<booklet>
+
+Required fields: C<title>.
+Optional fields: C<author>, C<howpublished>, C<address>, C<month>, C<year>, C<note>.
+
+=item C<inbook>
+
+Required fields: C<publisher>, C<year>.
+Optional fields: C<series>, C<type>, C<address>, C<edition>, C<month>, C<note>.
+Constrained fields: exactly one of C<author>, C<editor>; at least one of C<chapter>, C<pages>; at most one of C<volume>, C<number>.
+
+=item C<incollection>
+
+Required fields: C<author>, C<title>, C<booktitle>, C<publisher>, C<year>.
+Optional fields: C<editor>, C<series>, C<type>, C<chapter>, C<pages>, C<address>, C<edition>, C<month>, C<note>.
+Constrained fields: at most one of C<volume>, C<number>.
+
+=item C<inproceedings>
+
+=item C<conference>
+
+Required fields: C<author>, C<title>, C<booktitle>, C<year>.
+Optional fields: C<editor>, C<series>, C<pages>, C<address>, C<month>, C<organization>, C<publisher>, C<note>.
+Constrained fields: at most one of C<volume>, C<number>.
+
+=item C<manual>
+
+Required fields: C<title>.
+Optional fields: C<author>, C<organization>, C<address>, C<edition>, C<month>, C<year>, C<note>.
+
+=item C<mastersthesis>
+
+Required fields: C<author>, C<title>, C<school>, C<year>.
+Optional fields: C<type>, C<address>, C<month>, C<note>.
+
+=item C<misc>
+
+Required fields: none.
+Optional fields: C<author>, C<title>, C<howpublished>, C<month>, C<year>, C<note>.
+
+=item C<phdthesis>
+
+Required fields: C<author>, C<title>, C<school>, C<year>.
+Optional fields: C<type>, C<address>, C<month>, C<note>.
+
+=item C<proceedings>
+
+Required fields: C<title>, C<year>.
+Optional fields: C<editor>, C<series>, C<address>, C<month>, C<organization>, C<publisher>, C<note>.
+Constrained fields: at most one of C<volume>, C<number>.
+
+=item C<techreport>
+
+Required fields: C<author>, C<title>, C<institution>, C<year>.
+Optional fields: C<type>, C<number>, C<address>, C<month>, C<note>.
+
+=item C<unpublished>
+
+Required fields: C<author>, C<title>, C<note>.
+Optional fields: C<month>, C<year>.
+
+=back
+
+=cut
+
+# describe_entry ()
+#   Declares, via one set_fields call per entry type, the fields of the
+#   Bib structure. Each call passes the entry type, a list of required
+#   fields, a list of optional fields, and then zero or more constraints
+#   of the form [MIN, MAX, [FIELDS]]. Judging by the POD above, a
+#   constraint means "between MIN and MAX of FIELDS must be present"
+#   (e.g. [1, 1, ...] is documented as "exactly one of").
+sub describe_entry
+{
+ my $self = shift;
+
+ # Advantages of the current scheme (set all fields for a particular
+ # entry type together):
+ # - groups fields more naturally (by entry type)
+ # - might lend itself to structuring things by 'type' in the object
+ # as well, making it easier to determine if a type is valid
+ # - prevents accidentally giving a type optional fields but no
+ # required fields -- currently this mistake would make the type
+ # 'unknown'
+ #
+ # Requirement of any scheme:
+ # - must be easy for derived classes to override/augment the field
+ # lists defined here! (ie. they should be able just to inherit
+ # describe_entry; or explicitly call SUPER::describe_entry and then
+ # undo/change some of its definitions)
+
+ # Things that I don't think are handled by this scheme, but that
+ # bibtex does look out for:
+ # * warns if month but no year
+ # * crossref stuff:
+ # - article can xref article; xref'd entry must have key or journal
+ # - book or inbook can xref book; xref'd entry must have editor,
+ # key, or series
+ # - incollection can xref a book and inproceedings can xref a
+ # proceedings; xref'd entry must have editor, key, or booktitle
+
+ $self->set_fields ('article',
+ [qw(author title journal year)],
+ [qw(volume number pages month note)]);
+ $self->set_fields ('book',
+ [qw(title publisher year)],
+ [qw(series address edition month note)],
+ [1, 1, [qw(author editor)]],
+ [0, 1, [qw(volume number)]]);
+ $self->set_fields ('booklet',
+ [qw(title)],
+ [qw(author howpublished address month year note)]);
+ $self->set_fields ('inbook',
+ [qw(publisher year)],
+ [qw(series type address edition month note)],
+ [1, 1, [qw(author editor)]],
+ [1, 2, [qw(chapter pages)]],
+ [0, 1, [qw(volume number)]]);
+ $self->set_fields ('incollection',
+ [qw(author title booktitle publisher year)],
+ [qw(editor series type chapter pages address
+ edition month note)],
+ [0, 1, [qw(volume number)]]);
+ # 'inproceedings' and 'conference' are given identical field lists
+ $self->set_fields ('inproceedings',
+ [qw(author title booktitle year)],
+ [qw(editor series pages address month
+ organization publisher note)],
+ [0, 1, [qw(volume number)]]);
+ $self->set_fields ('conference',
+ [qw(author title booktitle year)],
+ [qw(editor series pages address month
+ organization publisher note)],
+ [0, 1, [qw(volume number)]]);
+ $self->set_fields ('manual',
+ [qw(title)],
+ [qw(author organization address edition
+ month year note)]);
+ $self->set_fields ('mastersthesis',
+ [qw(author title school year)],
+ [qw(type address month note)]);
+ # 'misc' has no required fields at all
+ $self->set_fields ('misc',
+ [],
+ [qw(author title howpublished month year note)]);
+ $self->set_fields ('phdthesis',
+ [qw(author title school year)],
+ [qw(type address month note)]);
+ $self->set_fields ('proceedings',
+ [qw(title year)],
+ [qw(editor series address month
+ organization publisher note)],
+ [0, 1, [qw(volume number)]]);
+ $self->set_fields ('techreport',
+ [qw(author title institution year)],
+ [qw(type number address month note)]);
+ $self->set_fields ('unpublished',
+ [qw(author title note)],
+ [qw(month year)]);
+
+} # describe_entry
+
+
+=head1 STRUCTURED ENTRY CLASS
+
+The second class provided by the C<Text::BibTeX::Bib> module is
+C<BibEntry> (again, a leading C<Text::BibTeX> is implied). This being a
+structured entry class, it derives from C<StructuredEntry>. The
+conventions and requirements for such a class are documented in
+L<Text::BibTeX::Structure> for the benefit of programmers implementing
+their own structure modules.
+
+If you wish to write utilities making use of the C<Bib> database
+structure, then you should call one of the "officially supported"
+methods provided by the C<BibEntry> class. Currently, there are only
+two of these: C<sort_key> and C<format>. These are actually implemented
+in the C<BibSort> and C<BibFormat> classes, respectively, which are base
+classes of C<BibEntry>. Thus, see L<Text::BibTeX::BibSort> and
+L<Text::BibTeX::BibFormat> for details on these two methods.
+
+=cut
+
+package Text::BibTeX::BibEntry;
+use strict;
+use vars qw(@ISA $VERSION);
+
+$VERSION = '0.85';
+
+use Text::BibTeX::BibSort;
+use Text::BibTeX::BibFormat;
+
+@ISA = qw(Text::BibTeX::BibSort Text::BibTeX::BibFormat);
+
+
+1;
+
+=head1 SEE ALSO
+
+L<Text::BibTeX::Structure>, L<Text::BibTeX::BibSort>,
+L<Text::BibTeX::BibFormat>.
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/BibFormat.pm b/lib/Text/BibTeX/BibFormat.pm
new file mode 100644
index 0000000..ddea761
--- /dev/null
+++ b/lib/Text/BibTeX/BibFormat.pm
@@ -0,0 +1,500 @@
+# ----------------------------------------------------------------------
+# NAME : BibFormat.pm
+# CLASSES : Text::BibTeX::BibFormat
+# RELATIONS : sub-class of Text::BibTeX::StructuredEntry
+# super-class of Text::BibTeX::BibEntry
+# DESCRIPTION: Provides methods for final output formatting of
+# bibliographic entries.
+# CREATED : 1997/11/24, GPW
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997 by Gregory P. Ward. All rights reserved.
+#
+# This file is part of the Text::BibTeX library. This is free
+# software; you can redistribute it and/or modify it under the
+# same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::BibFormat;
+
+use Carp;
+use strict;
+use vars qw(@ISA $VERSION);
+
+use Text::BibTeX::Name;
+use Text::BibTeX::NameFormat;
+use Text::BibTeX::Structure;
+
+@ISA = qw(Text::BibTeX::StructuredEntry);
+$VERSION = 0.85;
+
+use Text::BibTeX qw(:subs display_list :nameparts :joinmethods);
+
+=head1 NAME
+
+Text::BibTeX::BibFormat - formats bibliography entries
+
+=head1 SYNOPSIS
+
+ # Assuming $entry comes from a database of the 'Bib' structure
+ # (i.e., that it's blessed into the BibEntry class, which inherits
+ # the format method from BibFormat):
+ @blocks = $entry->format;
+
+=head1 DESCRIPTION
+
+The C<Text::BibTeX::BibFormat> class is a base class of
+C<Text::BibTeX::BibEntry> for formatting bibliography entries. It thus
+performs the main job of any program that would hope to supplant BibTeX
+itself; the other important job (sorting) is handled by its companion
+class, C<Text::BibTeX::BibSort>.
+
+C<BibFormat> (the C<Text::BibTeX> prefix will be dropped for brevity)
+pays attention to almost all of the structure options described in
+L<Text::BibTeX::Bib>; it only ignores those that cover sorting,
+currently just C<sortby>. In particular, all of the "markup" options
+control what language is generated by C<BibFormat>; if none of those
+options are set, then it will generate plain, unmarked text.
+
+The only method in C<BibFormat>'s documented interface (so far) is
+C<format>. (The class defines many other methods, but these should not
+be necessary to outsiders, so they are undocumented and subject to
+change.)
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
+# ----------------------------------------------------------------------
+# Ordinary subroutines:
+
+# connect_words (S1, S2)
+#   Joins two strings into one. When the second string is shorter than
+#   three characters the two are glued with a tilde instead of a space
+#   (e.g. "page~5" but "pages 10--20").
+sub connect_words
+{
+   my ($left, $right) = @_;
+
+   my $glue = ' ';
+   $glue = '~' if length ($right) < 3;
+
+   return $left . $glue . $right;
+}
+
+
+# ----------------------------------------------------------------------
+# Utility methods (eg. apply a bit of markup to a string or field)
+
+# markup_field (MARKUP, FIELD)
+#   Returns the value of FIELD wrapped in a pair of markup strings, or
+#   the empty string if the entry has no such field.
+#
+#   MARKUP is either a reference to a two-element list [ON, OFF], used
+#   as is, or the stem of a markup option name: 'btitle' selects the
+#   structure's 'btitle_mkup' option.
+#
+#   Croaks if the named markup option comes back undefined.
+sub markup_field
+{
+   my ($self, $markup, $field) = @_;
+
+   unless (ref $markup eq 'ARRAY' && @$markup == 2)
+   {
+      # Remember the option name *before* $markup is overwritten, so the
+      # error message below can still say which option was missing.
+      my $option = "${markup}_mkup";
+      $markup = $self->structure->get_options ($option);
+      croak "$option option not defined"
+         unless defined $markup;
+   }
+
+   return $self->exists ($field)
+      ? $markup->[0] . $self->get ($field) . $markup->[1]
+      : '';
+}
+
+
+# markup_string (MARKUP, STRING)
+#   Returns STRING wrapped in a pair of markup strings.
+#
+#   MARKUP is either a reference to a two-element list [ON, OFF], used
+#   as is, or the stem of a markup option name: 'name' selects the
+#   structure's 'name_mkup' option.
+#
+#   Croaks if the named markup option comes back undefined.
+sub markup_string
+{
+   my ($self, $markup, $string) = @_;
+
+   unless (ref $markup eq 'ARRAY' && @$markup == 2)
+   {
+      # Remember the option name *before* $markup is overwritten, so the
+      # error message below can still say which option was missing.
+      my $option = "${markup}_mkup";
+      $markup = $self->structure->get_options ($option);
+      croak "$option option not defined"
+         unless defined $markup;
+   }
+
+   return $markup->[0] . $string . $markup->[1];
+}
+
+
+# ----------------------------------------------------------------------
+# Formatting methods I: utility methods called by the entry-formatters
+
+# format_authors ()
+#   Returns the entry's 'author' names run through format_names, or the
+#   empty string when the entry has no 'author' field.
+sub format_authors
+{
+   my ($self) = @_;
+
+   if ($self->exists ('author'))
+   {
+      my @people = $self->names ('author');
+      return $self->format_names (\@people);
+   }
+
+   return '';
+}
+
+
+# format_editors ()
+#   Returns the entry's 'editor' names run through format_names with
+#   ", editor" (one name) or ", editors" (otherwise) appended, or the
+#   empty string when the entry has no 'editor' field.
+sub format_editors
+{
+   my ($self) = @_;
+
+   if (! $self->exists ('editor'))
+   {
+      return '';
+   }
+
+   # The word used to indicate editorship should be customizable --
+   # might want it in another language, or abbreviated, or both.
+   my @people = $self->names ('editor');
+   my $role = ', editors';
+   $role = ', editor' if @people == 1;
+
+   return $self->format_names (\@people) . $role;
+}
+
+
+# format_names (NAMES)
+#   Formats a list of names according to the structure's 'nameorder' and
+#   'namestyle' options and returns them as a single string, wrapped in
+#   the 'name_mkup' markup.
+#
+#   NAMES is a reference to a list of objects supporting a format method
+#   (the callers in this file pass the result of $self->names). The list
+#   is not modified. A name that formats to the bare word "others" is
+#   rendered as "et al.".
+#
+#   Croaks if either option has a value this method does not recognize.
+sub format_names
+{
+   my ($self, $names) = @_;
+
+   my ($order, $style) =
+      $self->structure->get_options ('nameorder', 'namestyle');
+   croak "format_names: bad nameorder option \"$order\""
+      unless $order eq 'first' || $order eq 'last';
+   croak "format_names: bad namestyle option \"$style\""
+      unless $style =~ /^(full|abbrev|nopunct|nospace)$/;
+
+   # 'fvlj' = first von last jr; 'vljf' = von last jr first. Every
+   # style except 'full' asks for abbreviated first names.
+   my $parts  = ($order eq 'first') ? 'fvlj' : 'vljf';
+   my $format = Text::BibTeX::NameFormat->new ($parts, ! ($style eq 'full'));
+
+   # 'nopunct' and 'nospace' pass an empty string as the last text
+   # argument for the first-name part; 'nospace' additionally changes
+   # how the tokens of the first name are joined.
+   $format->set_text (&BTN_FIRST, undef, undef, undef, '')
+      if $style eq 'nopunct' || $style eq 'nospace';
+   $format->set_options (&BTN_FIRST, 1, &BTJ_NOTHING, &BTJ_SPACE)
+      if $style eq 'nospace';
+
+   # Build a fresh list of strings rather than assigning through the
+   # foreach alias, which would overwrite the caller's name objects.
+   my @formatted = map {
+      my $text = $_->format ($format);
+      $text eq 'others' ? 'et al.' : $text;
+   } @$names;
+
+   return $self->markup_string ('name', display_list (\@formatted, 0));
+}
+
+
+# format_atitle ()
+#   Returns the entry's 'title' formatted as an article (non-book)
+#   title: case-mangled with change_case ('t', ...) when the
+#   'atitle_lower' option is true, then wrapped in 'atitle_mkup' markup.
+sub format_atitle
+{
+   my ($self) = @_;
+
+   my $sentence_case = $self->structure->get_options ('atitle_lower');
+   my $text = $self->get ('title');
+   if ($sentence_case)
+   {
+      $text = change_case ('t', $text);
+   }
+
+   return $self->markup_string ('atitle', $text);
+}
+
+
+# format_btitle ()
+#   Returns the entry's 'title' wrapped in 'btitle_mkup' markup. All the
+#   work, including the empty-string result for a missing title, is
+#   done by markup_field.
+sub format_btitle
+{
+   my ($self) = @_;
+
+   return $self->markup_field ('btitle', 'title');
+}
+
+
+# sub format_xref_article
+# {
+# my $self = shift;
+
+# # N.B. this assumes that the appropriate fields from the cross-
+# # referenced entry have already been put into the current entry!
+
+# # XXX hard-coded LaTeX markup here!!!
+
+# my ($key, $journal, $crossref);
+# $key = $self->get ('key');
+# $journal = $self->get ('journal');
+# $crossref = $self->get ('crossref');
+# if (defined $key)
+# {
+# return "In $key \cite{$crossref}";
+# }
+# else
+# {
+# if (defined $journal)
+# {
+# return "In {\em $journal} \cite{$crossref}";
+# }
+# else
+# {
+# $self->warn ("need key or journal for crossref");
+# return " \cite{$crossref}";
+# }
+# }
+# }
+
+
+# format_pages ()
+#   Returns the entry's 'pages' field prefixed with "page" or "pages",
+#   or the empty string when the entry has no 'pages' field.
+#
+#   A value containing a comma, plus sign or hyphen is taken to mean
+#   several pages; in that case every lone hyphen standing between two
+#   non-hyphen characters is doubled ("122-130" becomes "122--130"),
+#   while existing "--" runs are left alone.
+sub format_pages
+{
+   my $self = shift;
+
+   return '' unless $self->exists ('pages');
+   my $pages = $self->get ('pages');
+
+   return connect_words ("page", $pages)
+      unless $pages =~ /[,+-]/;         # multiple pages?
+
+   # Lookaround keeps the neighbouring characters out of the match, so
+   # chained ranges such as "1-2-3" have *every* hyphen doubled; a
+   # consuming pattern would skip each second one.
+   $pages =~ s/(?<=[^-])-(?=[^-])/--/g;
+   return connect_words ("pages", $pages);
+}
+
+
+# format_vol_num_pages ()
+#   Returns the journal-style "volume(number):pages" string, e.g.
+#   "4(5):122--130". Each of the three pieces is included only if the
+#   corresponding field is defined; with none of them the result is the
+#   empty string.
+sub format_vol_num_pages
+{
+   my ($self) = @_;
+
+   my ($volume, $number, $pages) =
+      $self->get ('volume', 'number', 'pages');
+
+   my @pieces;
+   push @pieces, $volume     if defined $volume;
+   push @pieces, "($number)" if defined $number;
+   push @pieces, ":$pages"   if defined $pages;
+
+   return join ('', @pieces);
+}
+
+
+# format_bvolume ()
+#   Returns "volume N", followed by " of SERIES" (series wrapped in
+#   'btitle_mkup' markup) when the entry has a 'series' field. Returns
+#   the empty string when the entry has no 'volume' field.
+sub format_bvolume
+{
+   my ($self) = @_;
+
+   return '' unless $self->exists ('volume');
+
+   # potentially volume and series
+   my $text = connect_words ('volume', $self->get ('volume'));
+   if ($self->exists ('series'))
+   {
+      $text .= ' of ' . $self->markup_field ('btitle', 'series');
+   }
+
+   return $text;
+}
+
+
+# format_number_series (MID_SENTENCE)
+#   Returns the number/series information for entries without a
+#   'volume' field:
+#     - 'volume' present: empty string (format_bvolume covers that case)
+#     - no 'number':      whatever get ('series') returns
+#     - 'number' present: "number N" ("Number N" unless MID_SENTENCE is
+#                         true), plus " in SERIES" if there is a series;
+#                         a warning is issued if there is none
+sub format_number_series
+{
+   my ($self, $mid_sentence) = @_;
+
+   # if 'volume' field exists, then format_bvolume took care of
+   # formatting it, so don't do anything here
+   return '' if $self->exists ('volume');
+
+   # No 'number' -- just return the 'series' (or undef if none)
+   return $self->get ('series') unless $self->exists ('number');
+
+   my $label = $mid_sentence ? 'number' : 'Number';
+   my $text  = connect_words ($label, $self->get ('number'));
+
+   if ($self->exists ('series'))
+   {
+      $text .= ' in ' . $self->get ('series');
+   }
+   else
+   {
+      $self->warn ("there's a number but no series " .
+                   "(is this warning redundant?!?)");
+   }
+
+   return $text;
+} # format_number_series
+
+
+# format_edition (MID_SENTENCE)
+#   Returns the 'edition' field followed by " edition", passed through
+#   change_case with transform 'l' when MID_SENTENCE is true and 't'
+#   otherwise. Returns the empty string when there is no 'edition' field.
+sub format_edition
+{
+   my ($self, $mid_sentence) = @_;
+
+   # XXX more fodder for I18N here: the word 'edition'
+   if ($self->exists ('edition'))
+   {
+      my $transform = 't';
+      $transform = 'l' if $mid_sentence;
+      return change_case ($transform, $self->get ('edition')) . ' edition';
+   }
+
+   return '';
+} # format_edition
+
+
+# format_date ()
+#   Returns "MONTH YEAR", leaving out whichever of the two fields is
+#   missing (or otherwise false); the empty string if both are.
+sub format_date
+{
+   my ($self) = @_;
+
+   return join ' ', grep { $_ } $self->get ('month', 'year');
+}
+
+
+# ----------------------------------------------------------------------
+# The actual entry-formatting methods:
+# format_article
+# format_book
+# format_inbook
+# ...and so on.
+
+# Each of these returns a list of blocks.
+# A block is a list of sentences.
+# A sentence is either a string or a list of clauses.
+# Any clause, sentence, or block in any list may be empty or undefined;
+# it should be removed before output.
+# If a sentence consists of a list of clauses, they should be joined
+# together with ", " to form the sentence-as-string.
+#
+# For example, the formatted entry for an article (in the absence of
+# cross-references) consists of four blocks:
+# - the name block, which has a single sentence; this sentence
+# has a single clause (the list of author names), and thus is
+# represented as a string like "Joe Blow, Fred Jones, and John Smith"
+# - the title block, which has a single sentence; this sentence
+# has a single clause, the title of the article, eg. "The mating
+# habits of foobars"
+# - the journal block, which consists of a single sentence that has
+# three clauses: the journal name, the volume/number/pages, and
+# the date. When the three clauses are joined, we get something like
+# "Journal of Foo, 4(5):122--130, May 1996" for the single sentence
+# in the block.
+# - the note block -- if the entry has no `note' field, this block
+# will be an undefined value rather than a list of sentences
+#
+# These four blocks are returned from `format_article' (and thus from
+# `format') as a list-of-lists-of-(strings or lists-of-strings). That
+# is, each format method returns a list of blocks, each of which is in
+# turn a list of sentences. (Hence "list of lists of X".) Each
+# sentence is either a string ("list of lists of strings") or a list of
+# clauses ("list of lists of lists of strings"). Clear? Hope so!
+#
+# [ # enter list of blocks
+# ["Joe Blow, Fred Jones, and John Smith"] # name block:
+# # 1 sentence w/ 1 clause
+# ["The mating habits of foobars"] # title block:
+# # 1 sentence w/ 1 clause
+# [["Journal of Foo", # journal block:
+# "4(5):122--130", # 1 sentence w/ 3 clauses
+# "May 1996"]]
+# undef
+# ]
+#
+# A note: the journal name will normally have a bit of markup around it,
+# say to italicize it -- that's determined by the calling application,
+# though; the default markups are all empty strings. There could
+# probably be arbitrary markup for every element of an entry, but I
+# haven't gone that far yet.
+#
+# It is then the responsibility of the calling application to apply the
+# appropriate punctuation and munge all those lists of strings together
+# into something worth printing. The canonical application for doing
+# this is btformat, which supports LaTeX 2.09, LaTeX2e, and HTML markup
+# and output.
+
+
+# format_article ()
+#   Returns the four blocks of a formatted article entry:
+#     1. name block:    one sentence, the formatted author list
+#     2. title block:   one sentence, the formatted article title
+#     3. journal block: one sentence of three clauses -- journal name
+#                       (in 'journal_mkup' markup), volume/number/pages,
+#                       and date
+#     4. the 'note' field as returned by get (not wrapped in a list)
+#   Cross-references ('crossref' field) are not handled here.
+sub format_article
+{
+   my ($self) = @_;
+
+   my @authors = ($self->format_authors);
+   my @title   = ($self->format_atitle);
+   my @journal_clauses =
+      ($self->markup_string ('journal', $self->get ('journal')),
+       $self->format_vol_num_pages,
+       $self->format_date);
+
+   return (\@authors, \@title, [\@journal_clauses], $self->get ('note'));
+} # format_article
+
+
+# format_book ()
+#   Returns the four blocks of a formatted book entry:
+#     1. name block:  the authors if there is an 'author' field,
+#                     otherwise the editors
+#     2. title block: one sentence of two clauses -- book title and
+#                     volume
+#     3. "from" block: the number/series sentence, then one sentence of
+#                     four clauses -- publisher, address, edition, date
+#     4. the 'note' field as returned by get (not wrapped in a list)
+sub format_book
+{
+   my ($self) = @_;
+
+   # author(s) or editor(s)
+   my @people = $self->exists ('author')
+      ? $self->format_authors
+      : $self->format_editors;
+
+   # title (and volume)
+   my @title_clauses = ($self->format_btitle, $self->format_bvolume);
+
+   # number/series first, then publisher, address, edition, date
+   my @from = ($self->format_number_series (0));
+   push @from, [$self->get ('publisher'), $self->get ('address'),
+                $self->format_edition (0), $self->format_date];
+
+   return (\@people, [\@title_clauses], \@from, $self->get ('note'));
+
+} # format_book
+
+
+# ----------------------------------------------------------------------
+# Finally, the `format' method -- just calls one of the
+# type-specific format methods (format_article, etc.)
+
+=item format ()
+
+Formats a single entry for inclusion in the bibliography of some
+document. The exact processing performed is highly dependent on the
+entry type and the fields present; in general, you should be able to
+join C<format>'s outputs together to create a single paragraph for
+inclusion in a document of whatever markup language you're working with.
+
+Returns a list of "blocks," which can either be jammed together like
+sentences (for a traditional "tight" bibliography) or printed on
+separate lines (for an "open" bibliography format). Each block is a
+reference to a list of sentences; sentences should be joined together
+with an intervening period. Each sentence is either a single string or
+a list of clauses; clauses should be joined together with an intervening
+comma. Each clause is just a simple string.
+
+See the source code for C<btformat> for an example of how to use the
+output of C<format>.
+
+=cut
+
+# format ()
+#   Dispatches to the type-specific formatting method: for an entry of
+#   type 'article' that is format_article, and so on. Returns whatever
+#   that method returns (a list of blocks). If the object has no method
+#   for the entry's type, a warning is issued and nothing is returned
+#   (empty list, or undef in scalar context).
+sub format
+{
+   my $self = shift;
+
+   my $type        = $self->type;
+   my $method_name = 'format_' . $type;
+   my $method      = $self->can ($method_name);
+   unless ($method)
+   {
+      $self->warn ("can't format entry: " .
+                   "no method $method_name (for type $type)");
+      return;
+   }
+
+   # $method is the code reference found by can(); invoking it through
+   # the object passes $self as the first argument.
+   return $self->$method ();
+}
+
+1;
+
+=back
+
+=head1 SEE ALSO
+
+L<Text::BibTeX::Structure>, L<Text::BibTeX::Bib>,
+L<Text::BibTeX::BibSort>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/BibSort.pm b/lib/Text/BibTeX/BibSort.pm
new file mode 100644
index 0000000..2cc688b
--- /dev/null
+++ b/lib/Text/BibTeX/BibSort.pm
@@ -0,0 +1,245 @@
+# ----------------------------------------------------------------------
+# NAME : BibSort.pm
+# CLASSES : Text::BibTeX::BibSort
+# RELATIONS : sub-class of StructuredEntry
+# super-class of BibEntry
+# DESCRIPTION: Provides methods for generating sort keys of entries
+# in a BibTeX-style bibliographic database.
+# CREATED : 1997/11/24, GPW (taken from Bib.pm)
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This is free
+# software; you can redistribute it and/or modify it under the
+# same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::BibSort;
+use strict;
+use vars qw(@ISA $VERSION);
+
+use Text::BibTeX::Structure;
+
+@ISA = qw(Text::BibTeX::StructuredEntry);
+$VERSION = 0.85;
+
+use Text::BibTeX qw(purify_string change_case);
+
+use Carp;
+
+=head1 NAME
+
+Text::BibTeX::BibSort - generate sort keys for bibliographic entries
+
+=head1 SYNOPSIS
+
+ # Assuming $entry comes from a database of the 'Bib' structure
+ # (i.e., that it's blessed into the BibEntry class, which inherits
+ # the sort_key method from BibSort):
+ $sort_key = $entry->sort_key;
+
+=head1 DESCRIPTION
+
+C<Text::BibTeX::BibSort> is a base class of C<Text::BibTeX::BibEntry>
+for generating sort keys from bibliography entries. It could in
+principle (and, someday, might) offer a wide range of highly
+customizable sort-key generators. Currently, though, it provides only a
+single method (C<sort_key>) for public use, and that method only pays
+attention to one structure option, C<sortby>.
+
+=head1 METHODS
+
+=over 4
+
+=item sort_key ()
+
+Generates a sort key for a single bibliographic entry. Assumes this
+entry conforms to the C<Bib> database structure. The nature of this
+sort key is controlled by the C<sortby> option, which can be either
+C<"name"> or C<"year">. (The C<namestyle> option also plays a role: it
+determines how author/editor names are formatted for inclusion in the
+sort key.)
+
+For by-name sorting (which is how BibTeX's standard styles work), the sort
+key consists of one of the C<author>, C<editor>, C<organization>, or C<key>
+fields (depending on the entry type and which fields are actually present),
+followed by the year and the title. All fields are drastically simplified
+to produce the sort key: non-English letters are mercilessly anglicized,
+non-alphabetic characters are stripped, and everything is forced to
+lowercase. (The first two steps are done by the C<purify_string> routine;
+see L<Text::BibTeX/"Generic string-processing functions"> for a brief
+description, and the description of the C function C<bt_purify_string()> in
+L<bt_misc> for all the gory details.)
+
+=cut
+
+# methods for sorting -- everything here is geared towards generating
+# a sort key; it's up to external code to actually order entries (since,
+# of course, a single entry doesn't know anything about any other
+# entries!)
+
+# also, we assume that an entry has been checked and coerced into
+# shape -- that way we don't need to check for defined-ness of
+# strings, or check the type, or anything.
+
+sub sort_key
+{
+   # Builds the sort key from three lower-cased, purified pieces: a
+   # "name" part (author, editor, organization or key field, depending
+   # on the entry type), the year, and the title.  The `sortby' option
+   # decides whether the name part or the year leads.
+   my $entry = shift;
+
+   my $sortby = $entry->structure->get_options ('sortby');
+   croak ("BibSort::sort_key: sortby option is 'none'")
+      if $sortby eq 'none';
+   croak ("BibSort::sort_key: unknown sortby option '$sortby'")
+      unless $sortby eq 'name' || $sortby eq 'year';
+
+   # Candidate (field => formatting method) pairs for the name part.
+   my @author = ('author'       => 'sort_format_names');
+   my @editor = ('editor'       => 'sort_format_names');
+   my @org    = ('organization' => 'sort_format_org');
+
+   # Per entry type, the candidates in order of preference; any type
+   # not listed here only looks at `author'.
+   my %candidates_for =
+      ('book'        => [@author, @editor],
+       'inbook'      => [@author, @editor],
+       'proceedings' => [@editor, @org],
+       'manual'      => [@author, @org]);
+
+   my $entry_type = $entry->type;
+   my $candidates = $candidates_for{$entry_type} || [@author];
+
+   # The `key' field is always the last resort.
+   my $name_part =
+      $entry->format_alt_fields (@$candidates, 'key' => 'sortify');
+   my $year_part =
+      change_case ('l', (purify_string ($entry->get ('year'))));
+
+   my $title_part = $entry->sort_format_title ('title');
+   # `sortby' picks which of the name part and the year comes first.
+   my @ordered = ($sortby eq 'name')
+      ? ($name_part, $year_part)
+      : ($year_part, $name_part);
+
+   return join (' ', @ordered, $title_part);
+
+}                                       # sort_key
+
+
+sub sortify
+{  # Purified, lower-cased value of FIELD.
+   my ($entry, $field_name) = @_;
+   return lc (purify_string ($entry->get ($field_name)));
+}
+
+
+sub sort_format_names
+{
+   # Name part of a sort key, built from the list of names in FIELD.
+   # Each name is formatted with part order "vljf" (the abbreviation
+   # flag given to NameFormat is set unless the `namestyle' option is
+   # `full'), then purified and lower-cased; the results are joined
+   # with single spaces.
+   require Text::BibTeX::Name;
+   require Text::BibTeX::NameFormat;
+
+   my ($self, $field) = @_;
+   my $abbrev = ! ($self->structure->get_options ('namestyle') eq 'full');
+   my $format = Text::BibTeX::NameFormat->new ("vljf", $abbrev);
+   my $name = Text::BibTeX::Name->new;
+
+   my @snames = $self->split ($field);
+   for my $i (0 .. $#snames)
+   {
+      my $sname = $snames[$i];
+      if ($sname eq 'others')           # hmmm... should we only do this
+      {                                 # on the final name?
+         # Write the replacement back into the list: assigning it only
+         # to the temporary left the literal "others" in the sort key.
+         $snames[$i] = 'et al';         # purified version of "et al."
+         next;
+      }
+
+      # lc (purify_string (x)) ought to be sortify (x), but sortify is
+      # a method that only operates on a field, not a generic function
+      # (as it is in BibTeX).
+      $name->split ($sname, $self->filename, $self->line ($field), $i+1);
+      my $formatted = $name->format ($format);
+      $snames[$i] = lc (purify_string ($formatted));
+   }
+   return join (' ', @snames);
+}
+
+
+
+# sort_format_org and sort_format_title are suspiciously similar...
+# could probably have one method to handle both tasks...
+
+sub sort_format_org
+{
+   # Sortable form of FIELD: strip one leading "the" (any case) plus
+   # the whitespace after it, then purify and lower-case the rest.
+   my ($entry, $field_name) = @_;
+   (my $text = $entry->get ($field_name)) =~ s/^the\b\s*//i;
+   return lc (purify_string ($text));
+}
+
+
+sub sort_format_title
+{
+   # Sortable form of FIELD: strip one leading "the", "a" or "an" (any
+   # case) plus the whitespace after it, then purify and lower-case.
+   my ($entry, $field_name) = @_;
+   (my $text = $entry->get ($field_name)) =~ s/^(the|an?)\b\s*//i;
+   return lc (purify_string ($text));
+}
+
+
+# Hmm, I suspect format_alt_fields is a little more general purpose --
+# probably belongs outside of the "generate sort key" methods.
+# (Or.... does it maybe belong in one of the base classes, StructuredEntry
+# or even Entry?)
+
+sub format_alt_fields
+{
+   # Arguments after the invocant are (field => method-name) pairs, tried
+   # in order.  The first field present in the entry is handed to its
+   # method and that method's result is returned.
+   my ($entry, @pairs) = @_;
+
+   while (my ($field_name, $formatter) = splice (@pairs, 0, 2))
+   {
+      next unless $entry->exists ($field_name);
+
+      my $code = $entry->can ($formatter);
+      croak ("unknown method in class " . (ref $entry)) unless $code;
+      return $entry->$code ($field_name);
+   }
+
+   # None of the alternate fields were present.
+   return undef;
+}
+
+1;
+
+=back
+
+=head1 SEE ALSO
+
+L<Text::BibTeX::Structure>, L<Text::BibTeX::Bib>,
+L<Text::BibTeX::BibFormat>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/Entry.pm b/lib/Text/BibTeX/Entry.pm
new file mode 100644
index 0000000..36fd44a
--- /dev/null
+++ b/lib/Text/BibTeX/Entry.pm
@@ -0,0 +1,1087 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/Entry.pm
+# CLASSES : Text::BibTeX::Entry
+# RELATIONS : base class for Text::BibTeX::StructuredEntry, and
+# ultimately for all user-supplied structured entry classes
+# DESCRIPTION: Provides an object-oriented interface to BibTeX entries.
+# CREATED : March 1997, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+package Text::BibTeX::Entry;
+
+require 5.004; # for isa, and delete on a slice
+
+use strict;
+use vars qw'$VERSION';
+use Carp;
+use Text::BibTeX qw(:metatypes :nodetypes);
+
+$VERSION = 0.85;
+
+=head1 NAME
+
+Text::BibTeX::Entry - read and parse BibTeX files
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX::Entry;
+
+ # ...assuming that $bibfile and $newbib are both objects of class
+ # Text::BibTeX::File, opened for reading and writing (respectively):
+
+ # Entry creation/parsing methods:
+ $entry = Text::BibTeX::Entry->new();
+ $entry->read ($bibfile);
+ $entry->parse ($filename, $filehandle);
+ $entry->parse_s ($entry_text);
+
+ # or:
+ $entry = Text::BibTeX::Entry->new( $bibfile );
+ $entry = Text::BibTeX::Entry->new( $filename, $filehandle );
+ $entry = Text::BibTeX::Entry->new( $entry_text );
+
+ # Entry query methods
+ warn "error in input" unless $entry->parse_ok;
+ $metatype = $entry->metatype;
+ $type = $entry->type;
+
+ # if metatype is BTE_REGULAR or BTE_MACRODEF:
+ $key = $entry->key; # only for BTE_REGULAR metatype
+ $num_fields = $entry->num_fields;
+ @fieldlist = $entry->fieldlist;
+ $has_title = $entry->exists ('title');
+ $title = $entry->get ('title');
+ # or:
+ ($val1,$val2,...$valn) = $entry->get ($field1, $field2, ..., $fieldn);
+
+ # if metatype is BTE_COMMENT or BTE_PREAMBLE:
+ $value = $entry->value;
+
+ # Author name methods
+ @authors = $entry->split ('author');
+ ($first_author) = $entry->names ('author');
+
+ # Entry modification methods
+ $entry->set_type ($new_type);
+ $entry->set_key ($new_key);
+ $entry->set ('title', $new_title);
+ # or:
+ $entry->set ($field1, $val1, $field2, $val2, ..., $fieldn, $valn);
+ $entry->delete (@fields);
+ $entry->set_fieldlist (\@fieldlist);
+
+ # Entry output methods
+ $entry->write ($newbib);
+ $entry->print ($filehandle);
+ $entry_text = $entry->print_s;
+
+ # Reset internal parser state:
+ $entry = Text::BibTeX::Entry->new();
+ $entry->parse ($filename, undef);
+ $entry->parse_s (undef);
+
+ # or:
+ $entry = Text::BibTeX::Entry->new( $filename, undef );
+ $entry = Text::BibTeX::Entry->new( undef );
+
+ # Miscellaneous methods
+ $entry->warn ($entry_warning);
+ # or:
+ $entry->warn ($field_warning, $field);
+ $entry->clone;
+
+=head1 DESCRIPTION
+
+C<Text::BibTeX::Entry> does all the real work of reading and parsing
+BibTeX files. (Well, actually it just provides an object-oriented Perl
+front-end to a C library that does all that. But that's not important
+right now.)
+
+BibTeX entries can be read either from C<Text::BibTeX::File> objects (using
+the C<read> method), or directly from a filehandle (using the C<parse>
+method), or from a string (using C<parse_s>). The first is preferable,
+since you don't have to worry about supplying the filename, and because of
+the extra functionality provided by the C<Text::BibTeX::File> class.
+Currently, this means that you may specify the I<database structure> to
+which entries are expected to conform via the C<File> class. This lets you
+ensure that entries follow the rules for required fields and mutually
+constrained fields for a particular type of database, and also gives you
+access to all the methods of the I<structured entry class> for this
+database structure. See L<Text::BibTeX::Structure> for details on database
+structures.
+
+Once you have the entry, you can query it or change it in a variety of
+ways. The query methods are C<parse_ok>, C<type>, C<key>, C<num_fields>,
+C<fieldlist>, C<exists>, and C<get>. Methods for changing the entry are
+C<set_type>, C<set_key>, C<set_fieldlist>, C<delete>, and C<set>.
+
+Finally, you can output BibTeX entries, again either to an open
+C<Text::BibTeX::File> object, a filehandle or a string. (A filehandle or
+C<File> object must, of course, have been opened in write mode.) Output to
+a C<File> object is done with the C<write> method, to a filehandle via
+C<print>, and to a string with C<print_s>. Using the C<File> class is
+recommended for future extensibility, although it currently doesn't offer
+anything extra.
+
+=head1 METHODS
+
+=head2 Entry creation/parsing methods
+
+=over 4
+
+=item new ([OPTS ,] [SOURCE])
+
+Creates a new C<Text::BibTeX::Entry> object. If the SOURCE parameter is
+supplied, it must be one of the following: a C<Text::BibTeX::File> (or
+descendant class) object, a filename/filehandle pair, or a string. Calls
+C<read> to read from a C<Text::BibTeX::File> object, C<parse> to read from
+a filehandle, and C<parse_s> to read from a string.
+
+A filehandle can be specified as a GLOB reference, or as an
+C<IO::Handle> (or descendants) object, or as a C<FileHandle> (or
+descendants) object. (But there's really no point in using
+C<FileHandle> objects, since C<Text::BibTeX> requires Perl 5.004, which
+always includes the C<IO> modules.) You can I<not> pass in the name of
+a filehandle as a string, though, because C<Text::BibTeX::Entry>
+conforms to the C<use strict> pragma (which disallows such symbolic
+references).
+
+The corresponding filename should be supplied in order to allow for
+accurate error messages; if you simply don't have the filename, you can
+pass C<undef> and you'll get error messages without a filename. (It's
+probably better to rearrange your code so that the filename is
+available, though.)
+
+Thus, the following are equivalent to read from a file named by
+C<$filename> (error handling ignored):
+
+ # good ol' fashioned filehandle and GLOB ref
+ open (BIBFILE, $filename);
+ $entry = Text::BibTeX::Entry->new($filename, \*BIBFILE);
+
+ # newfangled IO::File thingy
+ $file = IO::File->new($filename);
+ $entry = Text::BibTeX::Entry->new($filename, $file);
+
+But using a C<Text::BibTeX::File> object is simpler and preferred:
+
+ $file = Text::BibTeX::File->new($filename);
+ $entry = Text::BibTeX::Entry->new($file);
+
+Returns the new object, unless SOURCE is supplied and reading/parsing
+the entry fails (e.g., due to end of file) -- then it returns false.
+
+You may supply a reference to an option hash as first argument.
+Supported options are:
+
+=over 4
+
+=item BINMODE
+
+Set the way Text::BibTeX deals with strings. By default it manages
+strings as bytes. You can set BINMODE to 'utf-8' to get NFC normalized
+UTF-8 strings.
+
+=item NORMALIZATION
+
+Customises the normalization applied to UTF-8 strings. For example:
+
+ Text::BibTeX::Entry->new(
+ { binmode => 'utf-8', normalization => 'NFD' },
+ $file);
+
+=back
+
+
+=cut
+
+sub new
+{
+   # Constructor.  An optional leading hash ref carries options (keys
+   # are matched case-insensitively): `binmode' and `normalization'.
+   # The rest of the arguments, if any, name the source to read from.
+   my ($class, @source) = @_;
+   $class = ref ($class) || $class;
+
+   my $self = bless {'file' => undef, 'type' => undef, 'key' => undef,
+                     'status' => undef, 'metatype' => undef,
+                     'fields' => [], 'values' => {}}, $class;
+
+   # Lower-cased option names are added to a private copy, so the hash
+   # the caller passed in is left untouched.
+   my $opts = {};
+   $opts = { %{shift @source} } if @source && ref ($source[0]) eq "HASH";
+   $opts->{ lc $_ } = $opts->{$_} for ( keys %$opts );
+   $self->{binmode} = 'utf-8'
+      if exists $opts->{binmode} && $opts->{binmode} =~ /utf-?8/i;
+   $self->{normalization} = $opts->{normalization} if exists $opts->{normalization};
+   return $self unless @source;
+
+   my $status;
+
+   # The eval lets a lone unblessed reference fall through to the croak
+   # below instead of dying inside the isa call.
+   if (@source == 1 && ref ($source[0]) && eval { $source[0]->isa ('Text::BibTeX::File') })
+   {
+      my $file = $source[0];
+      $status = $self->read ($file);
+      # a file with a structure supplies the class for its entries
+      if (my $structure = $file->structure)
+      {
+         $self->{structure} = $structure;
+         bless $self, $structure->entry_class;
+      }
+   }
+   elsif (@source == 2 && (defined ($source[0]) && ! ref ($source[0])) && (!defined ($source[1]) || fileno ($source[1]) >= 0))
+   { $status = $self->parse ($source[0], $source[1]) }
+   elsif (@source == 1 && ! ref ($source[0]))
+   { $status = $self->parse_s ($source[0]) }
+   else
+   { croak "new: source argument must be either a Text::BibTeX::File " .
+      "(or descendant) object, filename/filehandle pair, or " .
+      "a string"; }
+
+   return $status unless $status;       # parse failed -- tell our caller
+   $self;
+}
+
+=item clone
+
+Clone a Text::BibTeX::Entry object, returning the clone. This re-uses the reference to any
+Text::BibTeX::Structure or Text::BibTeX::File but copies everything else,
+so that the clone can be modified apart from the original.
+
+=cut
+
+sub clone
+{
+   # Returns a copy of the entry, blessed into the same class.  The
+   # structure and file objects are shared with the original; all other
+   # data is copied one level deep (values that are objects are shared).
+   my $self = shift;
+   my $clone = {};
+
+   # Use the same structure and file objects - won't be changed
+   for my $shared (qw(structure file))
+   {
+      $clone->{$shared} = $self->{$shared} if $self->{$shared};
+   }
+   # These might be changed so make copies
+   for my $attr (qw(binmode normalization type key status metatype))
+   {
+      $clone->{$attr} = $self->{$attr};
+   }
+   # `value' is what the value method returns for @comment and @preamble
+   # entries; without this line a cloned comment/preamble lost its text.
+   $clone->{value} = $self->{value} if exists $self->{value};
+
+   # Copy the containers wholesale rather than walking them with each(),
+   # which depends on (and disturbs) the hashes' iterator state.
+   $clone->{fields} = [ @{ $self->{fields} || [] } ];
+   $clone->{values} = { %{$self->{values}} } if $self->{values};
+   $clone->{lines} = { %{$self->{lines}} } if $self->{lines};
+   return bless $clone, ref ($self);
+}
+
+=item read (BIBFILE)
+
+Reads and parses an entry from BIBFILE, which must be a
+C<Text::BibTeX::File> object (or descendant). The next entry will be read
+from the file associated with that object.
+
+Returns the same as C<parse> (or C<parse_s>): false if no entry found
+(e.g., at end-of-file), true otherwise. To see if the parse itself failed
+(due to errors in the input), call the C<parse_ok> method.
+
+=cut
+
+sub read
+{
+   # Parse the next entry from SOURCE's handle (via parse), remembering
+   # SOURCE as this entry's file.  PRESERVE is handed on to parse as is.
+   my ($self, $source, $preserve) = @_;
+   # ref + eval: undef or an unblessed value must reach the croak below
+   # instead of dying inside the isa call itself.
+   croak "`source' argument must be ref to open Text::BibTeX::File " .
+      "(or descendant) object"
+      unless ref ($source) && eval { $source->isa ('Text::BibTeX::File') };
+
+   $self->{'file'} = $source;           # store File object for later use
+   # Propagate the flags this entry does not already carry
+   $self->{$_} = $source->{$_} for grep { ! exists $self->{$_} } qw(binmode normalization);
+   return $self->parse ($source->{'filename'}, $source->{'handle'}, $preserve);
+}
+
+
+=item parse (FILENAME, FILEHANDLE)
+
+Reads and parses the next entry from FILEHANDLE. (That is, it scans the
+input until an '@' sign is seen, and then slurps up to the next '@'
+sign. Everything between the two '@' signs [including the first one,
+but not the second one -- it's pushed back onto the input stream for the
+next entry] is parsed as a BibTeX entry, with the simultaneous
+construction of an abstract syntax tree [AST]. The AST is traversed to
+ferret out the most interesting information, and this is stuffed into a
+Perl hash, which coincidentally is the C<Text::BibTeX::Entry> object
+you've been tossing around. But you don't need to know any of that -- I
+just figured if you've read this far, you might want to know something
+about the inner workings of this module.)
+
+The success of the parse is stored internally so that you can later
+query it with the C<parse_ok> method. Even in the presence of syntax
+errors, you'll usually get something resembling your input, but it's
+usually not wise to try to do anything with it. Just call C<parse_ok>,
+and if it returns false then silently skip to the next entry. (The
+error messages printed out by the parser should be quite adequate for
+the user to figure out what's wrong. And no, there's currently no way
+for you to capture or redirect those error messages -- they're always
+printed to C<stderr> by the underlying C code. That should change in
+future releases.)
+
+If no '@' signs are seen on the input before reaching end-of-file, then
+we've exhausted all the entries in the file, and C<parse> returns a
+false value. Otherwise, it returns a true value -- even if there were
+syntax errors. Hence, it's important to check C<parse_ok>.
+
+The FILENAME parameter is only used for generating error messages, but
+anybody using your program will certainly appreciate your setting it
+correctly!
+
+Passing C<undef> to FILEHANDLE will reset the state of the underlying
+C parser, which is required in order to parse multiple files.
+
+=item parse_s (TEXT)
+
+Parses a BibTeX entry (using the above rules) from the string TEXT. The
+string is not modified; repeatedly calling C<parse_s> with the same string
+will give you the same results each time. Thus, there's no point in
+putting multiple entries in one string.
+
+Passing C<undef> to TEXT will reset the state of the underlying
+C parser, which may be required in order to parse multiple strings.
+
+=back
+
+=cut
+
+sub _preserve
+{
+   # A defined PRESERVE argument wins; otherwise ask the entry's File
+   # object, if any.  Text::BibTeX::Value is loaded only when needed.
+   my ($entry, $flag) = @_;
+   my $file = $entry->{'file'};
+   $flag = $file->preserve_values
+      unless defined ($flag) || ! defined ($file) || ! $file->isa ('Text::BibTeX::File');
+   require Text::BibTeX::Value if $flag;
+   $flag;
+}
+
+sub parse
+{
+   # With a defined FILEHANDLE, parse the next entry from it; with an
+   # undefined one, reset the parser state instead.  Either way the
+   # result of the underlying call is returned.
+   my ($entry, $filename, $filehandle, $preserve) = @_;
+
+   $preserve = $entry->_preserve ($preserve);
+   return _reset_parse () unless defined $filehandle;
+   return _parse ($entry, $filename, $filehandle, $preserve);
+}
+
+
+sub parse_s
+{
+   # With defined TEXT, parse an entry from that string; with undefined
+   # TEXT, reset the string parser's state instead.  Either way the
+   # result of the underlying call is returned.
+   my ($entry, $text, $preserve) = @_;
+
+   $preserve = $entry->_preserve ($preserve);
+   return _reset_parse_s () unless defined $text;
+   return _parse_s ($entry, $text, $preserve);
+}
+
+
+=head2 Entry query methods
+
+=over 4
+
+=item parse_ok ()
+
+Returns false if there were any serious errors encountered while parsing
+the entry. (A "serious" error is a lexical or syntax error; currently,
+warnings such as "undefined macro" result in an error message being
+printed to C<stderr> for the user's edification, but no notice is
+available to the calling code.)
+
+=item type ()
+
+Returns the type of the entry. (The `type' is the word that follows the
+'@' sign; e.g. `article', `book', `inproceedings', etc. for the standard
+BibTeX styles.)
+
+=item metatype ()
+
+Returns the metatype of the entry. (The `metatype' is a numeric value used
+to classify entry types into four groups: comment, preamble, macro
+definition (C<@string> entries), and regular (all other entry types).
+C<Text::BibTeX> exports four constants for these metatypes: C<BTE_COMMENT>,
+C<BTE_PREAMBLE>, C<BTE_MACRODEF>, and C<BTE_REGULAR>.)
+
+=item key ()
+
+Returns the key of the entry. (The key is the token immediately
+following the opening `{' or `(' in "regular" entries. Returns C<undef>
+for entries that don't have a key, such as macro definition (C<@string>)
+entries.)
+
+=item num_fields ()
+
+Returns the number of fields in the entry. (Note that, currently, this is
+I<not> equivalent to putting C<scalar> in front of a call to C<fieldlist>.
+See below for the consequences of calling C<fieldlist> in a scalar
+context.)
+
+=item fieldlist ()
+
+Returns the list of fields in the entry.
+
+B<WARNING> In scalar context, it no longer returns a
+reference to the object's own list of fields.
+
+=cut
+
+sub parse_ok { my ($entry) = @_; return $entry->{'status'}; }   # the stored parse status
+
+sub metatype {   # the stored metatype, run through _process_result
+   my ($entry) = @_;
+   return Text::BibTeX->_process_result ($entry->{'metatype'}, $entry->{binmode}, $entry->{normalization});
+}
+
+sub type {   # the stored entry type, run through _process_result
+   my ($entry) = @_;
+   return Text::BibTeX->_process_result ($entry->{'type'}, $entry->{binmode}, $entry->{normalization});
+}
+
+sub key {
+   my ($entry) = @_;
+   # no `key' slot at all: report undef without calling _process_result
+   return undef unless exists $entry->{key};
+   return Text::BibTeX->_process_result ($entry->{key}, $entry->{binmode}, $entry->{normalization});
+}
+
+sub num_fields { my ($entry) = @_; return scalar @{$entry->{'fields'}}; }   # length of the field list
+
+sub fieldlist {   # every field name, in stored order, via _process_result
+   my ($entry) = @_;
+   return map { Text::BibTeX->_process_result ($_, $entry->{binmode}, $entry->{normalization}) } @{$entry->{'fields'}};
+}
+
+=item exists (FIELD)
+
+Returns true if a field named FIELD is present in the entry, false
+otherwise.
+
+=item get (FIELD, ...)
+
+Returns the value of one or more FIELDs, as a list of values. For example:
+
+ $author = $entry->get ('author');
+ ($author, $editor) = $entry->get ('author', 'editor');
+
+If a FIELD is not present in the entry, C<undef> will be returned at its
+place in the return list. However, you can't completely trust this as a
+test for presence or absence of a field; it is possible for a field to be
+present but undefined. Currently this can only happen due to certain
+syntax errors in the input, or if you pass an undefined value to C<set>, or
+if you create a new field with C<set_fieldlist> (the new field's value is
+implicitly set to C<undef>).
+
+Normally, the field value is what the input looks like after "maximal
+processing"--quote characters are removed, whitespace is collapsed (the
+same way that BibTeX itself does it), macros are expanded, and multiple
+tokens are pasted together. (See L<bt_postprocess> for details on the
+post-processing performed by B<btparse>.)
+
+For example, if your input file has the following:
+
+ @string{of = "of"}
+ @string{foobars = "Foobars"}
+
+ @article{foobar,
+ title = { The Mating Habits } # of # " Adult " # foobars
+ }
+
+then using C<get> to query the value of the C<title> field from the
+C<foobar> entry would give the string "The Mating Habits of Adult Foobars".
+
+However, in certain circumstances you may wish to preserve the values as
+they appear in the input. This is done by setting a C<preserve_values>
+flag at some point; then, C<get> will return not strings but
+C<Text::BibTeX::Value> objects. Each C<Value> object is a list of
+C<Text::BibTeX::SimpleValue> objects, which in turn consists of a simple
+value type (string, macro, or number) and the text of the simple value.
+Various ways to set the C<preserve_values> flag and the interface to
+both C<Value> and C<SimpleValue> objects are described in
+L<Text::BibTeX::Value>.
+
+=item value ()
+
+Returns the single string associated with C<@comment> and C<@preamble>
+entries. For instance, the entry
+
+ @preamble{" This is a preamble" #
+ {---the concatenation of several strings}}
+
+would return a value of "This is a preamble---the concatenation of
+several strings".
+
+If this entry was parsed in "value preservation" mode, then C<value>
+acts like C<get>, and returns a C<Value> object rather than a simple
+string.
+
+=back
+
+=cut
+
+sub exists
+{
+   # True if FIELD (after _process_argument) is a key of the values hash.
+   my ($entry, $field_name) = @_;
+   my $stored_name = Text::BibTeX->_process_argument ($field_name, $entry->{binmode}, $entry->{normalization});
+   return exists $entry->{values}{$stored_name};
+}
+
+sub get
+{
+   # Look each requested name up (after _process_argument); defined
+   # values go through _process_result, anything else comes back undef.
+   my ($entry, @wanted) = @_;
+   my @stored_names = map { Text::BibTeX->_process_argument ($_, $entry->{binmode}, $entry->{normalization}) } @wanted;
+   my @found = @{$entry->{'values'}}{@stored_names};
+   @found = map { defined ($_) ? Text::BibTeX->_process_result ($_, $entry->{binmode}, $entry->{normalization}) : undef } @found;
+   return (@found > 1) ? @found : $found[0];   # fewer than two: one scalar
+}
+
+sub value {   # the stored `value' slot, run through _process_result
+   my ($entry) = @_;
+   return Text::BibTeX->_process_result ($entry->{value}, $entry->{binmode}, $entry->{normalization});
+}
+
+
+=head2 Author name methods
+
+This is the only part of the module that makes any assumption about the
+nature of the data, namely that certain fields are lists delimited by a
+simple word such as "and", and that the delimited sub-strings are human
+names of the "First von Last" or "von Last, Jr., First" style used by
+BibTeX. If you are using this module for anything other than
+bibliographic data, you can most likely forget about these two methods.
+However, if you are in fact hacking on BibTeX-style bibliographic data,
+these could come in very handy -- the name-parsing done by BibTeX is not
+trivial, and the list-splitting would also be a pain to implement in
+Perl because you have to pay attention to brace-depth. (Not that it
+wasn't a pain to implement in C -- it's just a lot more efficient than a
+Perl implementation would be.)
+
+Incidentally, both of these methods assume that the strings being split
+have already been "collapsed" in the BibTeX way, i.e. all leading and
+trailing whitespace removed and internal whitespace reduced to single
+spaces. This should always be the case when using these two methods on
+a C<Text::BibTeX::Entry> object, but these are actually just front ends
+to more general functions in C<Text::BibTeX>. (More general in that you
+supply the string to be parsed, rather than supplying the name of an
+entry field.) Should you ever use those more general functions
+directly, you might have to worry about collapsing whitespace; see
+L<Text::BibTeX> (the C<split_list> and C<split_name> functions in
+particular) for more information.
+
+Please note that the interface to author name parsing is experimental,
+subject to change, and open to discussion. Please let me know if you
+have problems with it, think it's just perfect, or whatever.
+
+=over 4
+
+=item split (FIELD [, DELIM [, DESC]])
+
+Splits the value of FIELD on DELIM (default: `and'). Don't assume that
+this works the same as Perl's builtin C<split> just because the names are
+the same: in particular, DELIM must be a simple string (no regexps), and
+delimiters that are at the beginning or end of the string, or at non-zero
+brace depth, or not surrounded by whitespace, are ignored. Some examples
+might illuminate matters:
+
+ if field F is... then split (F) returns...
+ 'Name1 and Name2' ('Name1', 'Name2')
+ 'Name1 and and Name2' ('Name1', undef, 'Name2')
+ 'Name1 and' ('Name1 and')
+ 'and Name2' ('and Name2')
+ 'Name1 {and} Name2 and Name3' ('Name1 {and} Name2', 'Name3')
+ '{Name1 and Name2} and Name3' ('{Name1 and Name2}', 'Name3')
+
+Note that a warning will be issued for empty names (as in the second
+example above). A warning ought to be issued for delimiters at the
+beginning or end of a string, but currently this isn't done. (Hmmm.)
+
+DESC is a one-word description of the substrings; it defaults to 'name'.
+It is only used for generating warning messages.
+
+=item names (FIELD)
+
+Splits FIELD as described above, and further splits each name into four
+components: first, von, last, and jr.
+
+Returns a list of C<Text::BibTeX::Name> objects, each of which represents
+one name. Use the C<part> method to query these objects; see
+L<Text::BibTeX::Name> for details on the interface to name objects (and on
+name-parsing as well).
+
+For example if this entry:
+
+ @article{foo,
+ author = {John Smith and
+ Hacker, J. Random and
+ Ludwig van Beethoven and
+ {Foo, Bar and Company}}}
+
+has been parsed into a C<Text::BibTeX::Entry> object C<$entry>, then
+
+ @names = $entry->names ('author');
+
+will put a list of C<Text::BibTeX::Name> objects in C<@names>. These can
+be queried individually as described in L<Text::BibTeX::Name>; for instance,
+
+ @last = $names[0]->part ('last');
+
+would put the list of tokens comprising the last name of the first author
+into the C<@last> array: C<('Smith')>.
+
+=cut
+
+sub split
+{
+   # Hand the stored (raw) value of FIELD to Text::BibTeX::split_list,
+   # along with the filename and line number kept for that field and the
+   # entry's binmode/normalization settings.  Returns nothing when the
+   # field is absent.
+   my ($entry, $field_name, $delim, $desc) = @_;
+   return unless $entry->exists ($field_name);
+   my $source_file = $entry->{file};
+   my %string_opts = (binmode       => $entry->{binmode},
+                      normalization => $entry->{normalization});
+   return Text::BibTeX::split_list ($entry->{values}{$field_name},
+                                    $delim || 'and',
+                                    ($source_file && $source_file->{filename}),
+                                    $entry->{lines}{$field_name},
+                                    $desc || 'name',
+                                    \%string_opts);
+}
+
+sub names
+{
+   # Split FIELD into names (see split) and wrap each one in a
+   # Text::BibTeX::Name object, passing along the filename, the field's
+   # line number and the name's position in the list.
+   require Text::BibTeX::Name;
+   my ($self, $field) = @_;
+   my $filename = ($self->{'file'} && $self->{'file'}{'filename'});
+   my $line = $self->{'lines'}{$field};
+
+   # Positions count from 1, as in BibSort::sort_format_names; passing
+   # the bare index numbered the first name 0.
+   my @names = $self->split ($field);
+   for my $i (0 .. $#names)
+   {
+      $names[$i] = Text::BibTeX::Name->new(
+         {binmode => $self->{binmode}, normalization => $self->{normalization}},$names[$i], $filename, $line, $i + 1);
+   }
+   @names;
+}
+
+=back
+
+=head2 Entry modification methods
+
+=over 4
+
+=item set_type (TYPE)
+
+Sets the entry's type.
+
+=item set_metatype (METATYPE)
+
+Sets the entry's metatype (must be one of the four constants
+C<BTE_COMMENT>, C<BTE_PREAMBLE>, C<BTE_MACRODEF>, and C<BTE_REGULAR>, which
+are all optionally exported from C<Text::BibTeX>).
+
+=item set_key (KEY)
+
+Sets the entry's key.
+
+=item set (FIELD, VALUE, ...)
+
+Sets the value of field FIELD. (VALUE might be C<undef> or unsupplied,
+in which case FIELD will simply be set to C<undef> -- this is where the
+difference between the C<exists> method and testing the definedness of
+field values becomes clear.)
+
+Multiple (FIELD, VALUE) pairs may be supplied; they will be processed in
+order (i.e. the input is treated like a list, not a hash). For example:
+
+ $entry->set ('author', $author);
+ $entry->set ('author', $author, 'editor', $editor);
+
+VALUE can be either a simple string or a C<Text::BibTeX::Value> object;
+it doesn't matter if the entry was parsed in "full post-processing" or
+"preserve input values" mode.
+
+=item delete (FIELD)
+
+Deletes field FIELD from an entry.
+
+=item set_fieldlist (FIELDLIST)
+
+Sets the entry's list of fields to FIELDLIST, which must be a list
+reference. If any of the field names supplied in FIELDLIST are not
+currently present in the entry, they are created with the value C<undef>
+and a warning is printed. Conversely, if any of the fields currently
+present in the entry are not named in the list of fields supplied to
+C<set_fieldlist>, they are deleted from the entry and another warning is
+printed.
+
+=back
+
+=cut
+
+# Store TYPE as the entry's type; the assigned value is the result.
+sub set_type
+{
+    my $self     = shift;
+    my $new_type = shift;
+
+    $self->{'type'} = $new_type;
+}
+
+# Store METATYPE as the entry's metatype; no validation is done here.
+sub set_metatype
+{
+    my $self         = shift;
+    my $new_metatype = shift;
+
+    $self->{'metatype'} = $new_metatype;
+}
+
+# Store KEY as the entry's key after passing it through
+# Text::BibTeX->_process_argument with the entry's binmode and
+# normalization settings.
+sub set_key
+{
+    my $self    = shift;
+    my $raw_key = shift;
+
+    my $key = Text::BibTeX->_process_argument($raw_key,
+                                              $self->{binmode},
+                                              $self->{normalization});
+    $self->{'key'} = $key;
+}
+
+# Set one or more fields from a flat (FIELD, VALUE, ...) list, processed
+# in order.  Every value goes through Text::BibTeX->_process_argument
+# first.  A field that has no value slot yet is appended to the entry's
+# field list.  Croaks when the argument list has an odd length.
+sub set
+{
+    my ($self, @pairs) = @_;
+    croak "set: must supply an even number of arguments"
+       if @pairs % 2;
+
+    while (@pairs)
+    {
+        my $field   = shift @pairs;
+        my ($value) = Text::BibTeX->_process_argument(shift @pairs,
+                                                      $self->{binmode},
+                                                      $self->{normalization});
+
+        # Previously unknown field: remember its position in the list.
+        if (! exists $self->{'values'}{$field})
+        {
+            push (@{$self->{'fields'}}, $field);
+        }
+        $self->{'values'}{$field} = $value;
+    }
+}
+
+# Remove the named fields from both the ordered field list and the value
+# table.  Names that are not present are ignored.  The field list array
+# is rewritten in place.
+sub delete
+{
+    my ($self, @doomed) = @_;
+
+    my %is_doomed;
+    $is_doomed{$_} = 1 foreach @doomed;
+
+    @{$self->{'fields'}} = grep { ! $is_doomed{$_} } @{$self->{'fields'}};
+    delete @{$self->{'values'}}{@doomed};
+}
+
+# Replace the entry's ordered field list with a copy of FIELDLIST (an
+# array reference).  Fields named in the list but absent from the entry
+# are created with an undefined value; fields in the entry but not in the
+# list are deleted.  Each implicit change is reported with carp.
+sub set_fieldlist
+{
+    my ($self, $fields) = @_;
+    my %wanted;
+
+    # First pass: note every requested name, creating the missing ones.
+    foreach my $name (@$fields)
+    {
+        $wanted{$name} = 1;
+        next if exists $self->{'values'}{$name};
+
+        carp "Implicitly adding undefined field \"$name\"";
+        $self->{'values'}{$name} = undef;
+    }
+
+    # Second pass: drop whatever the caller did not ask for.
+    foreach my $name (keys %{$self->{'values'}})
+    {
+        next if $wanted{$name};
+
+        carp "Implicitly deleting field \"$name\"";
+        delete $self->{'values'}{$name};
+    }
+
+    # Install a copy, so later changes to the caller's array don't leak in.
+    $self->{'fields'} = [@$fields];
+}
+
+
+=head2 Entry output methods
+
+=over 4
+
+=item write (BIBFILE)
+
+Prints a BibTeX entry on the filehandle associated with BIBFILE (which
+should be a C<Text::BibTeX::File> object, opened for output). Currently
+the printout is not particularly human-friendly; a highly configurable
+pretty-printer will be developed eventually.
+
+=item print (FILEHANDLE)
+
+Prints a BibTeX entry on FILEHANDLE.
+
+=item print_s ()
+
+Prints a BibTeX entry to a string, which is the return value.
+
+=cut
+
+# Print the entry on the filehandle stored in BIBFILE's 'handle' slot.
+sub write
+{
+    my $self    = shift;
+    my $bibfile = shift;
+
+    $self->print ($bibfile->{'handle'});
+}
+
+# Print the entry (as rendered by print_s) on HANDLE, falling back to
+# STDOUT when no handle is given.
+sub print
+{
+    my $self   = shift;
+    my $handle = shift || \*STDOUT;
+
+    print {$handle} $self->print_s;
+}
+
+# Render a single value as BibTeX source text.  A plain string is wrapped
+# in braces.  A Text::BibTeX::Value object is rendered piece by piece
+# (pieces of type BTAST_STRING in braces, all others verbatim) and the
+# pieces are joined with ' # '.  Any other reference is a fatal error.
+#
+# This used to be written inside print_s; a named sub is package-scoped
+# wherever it appears, so it is defined at the same level here.
+sub value_to_string
+{
+    my $value = shift;
+
+    return "{$value}" unless ref $value;         # just a string
+
+    # Otherwise it has to be a Text::BibTeX::Value object
+    confess "value is a reference, but not to Text::BibTeX::Value object"
+       unless $value->isa ('Text::BibTeX::Value');
+    my @values = $value->values;
+    foreach (@values)
+    {
+        $_ = $_->type == &BTAST_STRING ? '{' . $_->text . '}' : $_->text;
+    }
+    return join (' # ', @values);
+}
+
+# Render the whole entry as BibTeX source text and return it.
+#
+# Regular entries start with "@type{key," and macro definitions with
+# "@type{"; both are followed by one " field = value," line per field and
+# a closing brace.  That text is passed through
+# Text::BibTeX->_process_result before being returned.  Comment and
+# preamble entries are returned directly as a single "@type{value}" line.
+# Missing type, metatype, key or field values are reported with carp and
+# printed as empty.
+sub print_s
+{
+    my $self = shift;
+    my $output;
+
+    carp "entry type undefined" unless defined $self->{'type'};
+    carp "entry metatype undefined" unless defined $self->{'metatype'};
+
+    # Regular and macro-def entries have to be treated differently when
+    # printing the first line, because the former have keys and the latter
+    # do not.
+    if ($self->{'metatype'} == &BTE_REGULAR)
+    {
+        carp "entry key undefined" unless defined $self->{'key'};
+        $output = sprintf ("@%s{%s,\n",
+                           $self->{'type'} || '',
+                           $self->{'key'} || '');
+    }
+    elsif ($self->{'metatype'} == &BTE_MACRODEF)
+    {
+        $output = sprintf ("@%s{\n",
+                           $self->{'type'} || '');
+    }
+
+    # Comment and preamble entries are treated the same -- we print out
+    # the entire entry, on one line, right here.
+    else                                 # comment or preamble
+    {
+        return sprintf ("@%s{%s}\n\n",
+                        $self->{'type'},
+                        value_to_string ($self->{'value'}));
+    }
+
+    # Here we print out all the fields/values of a regular or macro-def
+    # entry.  Iterate over the list rather than testing the truth of each
+    # shifted name: a field called "0" (or "") must not end the loop early.
+    # An entry that has no field list yet simply prints no fields.
+    foreach my $field (@{ $self->{'fields'} || [] })
+    {
+        my $value = $self->{'values'}{$field};
+        if (! defined $value)
+        {
+            carp "field \"$field\" has undefined value\n";
+            $value = '';
+        }
+
+        $output .= " $field = ";
+        $output .= value_to_string ($value);
+
+        $output .= ",\n";
+    }
+
+    # Tack on the last line, and we're done!
+    $output .= "}\n\n";
+
+    Text::BibTeX->_process_result($output, $self->{binmode}, $self->{normalization});
+}
+
+=back
+
+=head2 Miscellaneous methods
+
+=over 4
+
+=item warn (WARNING [, FIELD])
+
+Prepends a bit of location information (filename and line number(s)) to
+WARNING, appends a newline, and passes it to Perl's C<warn>. If FIELD is
+supplied, the line number given is just that of the field; otherwise, the
+range of lines for the whole entry is given. (Well, almost -- currently,
+the line number of the last field is used as the last line of the whole
+entry. This is a bug.)
+
+For example, if lines 10-15 of file F<foo.bib> look like this:
+
+ @article{homer97,
+ author = {Homer Simpson and Ned Flanders},
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = {Journal of Suburban Studies},
+ year = 1997
+ }
+
+then, after parsing this entry to C<$entry>, the calls
+
+ $entry->warn ('what a silly entry');
+ $entry->warn ('what a silly journal', 'journal');
+
+would result in the following warnings being issued:
+
+ foo.bib, lines 10-14: what a silly entry
+ foo.bib, line 13: what a silly journal
+
+=cut
+
+# Issue WARNING through Perl's warn, prefixed with location information.
+# The prefix is the filename (when the entry has a file object) followed
+# by either the line of FIELD, or the entry's START/STOP line range when
+# FIELD is omitted or has no recorded line.  A newline is appended.
+sub warn
+{
+    my ($self, $warning, $field) = @_;
+
+    my $prefix = $self->{'file'}
+       ? $self->{'file'}{'filename'} . ", "
+       : '';
+
+    # "line N" for a one-line entry, "lines N-M" otherwise.
+    my $lines = $self->{'lines'};
+    my $entry_range;
+    if ($lines->{'START'} == $lines->{'STOP'})
+    {
+        $entry_range = "line $lines->{'START'}";
+    }
+    else
+    {
+        $entry_range = "lines $lines->{'START'}-$lines->{'STOP'}";
+    }
+
+    my $where;
+    if (! defined $field)
+    {
+        $where = "$entry_range: ";
+    }
+    elsif (exists $lines->{$field})
+    {
+        $where = "line $lines->{$field}: ";
+    }
+    else
+    {
+        $where = "$entry_range (unknown field \"$field\"): ";
+    }
+
+    warn $prefix . $where . $warning . "\n";
+}
+
+
+=item line ([FIELD])
+
+Returns the line number of FIELD. If the entry was parsed from a string,
+this still works--it's just the line number relative to the start of the
+string. If the entry was parsed from a file, this works just as you'd
+expect it to: it returns the absolute line number with respect to the
+whole file. Line numbers are one-based.
+
+If FIELD is not supplied, returns a two-element list containing the line
+numbers of the beginning and end of the whole entry. (Actually, the
+"end" line number is currently inaccurate: it's really the line
+number of the last field in the entry. But it's better than nothing.)
+
+=cut
+
+# With FIELD: return the line number recorded for that field.  Without:
+# return the (START, STOP) pair recorded for the whole entry.
+sub line
+{
+    my $self  = shift;
+    my $field = shift;
+
+    return $self->{'lines'}{$field} if defined $field;
+
+    return @{$self->{'lines'}}{'START','STOP'};
+}
+
+=item filename ()
+
+Returns the name of the file from which the entry was parsed. Only
+works if the file is represented by a C<Text::BibTeX::File> object---if
+you just passed a filename/filehandle pair to C<parse>, you can't get
+the filename back. (Sorry.)
+
+=cut
+
+# Return the filename stored in the entry's file object, or undef when
+# the entry has no file object.
+#
+# The file slot is tested before it is dereferenced: reading
+# $self->{'file'}{'filename'} directly would autovivify an empty hash in
+# $self->{'file'}, and other methods in this class treat a true 'file'
+# slot as "this entry came from a file".
+sub filename
+{
+    my $self = shift;
+
+    my $file = $self->{'file'};
+    return $file ? $file->{'filename'} : undef;  # poking into File object
+}
+
+1;
+
+=back
+
+=head1 SEE ALSO
+
+L<Text::BibTeX>, L<Text::BibTeX::File>, L<Text::BibTeX::Structure>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
+
+=cut
diff --git a/lib/Text/BibTeX/File.pm b/lib/Text/BibTeX/File.pm
new file mode 100644
index 0000000..0ec70a2
--- /dev/null
+++ b/lib/Text/BibTeX/File.pm
@@ -0,0 +1,265 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/File.pm
+# CLASSES : Text::BibTeX::File
+# RELATIONS :
+# DESCRIPTION: Provides an object-oriented interface to whole BibTeX
+# files.
+# CREATED : March 1997, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::File;
+
+use strict;
+use Carp;
+use IO::File;
+use Text::BibTeX::Entry;
+
+use vars qw'$VERSION';
+$VERSION = 0.85;
+
+=head1 NAME
+
+Text::BibTeX::File - interface to whole BibTeX files
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX::File;
+
+ $bib = Text::BibTeX::File->new("foo.bib") or die "foo.bib: $!\n";
+ # or:
+ $bib = Text::BibTeX::File->new;
+ $bib->open("foo.bib", {binmode => 'utf-8', normalization => 'NFC'}) || die "foo.bib: $!\n";
+
+ $bib->set_structure ($structure_name,
+ $option1 => $value1, ...);
+
+ $at_eof = $bib->eof;
+
+ $bib->close;
+
+=head1 DESCRIPTION
+
+C<Text::BibTeX::File> provides an object-oriented interface to BibTeX
+files. Its most obvious purpose is to keep track of a filename and
+filehandle together for use by the C<Text::BibTeX::Entry> module (which
+is much more interesting). In addition, it allows you to specify
+certain options which are applicable to a whole database (file), rather
+than having to specify them for each entry in the file. Currently, you
+can specify the I<database structure> and some I<structure options>.
+These concepts are fully documented in L<Text::BibTeX::Structure>.
+
+=head1 METHODS
+
+=head2 Object creation, file operations
+
+=over 4
+
+=item new ([FILENAME], [OPTS])
+
+Creates a new C<Text::BibTeX::File> object. If FILENAME is supplied, passes
+it to the C<open> method (along with OPTS). If the C<open> fails, C<new>
+fails and returns false; if the C<open> succeeds (or if FILENAME isn't
+supplied), C<new> returns the new object reference.
+
+=item open (FILENAME [OPTS])
+
+Opens the file specified by FILENAME. OPTS is a hashref that can have
+the following values:
+
+=over 4
+
+=item MODE
+
+mode as specified by L<IO::File>
+
+=item PERMS
+
+permissions as specified by L<IO::File>. Can only be used in conjunction
+with C<MODE>
+
+=item BINMODE
+
+By default, Text::BibTeX uses bytes directly. Thus, you need to encode
+strings accordingly with the encoding of the files you are reading. You can
+also select UTF-8. In this case, Text::BibTeX will return UTF-8 strings in
+NFC mode. Note that at the moment files with BOM are not supported.
+
+Valid values are 'raw/bytes' or 'utf-8'.
+
+=item NORMALIZATION
+
+By default, Text::BibTeX outputs UTF-8 in NFC form. You can change this by passing
+the name of a different form.
+
+Valid values are those forms supported by the Unicode::Normalize module
+('NFD', 'NFKD' etc.)
+
+=item RESET_MACROS
+
+By default, Text::BibTeX accumulates macros. This means that when you open a second
+file, macros defined by the first are still available. This may result in warnings
+of macros being redefined.
+
+This option can be used to force Text::BibTeX to clean up all macro definitions
+(except for the month macros).
+
+=back
+
+=item close ()
+
+Closes the filehandle associated with the object. If there is no such
+filehandle (i.e., C<open> was never called on the object), does nothing.
+
+=item eof ()
+
+Returns the end-of-file state of the filehandle associated with the
+object: a true value means we are at the end of the file.
+
+=back
+
+=cut
+
+# Construct a File object.  Any arguments are handed to the open method;
+# when that fails the constructor returns undef instead of the object.
+sub new
+{
+    my $invocant = shift;
+    my $self     = bless {}, ref ($invocant) || $invocant;
+
+    if (@_)
+    {
+        $self->open (@_) or return undef;
+    }
+    $self;
+}
+
+# Open FILENAME on a fresh IO::File handle kept in $self->{handle}.
+#
+# The second argument may be a hash reference of options, whose keys are
+# matched case-insensitively:
+#   binmode        anything matching /utf-?8/i selects 'utf-8'; the
+#                  default is 'bytes'
+#   normalization  stored as given; the default is 'NFC'
+#   reset_macros   when true, deletes all macros and redefines the months
+#   mode, perms    forwarded to IO::File::open (perms only with mode)
+# If the second argument is not a hash reference, all remaining arguments
+# are forwarded to IO::File::open unchanged.
+#
+# Returns whatever IO::File::open returns.  The caller's options hash is
+# never modified: the lowercased keys go into a private copy.
+sub open {
+    my ($self) = shift;
+    $self->{filename} = shift;
+
+    $self->{binmode}       = 'bytes';
+    $self->{normalization} = 'NFC';
+    my @args = ( $self->{filename} );
+
+    if ( ref $_[0] eq "HASH" ) {
+        my $given = shift;
+
+        # Work on a copy so that the caller's hash keeps its original keys.
+        my %opts = %$given;
+        $opts{ lc $_ } = $given->{$_} for ( keys %$given );
+
+        $self->{binmode} = 'utf-8'
+            if defined $opts{binmode} && $opts{binmode} =~ /utf-?8/i;
+        $self->{normalization} = $opts{normalization}
+            if exists $opts{normalization};
+
+        if ( $opts{reset_macros} ) {
+            Text::BibTeX::delete_all_macros();
+            Text::BibTeX::_define_months();
+        }
+
+        if ( exists $opts{mode} ) {
+            push @args, $opts{mode};
+            push @args, $opts{perms} if exists $opts{perms};
+        }
+    }
+    else {
+        push @args, @_;
+    }
+
+    $self->{handle} = IO::File->new;
+    $self->{handle}->open(@args);    # filename, maybe mode, maybe perms
+}
+
+
+# Close the object's filehandle, if it has one.  Before closing, an Entry
+# constructor call with an undefined handle is made to reset the parser.
+sub close
+{
+    my $self   = shift;
+    my $handle = $self->{handle};
+
+    if ($handle)
+    {
+        Text::BibTeX::Entry->new ($self->{filename}, undef); # resets parser
+        $handle->close;
+    }
+}
+
+# Report the end-of-file state of the object's filehandle.
+sub eof
+{
+    my $self = shift;
+
+    eof ($self->{handle});
+}
+
+# Destructor: run the close method when the object is destroyed.
+sub DESTROY
+{
+    my ($self) = @_;
+
+    $self->close;
+}
+
+=head2 Object properties
+
+=over 4
+
+=item set_structure (STRUCTURE [, OPTION =E<gt> VALUE, ...])
+
+Sets the database structure for a BibTeX file. At the simplest level,
+this means that entries from the file are expected to conform to certain
+field requirements as specified by the I<structure module>. It also
+gives you full access to the methods of the particular I<structured
+entry class> for this structure, allowing you to perform operations
+specific to this kind of database. See L<Text::BibTeX::Structure/"CLASS
+INTERACTIONS"> for all the consequences of setting the database
+structure for a C<Text::BibTeX::File> object.
+
+=item structure ()
+
+Returns the name of the database structure associated with the object
+(as set by C<set_structure>).
+
+=cut
+
+# Create a Text::BibTeX::Structure object from STRUCTURE and the
+# (OPTION => VALUE, ...) list and store it in the object.  Croaks when
+# the option list has an odd number of elements.
+sub set_structure
+{
+    my $self      = shift;
+    my $structure = shift;
+    my @options   = @_;
+
+    require Text::BibTeX::Structure;
+    if (@options % 2)
+    {
+        croak "Text::BibTeX::File::set_structure: options list must have even " .
+              "number of elements";
+    }
+    $self->{structure} = Text::BibTeX::Structure->new($structure, @options);
+}
+
+# Return whatever set_structure stored in the object's 'structure' slot.
+sub structure
+{
+    my ($self) = @_;
+    $self->{structure};
+}
+
+
+=item preserve_values ([PRESERVE])
+
+Sets the "preserve values" flag, to control all future parsing of entries
+from this file. If PRESERVE isn't supplied, returns the current state of
+the flag. See L<Text::BibTeX::Value> for details on parsing in "value
+preservation" mode.
+
+=back
+
+=cut
+
+# Combined getter/setter for the "preserve values" flag: stores PRESERVE
+# when an argument is supplied, then returns the current flag.
+sub preserve_values
+{
+    my ($self, @new_state) = @_;
+
+    if (@new_state)
+    {
+        $self->{'preserve_values'} = $new_state[0];
+    }
+    $self->{'preserve_values'};
+}
+
+
+1;
+
+=head1 SEE ALSO
+
+L<Text::BibTeX>, L<Text::BibTeX::Entry>, L<Text::BibTeX::Structure>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/Name.pm b/lib/Text/BibTeX/Name.pm
new file mode 100644
index 0000000..00e7ed5
--- /dev/null
+++ b/lib/Text/BibTeX/Name.pm
@@ -0,0 +1,426 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/Name.pm
+# CLASSES : Text::BibTeX::Name
+# RELATIONS :
+# DESCRIPTION: Provides an object-oriented interface to the BibTeX-
+# style author names (parsing them, that is; formatting
+# them is done by the Text::BibTeX::NameFormat class).
+# CREATED : Nov 1997, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::Name;
+
+require 5.004;
+
+use strict;
+use Carp;
+use vars qw'$VERSION';
+$VERSION = 0.85;
+
+use Text::BibTeX;
+
+=encoding UTF-8
+
+=head1 NAME
+
+Text::BibTeX::Name - interface to BibTeX-style author names
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX::Name;
+
+ $name = Text::BibTeX::Name->new();
+ $name->split('J. Random Hacker');
+ # or:
+ $name = Text::BibTeX::Name->new('J. Random Hacker');
+
+ @firstname_tokens = $name->part ('first');
+ $lastname = join (' ', $name->part ('last'));
+
+ $format = Text::BibTeX::NameFormat->new();
+ # ...customize $format...
+ $formatted = $name->format ($format);
+
+=head1 DESCRIPTION
+
+C<Text::BibTeX::Name> provides an abstraction for BibTeX-style names and
+some basic operations on them. A name, in the BibTeX world, consists of
+a list of I<tokens> which are divided amongst four I<parts>: `first',
+`von', `last', and `jr'.
+
+Tokens are separated by whitespace or commas at brace-level zero. Thus
+the name
+
+ van der Graaf, Horace Q.
+
+has five tokens, whereas the name
+
+ {Foo, Bar, and Sons}
+
+consists of a single token. Skip down to L<"EXAMPLES"> for more examples, or
+read on if you want to know the exact details of how names are split into
+tokens and parts.
+
+How tokens are divided into parts depends on the form of the name. If
+the name has no commas at brace-level zero (as in the second example),
+then it is assumed to be in either "first last" or "first von last"
+form. If there are no tokens that start with a lower-case letter, then
+"first last" form is assumed: the final token is the last name, and all
+other tokens form the first name. Otherwise, the earliest contiguous
+sequence of tokens with initial lower-case letters is taken as the `von'
+part; if this sequence includes the final token, then a warning is
+printed and the final token is forced to be the `last' part.
+
+If a name has a single comma, then it is assumed to be in "von last,
+first" form. A leading sequence of tokens with initial lower-case
+letters, if any, forms the `von' part; tokens between the `von' and the
+comma form the `last' part; tokens following the comma form the `first'
+part. Again, if there are no tokens following a leading sequence of
+lowercase tokens, a warning is printed and the token immediately
+preceding the comma is taken to be the `last' part.
+
+If a name has more than two commas, a warning is printed and the name is
+treated as though only the first two commas were present.
+
+Finally, if a name has two commas, it is assumed to be in "von last, jr,
+first" form. (This is the only way to represent a name with a `jr'
+part.) The parsing of the name is the same as for a one-comma name,
+except that tokens between the two commas are taken to be the `jr' part.
+
+=head1 CAVEAT
+
+The C code that does the actual work of splitting up names takes a shortcut
+and makes a few assumptions about whitespace. In particular, there must be
+no leading whitespace, no trailing whitespace, no consecutive whitespace
+characters in the string, and no whitespace characters other than space.
+In other words, all whitespace must consist of lone internal spaces.
+
+=head1 EXAMPLES
+
+The strings C<"John Smith"> and C<"Smith, John"> are different
+representations of the same name, so split into parts and tokens the
+same way, namely as:
+
+ first => ('John')
+ von => ()
+ last => ('Smith')
+ jr => ()
+
+Note that every part is a list of tokens, even if there is only one
+token in that part; empty parts get empty token lists. Every token is
+just a string. Writing this example in actual code is simple:
+
+ $name = Text::BibTeX::Name->new("John Smith"); # or "Smith, John"
+ $name->part ('first'); # returns list ("John")
+ $name->part ('last'); # returns list ("Smith")
+ $name->part ('von'); # returns list ()
+ $name->part ('jr'); # returns list ()
+
+(We'll omit the empty parts in the rest of the examples: just assume
+that any unmentioned part is an empty list.) If more than two tokens
+are included and there's no comma, they'll go to the first name: thus
+C<"John Q. Smith"> splits into
+
+ first => ("John", "Q.")
+ last => ("Smith")
+
+and C<"J. R. R. Tolkein"> into
+
+ first => ("J.", "R.", "R.")
+ last => ("Tolkein")
+
+The ambiguous name C<"Kevin Philips Bong"> splits into
+
+ first => ("Kevin", "Philips")
+ last => ("Bong")
+
+which may or may not be the right thing, depending on the particular
+person. There's no way to know though, so if this fellow's last name is
+"Philips Bong" and not "Bong", the string representation of his name
+must disambiguate. One possibility is C<"Philips Bong, Kevin"> which
+splits into
+
+ first => ("Kevin")
+ last => ("Philips", "Bong")
+
+Alternately, C<"Kevin {Philips Bong}"> takes advantage of the fact that
+tokens are only split on whitespace I<at brace-level zero>, and becomes
+
+ first => ("Kevin")
+ last => ("{Philips Bong}")
+
+which is fine if your names are destined to be processed by TeX, but
+might be problematic in other contexts. Similarly, C<"St John-Mollusc,
+Oliver"> becomes
+
+ first => ("Oliver")
+ last => ("St", "John-Mollusc")
+
+which can also be written as C<"Oliver {St John-Mollusc}">:
+
+ first => ("Oliver")
+ last => ("{St John-Mollusc}")
+
+Since tokens are separated purely by whitespace, hyphenated names will
+work either way: both C<"Nigel Incubator-Jones"> and C<"Incubator-Jones,
+Nigel"> come out as
+
+ first => ("Nigel")
+ last => ("Incubator-Jones")
+
+Multi-token last names with lowercase components -- the "von part" --
+work fine: both C<"Ludwig van Beethoven"> and C<"van Beethoven, Ludwig">
+parse (correctly) into
+
+ first => ("Ludwig")
+ von => ("van")
+ last => ("Beethoven")
+
+This allows these European aristocratic names to sort properly,
+i.e. I<van Beethoven> under I<B> rather than I<v>. Speaking of
+aristocratic European names, C<"Charles Louis Xavier Joseph de la
+Vall{\'e}e Poussin"> is handled just fine, and splits into
+
+ first => ("Charles", "Louis", "Xavier", "Joseph")
+ von => ("de", "la")
+ last => ("Vall{\'e}e", "Poussin")
+
+so could be sorted under I<V> rather than I<d>. (Note that the sorting
+algorithm in L<Text::BibTeX::BibSort> is a slavish imitation of BibTeX
+0.99, and therefore does the wrong thing with these names: the sort key
+starts with the "von" part.)
+
+However, capitalized "von parts" don't work so well: C<"R. J. Van de
+Graaff"> splits into
+
+ first => ("R.", "J.", "Van")
+ von => ("de")
+ last => ("Graaff")
+
+which is clearly wrong. This name should be represented as C<"Van de
+Graaff, R. J.">
+
+ first => ("R.", "J.")
+ last => ("Van", "de", "Graaff")
+
+which is probably right. (This particular Van de Graaff was an
+American, so he probably belongs under I<V> -- which is where my
+(British) dictionary puts him. Other Van de Graaff's mileages may
+vary.)
+
+Finally, many names include a suffix: "Jr.", "III", "fils", and so
+forth. These are handled, but with some limitations. If there's a
+comma before the suffix (the usual U.S. convention for "Jr."), then the
+name should be in I<last, jr, first> form, e.g. C<"Doe, Jr., John">
+comes out (correctly) as
+
+ first => ("John")
+ last => ("Doe")
+ jr => ("Jr.")
+
+but C<"John Doe, Jr."> is ambiguous and is parsed as
+
+ first => ("Jr.")
+ last => ("John", "Doe")
+
+(so don't do it that way). If there's no comma before the suffix -- the
+usual for Roman numerals, and occasionally seen with "Jr." -- then
+you're stuck and have to make the suffix part of the last name. Thus,
+C<"Gates III, William H."> comes out
+
+ first => ("William", "H.")
+ last => ("Gates", "III")
+
+but C<"William H. Gates III"> is ambiguous, and becomes
+
+ first => ("William", "H.", "Gates")
+ last => ("III")
+
+-- not what you want. Again, the curly-brace trick comes in handy, so
+C<"William H. {Gates III}"> splits into
+
+ first => ("William", "H.")
+ last => ("{Gates III}")
+
+There is no way to make a comma-less suffix the C<jr> part. (This is an
+unfortunate consequence of slavishly imitating BibTeX 0.99.)
+
+Finally, names that aren't really names of people but rather are
+organization or company names should be forced into a single token by
+wrapping them in curly braces. For example, "Foo, Bar and Sons" should
+be written C<"{Foo, Bar and Sons}">, which will split as
+
+ last => ("{Foo, Bar and Sons}")
+
+Of course, if this is one name in a BibTeX C<authors> or C<editors>
+list, this name has to be wrapped in braces anyways (because of the C<"
+and ">), but that's another story.
+
+=head1 FORMATTING NAMES
+
+Putting a split-up name back together again in a flexible, customizable
+way is the job of another module: see L<Text::BibTeX::NameFormat>.
+
+=head1 METHODS
+
+=over 4
+
+=item new([ [OPTS,] NAME [, FILENAME, LINE, NAME_NUM]])
+
+Creates a new C<Text::BibTeX::Name> object. If NAME is supplied, it
+must be a string containing a single name, and it will be passed to
+the C<split> method for further processing. FILENAME, LINE, and
+NAME_NUM, if present, are all also passed to C<split> to allow better
+error messages.
+
+If the first argument is a hash reference, it is used to define
+configuration values. At the moment the available values are:
+
+=over 4
+
+=item BINMODE
+
+Set the way Text::BibTeX deals with strings. By default it manages
+strings as bytes. You can set BINMODE to 'utf-8' to get NFC normalized
+UTF-8 strings and you can customise the normalization with the NORMALIZATION option.
+
+ Text::BibTeX::Name->new(
+ { binmode => 'utf-8', normalization => 'NFD' },
+ "Alberto Simões");
+
+=back
+
+=cut
+
+# Construct a Name object.
+#
+# An optional leading hash reference supplies options, whose keys are
+# matched case-insensitively: binmode (anything matching /utf-?8/i
+# selects 'utf-8', default 'bytes') and normalization (default 'NFC').
+# If NAME is defined it is split immediately; FILENAME, LINE and NAME_NUM
+# are handed on to the split method.
+#
+# The caller's options hash is never modified: the lowercased keys go
+# into a private copy.
+sub new {
+    my $class = shift;
+    my $given = ref $_[0] eq 'HASH' ? shift : {};
+
+    my %opts = %$given;
+    $opts{ lc $_ } = $given->{$_} for ( keys %$given );
+
+    my ( $name, $filename, $line, $name_num ) = @_;
+
+    $class = ref($class) || $class;
+    my $self = bless { }, $class;
+
+    $self->{binmode}       = 'bytes';
+    $self->{normalization} = 'NFC';
+    $self->{binmode} = 'utf-8'
+        if defined $opts{binmode} && $opts{binmode} =~ /utf-?8/i;
+    $self->{normalization} = $opts{normalization}
+        if exists $opts{normalization};
+
+    $self->split( Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}),
+                  $filename, $line, $name_num, 1 )
+        if ( defined $name );
+    $self;
+}
+
+
+# Destructor: call the free method, which releases the C structure kept
+# by `split'.
+sub DESTROY
+{
+    my ($self) = @_;
+
+    $self->free;
+}
+
+
+=item split (NAME [, FILENAME, LINE, NAME_NUM])
+
+Splits NAME (a string containing a single name) into tokens and
+subsequently into the four parts of a BibTeX-style name (first, von,
+last, and jr). (Each part is a list of tokens, and tokens are separated
+by whitespace or commas at brace-depth zero. See above for full details
+on how a name is split into its component parts.)
+
+The token-lists that make up each part of the name are then stored in
+the C<Text::BibTeX::Name> object for later retrieval or formatting with
+the C<part> and C<format> methods.
+
+=cut
+
+# Split NAME by calling the _split XSUB.  The name is first passed
+# through Text::BibTeX->_process_argument; LINE and NAME_NUM default to
+# -1 when undefined, and a constant 1 is passed as the final argument.
+sub split
+{
+    my ($self, $name, $filename, $line, $name_num) = @_;
+
+    # Fill in the defaults the XSUB expects
+    $line     = -1 unless defined $line;
+    $name_num = -1 unless defined $name_num;
+
+    $self->_split (Text::BibTeX->_process_argument($name, $self->{binmode}, $self->{normalization}),
+                   $filename,
+                   $line,
+                   $name_num,
+                   1);
+}
+
+
+=item part (PARTNAME)
+
+Returns the list of tokens in part PARTNAME of a name previously split with
+C<split>. For example, suppose a C<Text::BibTeX::Name> object is created and
+initialized like this:
+
+ $name = Text::BibTeX::Name->new();
+ $name->split ('Charles Louis Xavier Joseph de la Vall{\'e}e Poussin');
+
+Then this code:
+
+ $name->part ('von');
+
+would return the list C<('de','la')>.
+
+=cut
+
+# Return the tokens stored for PARTNAME ('first', 'von', 'last' or 'jr'),
+# each passed through Text::BibTeX->_process_result.  With more than one
+# token the list is returned; otherwise the first token alone (which is
+# undef for an empty part).  Returns undef when the object holds no such
+# part.  Croaks on any other part name.
+sub part {
+    my $self     = shift;
+    my $partname = shift;
+
+    if ( $partname !~ /^(first|von|last|jr)$/ ) {
+        croak "unknown name part";
+    }
+
+    return undef unless exists $self->{$partname};
+
+    my @tokens;
+    for my $raw ( @{ $self->{$partname} } ) {
+        push @tokens,
+            Text::BibTeX->_process_result($raw, $self->{binmode}, $self->{normalization});
+    }
+
+    return @tokens if @tokens > 1;
+    return $tokens[0];
+}
+
+
+=item format (FORMAT)
+
+Formats a name according to the specifications encoded in FORMAT, which
+should be a C<Text::BibTeX::NameFormat> (or descendant) object. (In short,
+it must supply a method C<apply> which takes a C<Text::BibTeX::Name>
+object as its only argument.) Returns the formatted name as a string.
+
+See L<Text::BibTeX::NameFormat> for full details on formatting names.
+
+=cut
+
+# Format this name by handing it to FORMAT's apply method; the result of
+# that call is returned.
+sub format
+{
+    my $self      = shift;
+    my $formatter = shift;
+
+    $formatter->apply ($self);
+}
+
+1;
+
+=back
+
+=head1 SEE ALSO
+
+L<Text::BibTeX::Entry>, L<Text::BibTeX::NameFormat>, L<bt_split_names>.
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/NameFormat.pm b/lib/Text/BibTeX/NameFormat.pm
new file mode 100644
index 0000000..26ed9ff
--- /dev/null
+++ b/lib/Text/BibTeX/NameFormat.pm
@@ -0,0 +1,325 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/NameFormat.pm
+# CLASSES : Text::BibTeX::NameFormat
+# RELATIONS :
+# DESCRIPTION: Provides a way to format already-parsed BibTeX-style
+# author names. (The parsing is done by the
+# Text::BibTeX::Name class.)
+# CREATED : Nov 1997, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::NameFormat;
+
+require 5.004;
+
+use strict;
+use Carp;
+use vars qw'$VERSION';
+$VERSION = 0.85;
+
+=head1 NAME
+
+Text::BibTeX::NameFormat - format BibTeX-style author names
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX::NameFormat;
+
+ $format = Text::BibTeX::NameFormat->new($parts, $abbrev_first);
+
+ $format->set_text ($part,
+ $pre_part, $post_part,
+ $pre_token, $post_token);
+
+ $format->set_options ($part, $abbrev, $join_tokens, $join_part);
+
+ ## Uses the encoding/binmode and normalization form stored in $name
+ $formatted_name = $format->apply ($name);
+
+=head1 DESCRIPTION
+
+After splitting a name into its component parts (represented as a
+C<Text::BibTeX::Name> object), you often want to put it back together
+again as a single string formatted in a consistent way.
+C<Text::BibTeX::NameFormat> provides a very flexible way to do this,
+generally in two stages: first, you create a "name format" which
+describes how to put the tokens and parts of any name back together, and
+then you apply the format to a particular name.
+
+The "name format" is encapsulated in a C<Text::BibTeX::NameFormat>
+object. The constructor (C<new>) includes some clever behind-the-scenes
+trickery that means you can usually get away with calling it alone, and
+not need to do any customization of the format object. If you do need
+to customize the format, though, the C<set_text()> and C<set_options()>
+methods provide that capability.
+
+Note that C<Text::BibTeX::NameFormat> is a fairly direct translation of
+the name-formatting C interface in the B<btparse> library. This manual
+page is meant to provide enough information to use the Perl class, but
+for more details and examples, consult L<bt_format_names>.
+
+=head1 CONSTANTS
+
+Two enumerated types for dealing with names and name formatting have
+been brought from C into Perl. In the B<btparse> documentation, you'll
+see references to C<bt_namepart> and C<bt_joinmethod>. The former lists
+the four "parts" of a BibTeX name: first, von, last, and jr; its values
+(in both C and Perl) are C<BTN_FIRST>, C<BTN_VON>, C<BTN_LAST>, and
+C<BTN_JR>. The latter lists the ways in which C<bt_format_name()> (the
+C function that corresponds to C<Text::BibTeX::NameFormat>'s C<apply>
+method) can join adjacent tokens together: C<BTJ_MAYTIE>, C<BTJ_SPACE>,
+C<BTJ_FORCETIE>, and C<BTJ_NOTHING>. Both sets of values may be
+imported from the C<Text::BibTeX> module, using the import tags
+C<nameparts> and C<joinmethods>. For instance:
+
+ use Text::BibTeX qw(:nameparts :joinmethods);
+ use Text::BibTeX::Name;
+ use Text::BibTeX::NameFormat;
+
+The "name part" constants are used to specify surrounding text or
+formatting options on a per-part basis: for instance, you can supply the
+"pre-token" text, or the "abbreviate" flag, for a single part without
+affecting other parts. The "join methods" are two of the three
+formatting options that you can set for a part: you can control how to
+join the individual tokens of a name (C<"JR Smith">, or C<"J R Smith">,
+or C<"J~R Smith">), and you can control how the final token of one part
+is joined to the next part (C<"la Roche"> versus C<"la~Roche">).
+
+=head1 METHODS
+
+=over 4
+
+=item new(PARTS, ABBREV_FIRST)
+
+Creates a new name format, with the two most common customizations: which
+parts to include (and in what order), and whether to abbreviate the first
+name. PARTS should be a string with at most four characters, one representing
+each part that you want to occur in a formatted name (defaults to C<"fvlj">).
+For example, C<"fvlj"> means to format names in "first von last jr" order,
+while C<"vljf"> denotes "von last jr first." ABBREV_FIRST is just a boolean
+value: false to print out the first name in full, and true to abbreviate it
+with periods after each token and discretionary ties between tokens (defaults
+to false). All intra- and inter-token punctuation and spacing is independently
+controllable with the C<set_text> and C<set_options> methods, although these
+will rarely be necessary---sensible defaults are chosen for everything, based
+on the PARTS and ABBREV_FIRST values that you supply. See the description of
+C<bt_create_name_format()> in L<bt_format_names> for full details of the
+choices made.
+
+=cut
+
+# Constructor.  Validates PARTS, then asks the XS layer (create) to build
+# the underlying C name-format structure, which is stored in the object
+# under the '_cstruct' key.
+#
+#   PARTS         string of 1 to 4 characters drawn from f, v, l, j;
+#                 defaults to "fvlj" when false
+#   ABBREV_FIRST  flag passed on to create; defaults to 0 when undefined
+#
+# Croaks (reported from the caller's point of view) if PARTS contains
+# anything other than those letters or is too long.
+sub new
+{
+    my ($class, $parts, $abbrev_first) = @_;
+
+    $parts ||= "fvlj";
+    $abbrev_first = 0 unless defined $abbrev_first;
+
+    # \A...\z rather than ^...$ so that a trailing newline is rejected
+    # here instead of being passed down to the C code.
+    croak "invalid name parts '$parts': expected 1 to 4 of the letters f, v, l, j"
+        unless $parts =~ /\A[fvlj]{1,4}\z/;
+
+    # Allow new to be called on an existing object as well as on a class.
+    $class = ref ($class) || $class;
+    my $self = bless {}, $class;
+    $self->{_cstruct} = create ($parts, $abbrev_first);
+    return $self;
+}
+
+
+# Destructor: hand the stored C structure (if there is one) to free.
+# Objects that never received a '_cstruct' are left alone.
+sub DESTROY
+{
+    my ($self) = @_;
+
+    defined $self->{'_cstruct'}
+        and free ($self->{'_cstruct'});
+}
+
+
+=item set_text (PART, PRE_PART, POST_PART, PRE_TOKEN, POST_TOKEN)
+
+Allows you to customize some or all of the surrounding text for a single
+name part. Every name part has four possible chunks of text that go
+around or within it: before/after the part as a whole, and before/after
+each token in the part. For instance, if you are abbreviating first
+names and wish to control the punctuation after each token in the first
+name, you would set the "post token" text:
+
+ $format->set_text ('first', undef, undef, undef, '');
+
+would set the post-token text to the empty string, resulting in names
+like C<"J R Smith">. (Normally, abbreviated first names will have a
+period after each token: C<"J. R. Smith">.) Note that supplying
+C<undef> for the other three values leaves them unchanged.
+
+See L<bt_format_names> for full information on formatting names.
+
+=cut
+
+# Set some or all of the four pieces of surrounding text for one name
+# part, by passing them to the XS routine _set_text along with the stored
+# C structure.  Always returns 1.
+#
+#   PART                   the name part whose text is being set
+#   PRE_PART, POST_PART    text before/after the part as a whole
+#   PRE_TOKEN, POST_TOKEN  text before/after each token in the part
+#
+# Per the POD above, an undef value leaves that piece of text unchanged.
+sub set_text
+{
+    my ($self, $part, $pre_part, $post_part, $pre_token, $post_token) = @_;
+
+    # Engage in a little conspiracy with the XS code (_set_text) and the
+    # underlying C function (bt_set_format_text) here.  Neither of those
+    # functions copies the strings we pass in -- they just keep the C
+    # pointers, which refer back to the Perl scalars passed below.  If
+    # one of those scalars were freed while the format still pointed at
+    # it, the C code would be left with a dangling pointer.  So we hold
+    # a reference to every string the format may still be using.
+    #
+    # The references are tracked per part and per slot.  A slot is only
+    # replaced when a new defined value is supplied for that same part:
+    # undef leaves the slot's existing text in place, and a call for one
+    # part says nothing about the text of another, so in both cases the
+    # earlier string has to stay alive.  (Replacing the whole list on
+    # every call, as this method once did, released strings that were
+    # still in use.)  Everything is dropped along with the object's hash
+    # when the NameFormat object goes away.
+
+    my $part_key = defined $part ? $part : '';
+    my $kept     = $self->{'textrefs'}{$part_key} ||= [];
+
+    my $slot = 0;
+    for my $text_ref (\$pre_part, \$post_part, \$pre_token, \$post_token)
+    {
+        $kept->[$slot] = $text_ref if defined $$text_ref;
+        $slot++;
+    }
+
+    _set_text ($self->{'_cstruct'},
+               $part,
+               $pre_part,
+               $post_part,
+               $pre_token,
+               $post_token);
+    return 1;
+}
+
+
+=item set_options (PART, ABBREV, JOIN_TOKENS, JOIN_PART)
+
+Allows further customization of a name format: you can set the
+abbreviation flag and the two token-join methods. Alas, there is no
+mechanism for leaving a value unchanged; you must set everything with
+C<set_options>.
+
+For example, let's say that just dropping periods from abbreviated
+tokens in the first name isn't enough; you I<really> want to save
+space by jamming the abbreviated tokens together: C<"JR Smith"> rather
+than C<"J R Smith">. Assuming the two calls in the above example have
+been done, the following will finish the job:
+
+ $format->set_options (BTN_FIRST,
+ 1, # keep same value for abbrev flag
+ BTJ_NOTHING, # jam tokens together
+ BTJ_SPACE); # space after final token of part
+
+Note that we unfortunately had to know (and supply) the current values
+for the abbreviation flag and post-part join method, even though we were
+only setting the intra-part join method.
+
+=cut
+
+# Pass the abbreviation flag and the two join methods for PART to the XS
+# routine _set_options, together with the stored C structure.  All three
+# option values are forwarded as given.  Always returns 1.
+sub set_options
+{
+    my $self = shift;
+    my ($part, $abbrev, $join_tokens, $join_part) = @_;
+
+    _set_options ($self->{'_cstruct'}, $part, $abbrev, $join_tokens, $join_part);
+
+    return 1;
+}
+
+
+=item apply (NAME)
+
+Once a name format has been created and customized to your heart's
+content, you can use it to format any number of names using the C<apply>
+method. NAME must be a C<Text::BibTeX::Name> object (i.e., a pre-split
+name); C<apply> returns a string containing the parts of the name
+formatted according to the C<Text::BibTeX::NameFormat> structure it is
+called on.
+
+=cut
+
+# Format NAME according to this name format and return the result as a
+# string.
+#
+# Both NAME and this object must carry a true '_cstruct' entry; the name
+# is checked first, and a missing structure croaks.  The string produced
+# by format_name is post-processed by Text::BibTeX->_process_result using
+# the binmode and normalization settings stored on NAME (not on the
+# format object).
+sub apply
+{
+    my $self = shift;
+    my $name = shift;
+
+    my $name_struct = $name->{'_cstruct'};
+    croak "invalid Name object: no C structure"
+        unless $name_struct;
+
+    my $format_struct = $self->{'_cstruct'};
+    croak "invalid NameFormat object: no C structure"
+        unless $format_struct;
+
+    my $raw = format_name ($name_struct, $format_struct);
+
+    # Force scalar context so exactly one value is returned.
+    return scalar Text::BibTeX->_process_result($raw, $name->{binmode}, $name->{normalization});
+}
+
+=back
+
+=head1 EXAMPLES
+
+Although the process of splitting and formatting names may sound
+complicated and convoluted from reading the above (along with
+L<Text::BibTeX::Name>), it's actually quite simple. There are really
+only three steps to worry about: split the name (create a
+C<Text::BibTeX::Name> object), create and customize the format
+(C<Text::BibTeX::NameFormat> object), and apply the format to the name.
+
+The first step is covered in L<Text::BibTeX::Name>; here's a brief
+example:
+
+ $orig_name = 'Charles Louis Xavier Joseph de la Vall{\'e}e Poussin';
+ $name = Text::BibTeX::Name->new($orig_name);
+
+The various parts of the name can now be accessed through
+C<Text::BibTeX::Name> methods; for instance C<$name-E<gt>part('von')>
+returns the list C<("de","la")>.
+
+Creating the name format is equally simple:
+
+ $format = Text::BibTeX::NameFormat->new('vljf', 1);
+
+creates a format that will print the name in "von last jr first" order,
+with the first name abbreviated. And for no extra charge, you get the
+right punctuation at the right place: a comma before any `jr' or `first'
+tokens, and periods after each `first' token.
+
+For instance, we can perform no further customization on this format,
+and apply it immediately to C<$name>. There are in fact two ways to do
+this, depending on whether you prefer to think of it in terms of
+"Applying the format to a name" or "formatting a name". The first is
+done with C<Text::BibTeX::NameFormat>'s C<apply> method:
+
+ $formatted_name = $format->apply ($name);
+
+while the second uses C<Text::BibTeX::Name>'s C<format> method:
+
+ $formatted_name = $name->format ($format);
+
+which is just a wrapper around C<Text::BibTeX::NameFormat::apply>. In
+either case, the result with the example name and format shown is
+
+ de~la Vall{\'e}e~Poussin, C.~L. X.~J.
+
+Note the strategic insertion of TeX "ties" (non-breakable spaces) at
+sensitive spots in the name. (The exact rules for insertion of
+discretionary ties are given in L<bt_format_names>.)
+
+=head1 SEE ALSO
+
+L<Text::BibTeX::Entry>, L<Text::BibTeX::Name>, L<bt_format_names>.
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
+
+=cut
+
+
+1;
+
diff --git a/lib/Text/BibTeX/Structure.pm b/lib/Text/BibTeX/Structure.pm
new file mode 100644
index 0000000..15d9b7d
--- /dev/null
+++ b/lib/Text/BibTeX/Structure.pm
@@ -0,0 +1,1206 @@
+# ----------------------------------------------------------------------
+# NAME : BibTeX/Structure.pm
+# CLASSES : Text::BibTeX::Structure, Text::BibTeX::StructuredEntry
+# RELATIONS :
+# DESCRIPTION: Provides the two base classes needed to implement
+# Text::BibTeX structure modules.
+# CREATED : in original form: Apr 1997
+# completely redone: Oct 1997
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::Structure;
+
+require 5.004; # for 'isa' and 'can'
+
+use strict;
+use Carp;
+
+use vars qw'$VERSION';
+$VERSION = 0.85;
+
+use Text::BibTeX ('check_class');
+
+=head1 NAME
+
+Text::BibTeX::Structure - provides base classes for user structure modules
+
+=head1 SYNOPSIS
+
+ # Define a 'Foo' structure for BibTeX databases: first, the
+ # structure class:
+
+ package Text::BibTeX::FooStructure;
+ @ISA = ('Text::BibTeX::Structure');
+
+ sub known_option
+ {
+ my ($self, $option) = @_;
+
+ ...
+ }
+
+ sub default_option
+ {
+ my ($self, $option) = @_;
+
+ ...
+ }
+
+ sub describe_entry
+ {
+ my $self = shift;
+
+ $self->set_fields ($type,
+ \@required_fields,
+ \@optional_fields,
+ [$constraint_1, $constraint_2, ...]);
+ ...
+ }
+
+
+ # Now, the structured entry class
+
+ package Text::BibTeX::FooEntry;
+ @ISA = ('Text::BibTeX::StructuredEntry');
+
+ # define whatever methods you like
+
+=head1 DESCRIPTION
+
+The module C<Text::BibTeX::Structure> provides two classes that form the
+basis of the B<btOOL> "structure module" system. This system is how
+database structures are defined and imposed on BibTeX files, and
+provides an elegant synthesis of object-oriented techniques with
+BibTeX-style database structures. Nothing described here is
+particularly deep or subtle; anyone familiar with object-oriented
+programming should be able to follow it. However, a fair bit of jargon
+is invented and tossed around, so pay attention.
+
+A I<database structure>, in B<btOOL> parlance, is just a set of allowed
+entry types and the rules for fields in each of those entry types.
+Currently, there are three kinds of rules that apply to fields: some
+fields are I<required>, meaning they must be present in every entry for
+a given type; some are I<optional>, meaning they may be present, and
+will be used if they are; other fields are members of I<constraint
+sets>, which are explained in L<"Field lists and constraint sets">
+below.
+
+A B<btOOL> structure is implemented with two classes: the I<structure
+class> and the I<structured entry class>. The former defines everything
+that applies to the structure as a whole (allowed types and field
+rules). The latter provides methods that operate on individual entries
+which conform (or are supposed to conform) to the structure. The two
+classes provided by the C<Text::BibTeX::Structure> module are
+C<Text::BibTeX::Structure> and C<Text::BibTeX::StructuredEntry>; these
+serve as base classes for, respectively, all structure classes and all
+structured entry classes. One canonical structure is provided as an
+example with B<btOOL>: the C<Bib> structure, which (via the
+C<BibStructure> and C<BibEntry> classes) provides the same functionality
+as the standard style files of BibTeX 0.99. It is hoped that other
+programmers will write new bibliography-related structures, possibly
+deriving from the C<Bib> structure, to emulate some of the functionality
+that is available through third-party BibTeX style files.
+
+The purpose of this manual page is to describe the whole "structure
+module" system. It is mainly for programmers wishing to implement a new
+database structure for data files with BibTeX syntax; if you are
+interested in the particular rules for the BibTeX-emulating C<Bib>
+structure, see L<Text::BibTeX::Bib>.
+
+Please note that the C<Text::BibTeX> prefix is dropped from most module
+and class names in this manual page, except where necessary.
+
+=head1 STRUCTURE CLASSES
+
+Structure classes have two roles: to define the list of allowed types
+and field rules, and to handle I<structure options>.
+
+=head2 Field lists and constraint sets
+
+Field lists and constraint sets define the database structure for a
+particular entry type: that is, they specify the rules which an entry
+must follow to conform to the structure (assuming that entry is of an
+allowed type). There are three components to the field rules for each
+entry type: a list of required fields, a list of optional fields, and
+I<field constraints>. Required and optional fields should be obvious to
+anyone with BibTeX experience: all required fields must be present, and
+any optional fields that are present have some meaning to the structure.
+(One could conceive of a "strict" interpretation, where any field not
+mentioned in the official definition is disallowed; this would be
+contrary to the open spirit of BibTeX databases, but could be useful in
+certain applications where a stricter level of control is desired.
+Currently, B<btOOL> does not offer such an option.)
+
+Field constraints capture the "one or the other, but not both" type of
+relationships present for some entry types in the BibTeX standard style
+files. Most BibTeX documentation glosses over the distinction between
+mutually constrained fields and required/optional fields. For instance,
+one of the standard entry types is C<book>, and "C<author> or C<editor>"
+is given in the list of required fields for that type. The meaning of
+this is that an entry of type C<book> must have I<either> the C<author>
+or C<editor> fields, but not both. Likewise, the "C<volume> or
+C<number>" are listed under the "optional fields" heading for C<book>
+entries; it would be more accurate to say that every C<book> entry may
+have one or the other, or neither, of C<volume> or C<number>---but not
+both.
+
+B<btOOL> attempts to clarify this situation by creating a third category
+of fields, those that are mutually constrained. For instance, neither
+C<author> nor C<editor> appears in the list of required fields for
+the C<inbook> type according to B<btOOL>; rather, a field constraint is
+created to express this relationship:
+
+ [1, 1, ['author', 'editor']]
+
+That is, a field constraint is a reference to a three-element list. The
+last element is a reference to the I<constraint set>, the list of fields
+to which the constraint applies. (Calling this a set is a bit
+inaccurate, as there are conditions in which the order of fields
+matters---see the C<check_field_constraints> method in L<"METHODS 2:
+BASE STRUCTURED ENTRY CLASS">.) The first two elements are the minimum
+and maximum number of fields from the constraint set that must be
+present for an entry to conform to the constraint. This constraint thus
+expresses that there must be exactly one (>= 1 and <= 1) of the fields
+C<author> and C<editor> in a C<book> entry.
+
+The "either one or neither, but not both" constraint that applies to the
+C<volume> and C<number> fields for C<book> entries is expressed slightly
+differently:
+
+ [0, 1, ['volume', 'number']]
+
+That is, either 0 or 1, but not the full 2, of C<volume> and C<number>
+may be present.
+
+It is important to note that checking and enforcing field constraints is
+based purely on counting which fields from a set are actually present;
+this mechanism can't capture "x must be present if y is" relationships.
+
+The requirements imposed on the actual structure class are simple: it
+must provide a method C<describe_entry> which sets up a fancy data
+structure describing the allowed entry types and all the field rules for
+those types. The C<Structure> class provides methods (inherited by a
+particular structure class) to help particular structure classes create
+this data structure in a consistent, controlled way. For instance, the
+C<describe_entry> method in the BibTeX 0.99-emulating
+C<BibStructure> class is quite simple:
+
+ sub describe_entry
+ {
+ my $self = shift;
+
+ # series of 13 calls to $self->set_fields (one for each standard
+ # entry type)
+ }
+
+One of those calls to the C<set_fields> method defines the rules for
+C<book> entries:
+
+ $self->set_fields ('book',
+ [qw(title publisher year)],
+ [qw(series address edition month note)],
+ [1, 1, [qw(author editor)]],
+ [0, 1, [qw(volume number)]]);
+
+The first field list is the list of required fields, and the second is
+the list of optional fields. Any number of field constraints may follow
+the list of optional fields; in this case, there are two, one for each
+of the constraints (C<author>/C<editor> and C<volume>/C<number>)
+described above. At no point is a list of allowed types explicitly
+supplied; rather, each call to C<set_fields> adds one more allowed type.
+
+New structure modules that derive from existing ones will probably use the
+C<add_fields> method (and possibly C<add_constraints>) to augment an
+existing entry type. Adding new types should be done with C<set_fields>,
+though.
+
+=head2 Structure options
+
+The other responsibility of structure classes is to handle I<structure
+options>. These are scalar values that let the user customize the
+behaviour of both the structure class and the structured entry class.
+For instance, one could have an option to enable "extended structure",
+which might add on a bunch of new entry types and new fields. (In this
+case, the C<describe_entry> method would have to pay attention to this
+option and modify its behaviour accordingly.) Or, one could have
+options to control how the structured entry class sorts or formats
+entries (for bibliography structures such as C<Bib>).
+
+The easy way to handle structure options is to provide two methods,
+C<known_option> and C<default_option>. These return, respectively,
+whether a given option is supported, and what its default value is. (If
+your structure doesn't support any options, you can just inherit these
+methods from the C<Structure> class. The default C<known_option>
+returns false for all options, and its companion C<default_option>
+crashes with an "unknown option" error.)
+
+Once C<known_option> and C<default_option> are provided, the structure
+class can sit back and inherit the more visible C<set_options> and
+C<get_options> methods from the C<Structure> class. These are the
+methods actually used to modify/query options, and will be used by
+application programs to customize the structure module's behaviour, and
+by the structure module itself to pay attention to the user's wishes.
+
+Options should generally have pure string values, so that the generic
+set_options method doesn't have to parse user-supplied strings into some
+complicated structure. However, C<set_options> will take any scalar
+value, so if the structure module clearly documents its requirements,
+the application program could supply a structure that meets its needs.
+Keep in mind that this requires cooperation between the application and
+the structure module; the intermediary code in
+C<Text::BibTeX::Structure> knows nothing about the format or syntax of
+your structure's options, and whatever scalar the application passes via
+C<set_options> will be stored for your module to retrieve via
+C<get_options>.
+
+As an example, the C<Bib> structure supports a number of "markup"
+options that allow applications to control the markup language used for
+formatting bibliographic entries. These options are naturally paired,
+as formatting commands in markup languages generally have to be turned
+on and off. The C<Bib> structure thus expects references to two-element
+lists for markup options; to specify LaTeX 2e-style emphasis for book
+titles, an application such as C<btformat> would set the C<btitle_mkup>
+option as follows:
+
+ $structure->set_options (btitle_mkup => ['\emph{', '}']);
+
+Other options for other structures might have a more complicated
+structure, but it's up to the structure class to document and enforce
+this.
+
+=head1 STRUCTURED ENTRY CLASSES
+
+A I<structured entry class> defines the behaviour of individual entries
+under the regime of a particular database structure. This is the
+I<raison d'E<ecirc>tre> for any database structure: the structure class
+merely lays out the rules for entries to conform to the structure, but
+the structured entry class provides the methods that actually operate on
+individual entries. Because this is completely open-ended, the
+requirements of a structured entry class are much less rigid than for a
+structure class. In fact, all of the requirements of a structured entry
+class can be met simply by inheriting from
+C<Text::BibTeX::StructuredEntry>, the other class provided by the
+C<Text::BibTeX::Structure> module. (For the record, those requirements
+are: a structured entry class must provide the entry
+parse/query/manipulate methods of the C<Entry> class, and it must
+provide the C<check>, C<coerce>, and C<silently_coerce> methods of the
+C<StructuredEntry> class. Since C<StructuredEntry> inherits from
+C<Entry>, both of these requirements are met "for free" by structured
+entry classes that inherit from C<Text::BibTeX::StructuredEntry>, so
+naturally this is the recommended course of action!)
+
+There are deliberately no other methods required of structured entry
+classes. A particular application (eg. C<btformat> for bibliography
+structures) will require certain methods, but it's up to the application
+and the structure module to work out the requirements through
+documentation.
+
+=head1 CLASS INTERACTIONS
+
+Imposing a database structure on your entries sets off a chain reaction
+of interactions between various classes in the C<Text::BibTeX> library
+that should be transparent when all goes well. It could prove confusing
+if things go wrong and you have to go wading through several levels of
+application program, core C<Text::BibTeX> classes, and some structure
+module.
+
+The justification for this complicated behaviour is that it allows you
+to write programs that will use a particular structure module without
+knowing the name of the structure when you write the program. Thus, the
+user can supply a database structure, and ultimately the entry objects
+you manipulate will be blessed into a class supplied by the structure
+module. A short example will illustrate this.
+
+Typically, a C<Text::BibTeX>-based program is based around a kernel of
+code like this:
+
+ $bibfile = Text::BibTeX::File->new("foo.bib");
+ while ($entry = Text::BibTeX::Entry->new($bibfile))
+ {
+ # process $entry
+ }
+
+In this case, nothing fancy is happening behind the scenes: the
+C<$bibfile> object is blessed into the C<Text::BibTeX::File> class, and
+C<$entry> is blessed into C<Text::BibTeX::Entry>. This is the
+conventional behaviour of Perl classes, but it is not the only possible
+behaviour. Let us now suppose that C<$bibfile> is expected to conform
+to a database structure specified by C<$structure> (presumably a
+user-supplied value, and thus unknown at compile-time):
+
+ $bibfile = Text::BibTeX::File->new("foo.bib");
+ $bibfile->set_structure ($structure);
+ while ($entry = Text::BibTeX::Entry->new($bibfile))
+ {
+ # process $entry
+ }
+
+A lot happens behind the scenes with the call to C<$bibfile>'s
+C<set_structure> method. First, a new structure object is created from
+C<$structure>. The structure name implies the name of a Perl
+module---the structure module---which is C<require>'d by the
+C<Structure> constructor. (The main consequence of this is that any
+compile-time errors in your structure module will not be revealed until
+a C<Text::BibTeX::File::set_structure> or
+C<Text::BibTeX::Structure::new> call attempts to load it.)
+
+Recall that the first responsibility of a structure module is to define
+a structure class. The "structure object" created by the
+C<set_structure> method call is actually an object of this class; this
+is the first bit of trickery---the structure object (buried behind the
+scenes) is blessed into a class whose name is not known until run-time.
+
+Now, the behaviour of the C<Text::BibTeX::Entry::new> constructor
+changes subtly: rather than returning an object blessed into the
+C<Text::BibTeX::Entry> class as you might expect from the code, the
+object is blessed into the structured entry class associated with
+C<$structure>.
+
+For example, if the value of C<$structure> is C<"Foo">, that means the
+user has supplied a module implementing the C<Foo> structure.
+(Ordinarily, this module would be called C<Text::BibTeX::Foo>---but you
+can customize this.) Calling the C<set_structure> method on C<$bibfile>
+will attempt to create a new structure object via the
+C<Text::BibTeX::Structure> constructor, which loads the structure module
+C<Text::BibTeX::Foo>. Once this module is successfully loaded, the new
+object is blessed into its structure class, which will presumably be
+called C<Text::BibTeX::FooStructure> (again, this is customizable). The
+new object is supplied with the user's structure options via the
+C<set_options> method (usually inherited), and then it is asked to
+describe the actual entry layout by calling its C<describe_entry>
+method. This, in turn, will usually call the inherited C<set_fields>
+method for each entry type in the database structure. When the
+C<Structure> constructor is finished, the new structure object is stored
+in the C<File> object (remember, we started all this by calling
+C<set_structure> on a C<File> object) for future reference.
+
+Then, when a new C<Entry> object is created and parsed from that
+particular C<File> object, some more trickery happens. Trivially, the
+structure object stored in the C<File> object is also stored in the
+C<Entry> object. (The idea is that entries could belong to a database
+structure independently of any file, but usually they will just get the
+structure that was assigned to their database file.) More importantly,
+the new C<Entry> object is re-blessed into the structured entry class
+supplied by the structure module---presumably, in this case,
+C<Text::BibTeX::FooEntry> (also customizable).
+
+Once all this sleight-of-hand is accomplished, the application may treat
+its entry objects as objects of the structured entry class for the
+C<Foo> structure---they may call the check/coerce methods inherited from
+C<Text::BibTeX::StructuredEntry>, and they may also call any methods
+specific to entries for this particular database structure. What these
+methods might be is up to the structure implementor to decide and
+document; thus, applications may be specific to one particular database
+structure, or they may work on all structures that supply certain
+methods. The choice is up to the application developer, and the range
+of options open to him depends on which methods structure implementors
+provide.
+
+=head1 EXAMPLE
+
+For example code, please refer to the source of the C<Bib> module and
+the C<btcheck>, C<btsort>, and C<btformat> applications supplied with
+C<Text::BibTeX>.
+
+=head1 METHODS 1: BASE STRUCTURE CLASS
+
+The first class provided by the C<Text::BibTeX::Structure> module is
+C<Text::BibTeX::Structure>. This class is intended to provide methods
+that will be inherited by user-supplied structure classes; such classes
+should not override any of the methods described here (except
+C<known_option> and C<default_option>) without very good reason.
+Furthermore, overriding the C<new> method would be useless, because in
+general applications won't know the name of your structure class---they
+can only call C<Text::BibTeX::Structure::new> (usually via
+C<Text::BibTeX::File::set_structure>).
+
+Finally, there are three methods that structure classes should
+implement: C<known_option>, C<default_option>, and C<describe_entry>.
+The first two are described in L<"Structure options"> above, the latter
+in L<"Field lists and constraint sets">. Note that C<describe_entry>
+depends heavily on the C<set_fields>, C<add_fields>, and
+C<add_constraints> methods described here.
+
+=head2 Constructor/simple query methods
+
+=over 4
+
+=item new (STRUCTURE, [OPTION =E<gt> VALUE, ...])
+
+Constructs a new structure object---I<not> a C<Text::BibTeX::Structure>
+object, but rather an object blessed into the structure class associated
+with STRUCTURE. More precisely:
+
+=over 4
+
+=item *
+
+Loads (with C<require>) the module implementing STRUCTURE. In the
+absence of other information, the module name is derived by appending
+STRUCTURE to C<"Text::BibTeX::">---thus, the module C<Text::BibTeX::Bib>
+implements the C<Bib> structure. Use the pseudo-option C<module> to
+override this module name. For instance, if the structure C<Foo> is
+implemented by the module C<Foo>:
+
+ $structure = Text::BibTeX::Structure->new
+ ('Foo', module => 'Foo');
+
+This method C<die>s if there are any errors loading/compiling the
+structure module.
+
+=item *
+
+Verifies that the structure module provides a structure class and a
+structured entry class. The structure class is named by appending
+C<"Structure"> to the name of the module, and the structured entry class
+by appending C<"Entry">. Thus, in the absence of a C<module> option,
+these two classes (for the C<Bib> structure) would be named
+C<Text::BibTeX::BibStructure> and C<Text::BibTeX::BibEntry>. Either or
+both of the default class names may be overridden by having the
+structure module return a reference to a hash (as opposed to the
+traditional C<1> returned by modules). This hash could then supply a
+C<structure_class> element to name the structure class, and an
+C<entry_class> element to name the structured entry class.
+
+Apart from ensuring that the two classes actually exist, C<new> verifies
+that they inherit correctly (from C<Text::BibTeX::Structure> and
+C<Text::BibTeX::StructuredEntry> respectively), and that the structure
+class provides the required C<known_option>, C<default_option>, and
+C<describe_entry> methods.
+
+=item *
+
+Creates the new structure object, and blesses it into the structure
+class. Supplies it with options by passing all (OPTION, VALUE) pairs to
+its C<set_options> method. Calls its C<describe_entry> method, which
+should list the field requirements for all entry types recognized by
+this structure. C<describe_entry> will most likely use some or all of
+the C<set_fields>, C<add_fields>, and C<add_constraints>
+methods---described below---for this.
+
+=back
+
+=cut
+
+sub new
+{
+   # Constructor for structure objects.
+   #
+   # - $type is presumably "Text::BibTeX::Structure" (if called from
+   #   Text::BibTeX::File::set_structure), but shouldn't assume that; it
+   #   is not used to bless the result
+   # - $name is the name of the user-supplied structure; it also
+   #   determines the module we will attempt to load here, unless
+   #   a 'module' option is given in %options
+   # - %options is a mix of options recognized here (in particular
+   #   'module'), by Text::BibTeX::StructuredEntry (? 'check', 'coerce',
+   #   'warn' flags), and by the user structure classes
+   #
+   # Returns the new object, blessed into the user structure class.
+   # Dies if the module name is malformed or the module cannot be loaded.
+
+   my ($type, $name, %options) = @_;
+
+   my $module = (delete $options{'module'}) || ('Text::BibTeX::' . $name);
+
+   # $module is interpolated into a string eval below, and may come
+   # straight from a command line (see btcheck), so refuse anything that
+   # is not a plain package name before it can be executed as code.
+   die "Text::BibTeX::Structure: invalid module name \"$module\" for " .
+       "user structure \"$name\"\n"
+      unless $module =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;
+
+   my $module_info = eval "require $module";
+   die "Text::BibTeX::Structure: unable to load module \"$module\" for " .
+       "user structure \"$name\": $@\n"
+      if $@;
+
+   # A structure module may return a hash ref (instead of the usual 1) to
+   # override the default class names derived from the module name.
+   my ($structure_class, $entry_class);
+   if (ref $module_info eq 'HASH')
+   {
+      $structure_class = $module_info->{'structure_class'};
+      $entry_class = $module_info->{'entry_class'};
+   }
+   $structure_class ||= $module . 'Structure';
+   $entry_class ||= $module . 'Entry';
+
+   # check_class is defined elsewhere in this file; per the POD above it
+   # verifies existence, inheritance, and the listed required methods.
+   check_class ($structure_class, "user structure class",
+                'Text::BibTeX::Structure',
+                ['known_option', 'default_option', 'describe_entry']);
+   check_class ($entry_class, "user entry class",
+                'Text::BibTeX::StructuredEntry',
+                []);
+
+   my $self = bless {}, $structure_class;
+   $self->{entry_class} = $entry_class;
+   $self->{name} = $name;
+   $self->set_options (%options);       # these methods are both provided by
+   $self->describe_entry;               # the user structure class
+   $self;
+}
+
+
+=item name ()
+
+Returns the name of the structure described by the object.
+
+=item entry_class ()
+
+Returns the name of the structured entry class associated with this
+structure.
+
+=back
+
+=cut
+
+# Read-only accessor: the structure name stored by the constructor.
+sub name
+{
+   my ($self) = @_;
+   return $self->{'name'};
+}
+
+# Read-only accessor: the name of the structured entry class stored by
+# the constructor.
+sub entry_class
+{
+   my ($self) = @_;
+   return $self->{'entry_class'};
+}
+
+
+=head2 Field structure description methods
+
+=over 4
+
+=item add_constraints (TYPE, CONSTRAINT, ...)
+
+Adds one or more field constraints to the structure. A field constraint
+is specified as a reference to a three-element list; the last element is
+a reference to the list of fields affected, and the first two elements
+are the minimum and maximum number of fields from the constraint set
+allowed in an entry of type TYPE. See L<"Field lists and constraint
+sets"> for a full explanation of field constraints.
+
+=cut
+
+sub add_constraints
+{
+   # Register one or more constraint records ([MIN, MAX, [FIELD, ...]])
+   # for entries of type $type.  Croaks on a malformed record.  Returns
+   # whatever the final push returns (the new length of the type's
+   # constraint list).
+
+   my ($self, $type, @constraints) = @_;
+
+   # Validate every record before touching $self, so that a bad record
+   # cannot leave the structure half-updated.  The shape is checked
+   # *before* the record is dereferenced; otherwise a non-array-ref would
+   # die with a raw "not an ARRAY reference" error instead of our message.
+   foreach my $constraint (@constraints)
+   {
+      croak "add_constraints: constraint record must be a 3-element " .
+            "list, with the last element a list ref"
+         unless (ref $constraint eq 'ARRAY' &&
+                 @$constraint == 3 &&
+                 ref $constraint->[2] eq 'ARRAY');
+
+      my ($min, $max, $fields) = @$constraint;
+      croak "add_constraints: constraint record must have 0 <= 'min' " .
+            "<= 'max' <= length of field list"
+         unless ($min >= 0 && $max >= $min && $max <= @$fields);
+   }
+
+   # Map each constrained field back to the record that governs it ...
+   foreach my $constraint (@constraints)
+   {
+      $self->{fields}{$type}{$_} = $constraint foreach @{$constraint->[2]};
+   }
+
+   # ... and append the records to the type's constraint list.  (With no
+   # constraints this still creates the empty list for $type.)
+   push (@{$self->{fieldgroups}{$type}{'constraints'}}, @constraints);
+
+} # add_constraints
+
+
+=item add_fields (TYPE, REQUIRED [, OPTIONAL [, CONSTRAINT, ...]])
+
+Adds fields to the required/optional lists for entries of type TYPE.
+Can also add field constraints, but you can just as easily use
+C<add_constraints> for that.
+
+REQUIRED and OPTIONAL, if defined, should be references to lists of
+fields to add to the respective field lists. The CONSTRAINTs, if given,
+are exactly as described for C<add_constraints> above.
+
+=cut
+
+sub add_fields                         # add fields for a particular type
+{
+   # Append to the required/optional field lists of $type, then hand any
+   # constraint records to add_constraints.  Either list may be omitted
+   # (false), in which case that group is left untouched.
+
+   my ($self, $type, $required, $optional, @constraints) = @_;
+
+   # to be really robust and inheritance-friendly, we should:
+   #   - check that no field is in > 1 list (just check $self->{fields}
+   #     before we start assigning stuff)
+   #   - allow sub-classes to delete fields or move them to another group
+
+   my %incoming = ('required' => $required, 'optional' => $optional);
+
+   foreach my $group ('required', 'optional')
+   {
+      my $list = $incoming{$group};
+      next unless $list;
+
+      push (@{$self->{fieldgroups}{$type}{$group}}, @$list);
+      $self->{fields}{$type}{$_} = $group foreach @$list;
+   }
+
+   $self->add_constraints ($type, @constraints);
+
+} # add_fields
+
+
+=item set_fields (TYPE, REQUIRED [, OPTIONAL [, CONSTRAINTS, ...]])
+
+Sets the lists of required/optional fields for entries of type TYPE.
+Identical to C<add_fields>, except that the field lists and list of
+constraints are set from scratch here, rather than being added to.
+
+=back
+
+=cut
+
+sub set_fields
+{
+   # Replace (rather than extend) the required/optional field lists and
+   # the constraint list for entries of type $type.  $required and
+   # $optional are array refs or false; @constraints is as for
+   # add_constraints.
+
+   my ($self, $type, $required, $optional, @constraints) = @_;
+
+   # Start from scratch: forget everything previously known about $type.
+   $self->{fields}{$type} = {};
+
+   # Store *copies* of the caller's lists.  Keeping the caller's own array
+   # refs would let a later add_fields push onto the caller's arrays.  An
+   # omitted list resets that group to empty, so no stale list from an
+   # earlier call survives alongside the cleared field map.
+   my @required = $required ? @$required : ();
+   my @optional = $optional ? @$optional : ();
+
+   $self->{fieldgroups}{$type}{'required'} = \@required;
+   $self->{fields}{$type}{$_} = 'required' foreach @required;
+
+   $self->{fieldgroups}{$type}{'optional'} = \@optional;
+   $self->{fields}{$type}{$_} = 'optional' foreach @optional;
+
+   $self->{fieldgroups}{$type}{'constraints'} = [];
+   $self->add_constraints ($type, @constraints);
+
+} # set_fields
+
+
+=head2 Field structure query methods
+
+=over 4
+
+=item types ()
+
+Returns the list of entry types supported by the structure.
+
+=item known_type (TYPE)
+
+Returns true if TYPE is a supported entry type.
+
+=item known_field (TYPE, FIELD)
+
+Returns true if FIELD is in the required list, optional list, or one of
+the constraint sets for entries of type TYPE.
+
+=item required_fields (TYPE)
+
+Returns the list of required fields for entries of type TYPE.
+
+=item optional_fields (TYPE)
+
+Returns the list of optional fields for entries of type TYPE.
+
+=item field_constraints (TYPE)
+
+Returns the list of field constraints (in the format supplied to
+C<add_constraints>) for entries of type TYPE.
+
+=back
+
+=cut
+
+sub types
+{
+   # The entry types described so far are exactly the keys of the
+   # 'fieldgroups' hash.
+   my ($self) = @_;
+
+   return keys %{$self->{'fieldgroups'}};
+}
+
+sub known_type
+{
+   # True if some field group has been recorded for $type.
+   my $self = shift;
+   my $type = shift;
+
+   return exists $self->{'fieldgroups'}{$type};
+}
+
+sub _check_type
+{
+   # Internal guard used by the query methods: croak (from the caller's
+   # perspective) unless $type has been described for this structure.
+   my $self = shift;
+   my $type = shift;
+
+   my $known = exists $self->{'fieldgroups'}{$type};
+   croak "unknown entry type \"$type\" for $self->{'name'} structure"
+      unless $known;
+   return $known;
+}
+
+sub known_field
+{
+   # Look up how $field is classified for entries of type $type.
+   # Croaks (via _check_type) if $type is unknown.
+   my ($self, $type, $field) = @_;
+
+   $self->_check_type ($type);
+
+   # either 'required', 'optional', or a constraint record (or undef!)
+   my $status = $self->{'fields'}{$type}{$field};
+   return $status;
+}
+
+sub required_fields
+{
+   # Return the list of required fields for entries of type $type.
+   # Croaks (via _check_type) if $type is unknown.
+   my ($self, $type) = @_;
+
+   $self->_check_type ($type);
+
+   # A type described without a required list (e.g. add_fields called
+   # with an undefined REQUIRED) has no 'required' entry at all; fall back
+   # to an empty list rather than dereferencing undef, which is fatal
+   # under "strict refs".
+   @{$self->{'fieldgroups'}{$type}{'required'} || []};
+}
+
+sub optional_fields
+{
+   # Return the list of optional fields for entries of type $type.
+   # Croaks (via _check_type) if $type is unknown.
+   my ($self, $type) = @_;
+
+   $self->_check_type ($type);
+
+   # A type described without an optional list has no 'optional' entry;
+   # fall back to an empty list rather than dereferencing undef, which is
+   # fatal under "strict refs".
+   @{$self->{'fieldgroups'}{$type}{'optional'} || []};
+}
+
+sub field_constraints
+{
+   # Return the constraint records (as given to add_constraints) for
+   # entries of type $type.  Croaks (via _check_type) if $type is unknown.
+   my $self = shift;
+   my $type = shift;
+
+   $self->_check_type ($type);
+
+   my $records = $self->{'fieldgroups'}{$type}{'constraints'};
+   return @$records;
+}
+
+
+=head2 Option methods
+
+=over 4
+
+=item known_option (OPTION)
+
+Returns false. This is mainly for the use of derived structures that
+don't have any options, and thus don't need to provide their own
+C<known_option> method. Structures that actually offer options should
+override this method; it should return true if OPTION is a supported
+option.
+
+=cut
+
+sub known_option
+{
+   # Base-class default: no option is recognized.  Structure classes that
+   # offer options override this.
+   my ($self, $option) = @_;
+
+   return 0;
+}
+
+
+=item default_option (OPTION)
+
+Crashes with an "unknown option" message. Again, this is mainly for use
+by derived structure classes that don't actually offer any options.
+Structures that handle options should override this method; every option
+handled by C<known_option> should have a default value (which might just
+be C<undef>) that is returned by C<default_option>. Your
+C<default_option> method should crash on an unknown option, perhaps by
+calling C<SUPER::default_option> (in order to ensure consistent error
+messages). For example:
+
+ sub default_option
+ {
+ my ($self, $option) = @_;
+ return $default_options{$option}
+ if exists $default_options{$option};
+ $self->SUPER::default_option ($option); # crash
+ }
+
+The default value for an option is returned by C<get_options> when that
+option has not been explicitly set with C<set_options>.
+
+=cut
+
+sub default_option
+{
+   # Base-class default: there are no options, hence no defaults, so
+   # every request is an error reported from the caller's perspective.
+   my $self   = shift;
+   my $option = shift;
+   my $structure_name = $self->{'name'};
+
+   croak "unknown option \"$option\" for structure \"$structure_name\"";
+}
+
+
+=item set_options (OPTION =E<gt> VALUE, ...)
+
+Sets one or more option values. (You can supply as many
+C<OPTION =E<gt> VALUE> pairs as you like, just so long as there are an even
+number of arguments.) Each OPTION must be handled by the structure
+module (as indicated by the C<known_option> method); if not,
+C<set_options> will C<croak>. Each VALUE may be any scalar value; it's
+up to the structure module to validate them.
+
+=cut
+
+sub set_options
+{
+   # Store OPTION => VALUE pairs in $self->{'options'}.  Croaks on an odd
+   # number of arguments, or on the first option the structure's
+   # known_option method rejects (pairs before it have already been
+   # stored by then).
+   my ($self, @pairs) = @_;
+
+   croak "must supply an even number of arguments (option/value pairs)"
+      if @pairs % 2 != 0;
+
+   for (my $i = 0; $i < @pairs; $i += 2)
+   {
+      my ($option, $value) = @pairs[$i, $i + 1];
+      croak "unknown option \"$option\" for structure \"$self->{'name'}\""
+         unless $self->known_option ($option);
+      $self->{'options'}{$option} = $value;
+   }
+}
+
+
+=item get_options (OPTION, ...)
+
+Returns the value(s) of one or more options. Any OPTION that has not
+been set by C<set_options> will return its default value, fetched using
+the C<default_option> method. If OPTION is not supported by the
+structure module, then your program either already crashed (when it
+tried to set it with C<set_options>), or it will crash here (thanks to
+calling C<default_option>). In scalar context, only the value of the
+first OPTION is returned.
+
+=back
+
+=cut
+
+sub get_options
+{
+   # Look up each requested option: an explicitly set value wins (even if
+   # it is undef); otherwise the structure's default_option method is
+   # asked.  Returns all values in list context, the first one in scalar
+   # context.
+   my ($self, @wanted) = @_;
+
+   my $set = $self->{'options'};
+   my @values;
+
+   foreach my $option (@wanted)
+   {
+      if (exists $set->{$option})
+      {
+         push (@values, $set->{$option});
+      }
+      else
+      {
+         my $default = $self->default_option ($option);
+         push (@values, $default);
+      }
+   }
+
+   return wantarray ? @values : $values[0];
+}
+
+
+
+# ----------------------------------------------------------------------
+# Text::BibTeX::StructuredEntry methods dealing with entry structure
+
+package Text::BibTeX::StructuredEntry;
+use strict;
+use vars qw(@ISA $VERSION);
+$VERSION = 0.85;
+
+use Carp;
+
+@ISA = ('Text::BibTeX::Entry');
+use Text::BibTeX qw(:metatypes display_list);
+
+=head1 METHODS 2: BASE STRUCTURED ENTRY CLASS
+
+The other class provided by the C<Structure> module is
+C<StructuredEntry>, the base class for all structured entry classes.
+This class inherits from C<Entry>, so all of its entry
+query/manipulation methods are available. C<StructuredEntry> adds
+methods for checking that an entry conforms to the database structure
+defined by a structure class.
+
+It only makes sense for C<StructuredEntry> to be used as a base class;
+you would never create standalone C<StructuredEntry> objects. The
+superficial reason for this is that only particular structured-entry
+classes have an actual structure class associated with them;
+C<StructuredEntry> on its own doesn't have any information about allowed
+types, required fields, field constraints, and so on. For a deeper
+understanding, consult L<"CLASS INTERACTIONS"> above.
+
+Since C<StructuredEntry> derives from C<Entry>, it naturally operates on
+BibTeX entries. Hence, the following descriptions refer to "the
+entry"---this is just the object (entry) being operated on. Note that
+these methods are presented in bottom-up order, meaning that the methods
+you're most likely to actually use---C<check>, C<coerce>, and
+C<silently_coerce>---are at the bottom. On a first reading, you'll
+probably want to skip down to them for a quick summary.
+
+=over 4
+
+=item structure ()
+
+Returns the object that defines the structure to which the entry is
+supposed to conform. This will be an instantiation of some structure
+class, and exists mainly so the check/coerce methods can query the
+structure about the types and fields it recognizes. If, for some
+reason, you wanted to query an entry's structure about the validity of
+type C<foo>, you might do this:
+
+ # assume $entry is an object of some structured entry class, i.e.
+ # it inherits from Text::BibTeX::StructuredEntry
+ $structure = $entry->structure;
+ $foo_known = $structure->known_type ('foo');
+
+=cut
+
+sub structure
+{
+   # Accessor for the structure object stored under the entry's
+   # 'structure' key.
+   my ($self) = @_;
+
+   return $self->{'structure'};
+}
+
+
+=item check_type ([WARN])
+
+Returns true if the entry has a valid type according to its structure.
+If WARN is true, then an invalid type results in a warning being
+printed.
+
+=cut
+
+sub check_type
+{
+   # Ask the entry's structure whether the entry's type is known.
+   # Returns 1 if so; otherwise (optionally) warns and returns 0.
+   my $self = shift;
+   my $warn = shift;
+
+   my $type = $self->{'type'};
+   return 1 if $self->{'structure'}->known_type ($type);
+
+   $self->warn ("unknown entry type \"$type\"") if $warn;
+   return 0;
+}
+
+
+=item check_required_fields ([WARN [, COERCE]])
+
+Checks that all required fields are present in the entry. If WARN is
+true, then a warning is printed for every missing field. If COERCE is
+true, then missing fields are set to the empty string.
+
+This isn't generally used by other code; see the C<check> and C<coerce>
+methods below.
+
+=cut
+
+sub check_required_fields
+{
+   # Walk the structure's required fields for this entry's type.  Each
+   # missing field counts as an error, is set to '' when $coerce is true,
+   # and is reported through the entry's warn method when $warn is true.
+   my ($self, $warn, $coerce) = @_;
+
+   my $missing = 0;
+
+   foreach my $field ($self->{'structure'}->required_fields ($self->type))
+   {
+      next if $self->exists ($field);
+
+      $missing++;
+      $self->set ($field, '') if $coerce;
+      next unless $warn;
+
+      my $message = "required field '$field' not present";
+      $message .= " (setting to empty string)" if $coerce;
+      $self->warn ($message);
+   }
+
+   # Coercion is always successful, so if $coerce is true return true.
+   # Otherwise, return true if no errors found.
+
+   return $coerce || ($missing == 0);
+
+} # check_required_fields
+
+
+=item check_field_constraints ([WARN [, COERCE]])
+
+Checks that the entry conforms to all of the field constraints imposed
+by its structure. Recall that a field constraint consists of a list of
+fields, and a minimum and maximum number of those fields that must be
+present in an entry. For each constraint, C<check_field_constraints>
+simply counts how many fields in the constraint's field set are present.
+If this count falls below the minimum or above the maximum for that
+constraint and WARN is true, a warning is issued. In general, this
+warning is of the form "between x and y of fields foo, bar, and baz must
+be present". The more common cases are handled specially to generate
+more useful and human-friendly warning messages.
+
+If COERCE is true, then the entry is modified to force it into
+conformance with all field constraints. How this is done depends on
+whether the violation is a matter of not enough fields present in the
+entry, or of too many fields present. In the former case, just enough
+fields are added (as empty strings) to meet the requirements of the
+constraint; in the latter case, fields are deleted. Which fields to add
+or delete is controlled by the order of fields in the constraint's field
+list.
+
+An example should clarify this. For instance, a field constraint
+specifying that exactly one of C<author> or C<editor> must appear in an
+entry would look like this:
+
+ [1, 1, ['author', 'editor']]
+
+Suppose the following entry is parsed and expected to conform to this
+structure:
+
+ @inbook{unknown:1997a,
+ title = "An Unattributed Book Chapter",
+ booktitle = "An Unedited Book",
+ publisher = "Foo, Bar \& Company",
+ year = 1997
+ }
+
+If C<check_field_constraints> is called on this entry with COERCE true
+(which is done by any of the C<full_check>, C<coerce>, and
+C<silently_coerce> methods), then the C<author> field is set to the
+empty string. (We go through the list of fields in the constraint's
+field set in order -- since C<author> is the first missing field, we
+supply it; with that done, the entry now conforms to the
+C<author>/C<editor> constraint, so we're done.)
+
+However, if the same structure was applied to this entry:
+
+ @inbook{smith:1997a,
+ author = "John Smith",
+ editor = "Fred Jones",
+ ...
+ }
+
+then the C<editor> field would be deleted. In this case, we allow the
+first field in the constraint's field list---C<author>. Since only one
+field from the set may be present, all fields after the first one are in
+violation, so they are deleted.
+
+Again, this method isn't generally used by other code; rather, it is
+called by C<full_check> and its friends below.
+
+=cut
+
+sub check_field_constraints
+{
+   # Check the entry against every field constraint ([MIN, MAX, FIELDS])
+   # its structure defines for the entry's type.
+   #   $warn   - report each violated constraint via the entry's warn method
+   #   $coerce - repair each violation: add the first missing fields (as
+   #             empty strings) until MIN is met, or delete the present
+   #             fields beyond the first MAX
+   # Returns true if $coerce is true or no constraint was violated.
+
+   my ($self, $warn, $coerce) = @_;
+
+   my $num_errors = 0;
+
+   foreach my $constraint ($self->{'structure'}->field_constraints ($self->type))
+   {
+      my ($min, $max, $fields) = @$constraint;
+
+      # Split the constraint's fields into those the entry has and those
+      # it lacks, keeping the constraint's own ordering.  Coercion has to
+      # work from these lists: slicing @$fields by position would assume
+      # the present fields are exactly the leading ones, and so could
+      # blank a field that exists or "delete" one that doesn't.
+      my (@present, @missing);
+      foreach my $field (@$fields)
+      {
+         if ($self->exists ($field)) { push (@present, $field) }
+         else                        { push (@missing, $field) }
+      }
+      my $num_seen = @present;
+
+      next unless ($num_seen < $min || $num_seen > $max);
+
+      my $warning;
+      if ($warn)
+      {
+         if ($min == 0 && $max > 0)
+         {
+            $warning = sprintf ("at most %d of fields %s may be present",
+                                $max, display_list ($fields, 1));
+         }
+         elsif ($min < @$fields && $max == @$fields)
+         {
+            $warning = sprintf ("at least %d of fields %s must be present",
+                                $min, display_list ($fields, 1));
+         }
+         elsif ($min == $max)
+         {
+            $warning = sprintf ("exactly %d of fields %s %s be present",
+                                $min, display_list ($fields, 1),
+                                ($num_seen < $min) ? "must" : "may");
+         }
+         else
+         {
+            $warning = sprintf ("between %d and %d of fields %s " .
+                                "must be present",
+                                $min, $max, display_list ($fields, 1))
+         }
+      }
+
+      if ($coerce)
+      {
+         if ($num_seen < $min)
+         {
+            # Supply just enough of the missing fields, in list order.
+            # (add_constraints guarantees MIN <= number of fields, so
+            # there are always enough missing ones to choose from.)
+            my @blank = @missing[0 .. ($min - $num_seen - 1)];
+            $warning .= sprintf (" (setting %s to empty string)",
+                                 display_list (\@blank, 1))
+               if $warn;
+            $self->set (map (($_, ''), @blank));
+         }
+         elsif ($num_seen > $max)
+         {
+            # Keep the first MAX fields actually present; drop the rest.
+            my @delete = @present[$max .. $#present];
+            $warning .= sprintf (" (deleting %s)",
+                                 display_list (\@delete, 1))
+               if $warn;
+            $self->delete (@delete);
+         }
+      } # if $coerce
+
+      $self->warn ($warning) if $warn;
+      $num_errors++;
+
+   } # foreach $constraint
+
+   # Coercion is always successful, so if $coerce is true return true.
+   # Otherwise, return true if no errors found.
+
+   return $coerce || ($num_errors == 0);
+
+} # check_field_constraints
+
+
+=item full_check ([WARN [, COERCE]])
+
+Returns true if an entry's type and fields are all valid. That is, it
+calls C<check_type>, C<check_required_fields>, and
+C<check_field_constraints>; if all of them return true, then so does
+C<full_check>. WARN and COERCE are simply passed on to the three
+C<check_*> methods: the first controls the printing of warnings, and the
+second decides whether we should modify the entry to force it into
+conformance.
+
+=cut
+
+sub full_check
+{
+   # Run the type, required-field, and field-constraint checks in turn,
+   # passing $warn and $coerce through.  The field checks are skipped if
+   # the type check fails, and the constraint check is skipped if the
+   # required-field check returns false.
+   my ($self, $warn, $coerce) = @_;
+
+   # Entries whose metatype is not BTE_REGULAR are accepted as they are.
+   if ($self->metatype != &BTE_REGULAR)
+   {
+      return 1;
+   }
+
+   if (! $self->check_type ($warn))
+   {
+      return;
+   }
+
+   my $required_ok = $self->check_required_fields ($warn, $coerce);
+   return $required_ok unless $required_ok;
+
+   return $self->check_field_constraints ($warn, $coerce);
+}
+
+
+# Front ends for full_check -- there are actually four possible wrappers,
+# but having both $warn and $coerce false is pointless.
+
+=item check ()
+
+Checks that the entry conforms to the requirements of its associated
+database structure: the type must be known, all required fields must be
+present, and all field constraints must be met. See C<check_type>,
+C<check_required_fields>, and C<check_field_constraints> for details.
+
+Calling C<check> is the same as calling C<full_check> with WARN true and
+COERCE false.
+
+=item coerce ()
+
+Same as C<check>, except entries are coerced into conformance with the
+database structure---that is, it's just like C<full_check> with both
+WARN and COERCE true.
+
+=item silently_coerce ()
+
+Same as C<coerce>, except warnings aren't printed---that is, it's just
+like C<full_check> with WARN false and COERCE true.
+
+=back
+
+=cut
+
+# check: full_check with WARN true, COERCE false.
+sub check
+{
+   my ($self) = @_;
+   return $self->full_check (1, 0);
+}
+
+# coerce: full_check with both WARN and COERCE true.
+sub coerce
+{
+   my ($self) = @_;
+   return $self->full_check (1, 1);
+}
+
+# silently_coerce: full_check with WARN false, COERCE true.
+sub silently_coerce
+{
+   my ($self) = @_;
+   return $self->full_check (0, 1);
+}
+
+1;
+
+=head1 SEE ALSO
+
+L<Text::BibTeX>, L<Text::BibTeX::Entry>, L<Text::BibTeX::File>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
diff --git a/lib/Text/BibTeX/Value.pm b/lib/Text/BibTeX/Value.pm
new file mode 100644
index 0000000..a904aab
--- /dev/null
+++ b/lib/Text/BibTeX/Value.pm
@@ -0,0 +1,333 @@
+# ----------------------------------------------------------------------
+# NAME : Text::BibTeX::Value
+# CLASSES : Text::BibTeX::Value, Text::BibTeX::SimpleValue
+# RELATIONS :
+# DESCRIPTION: Provides interfaces to BibTeX values (list of simple
+# values) and simple values (string/macro/number).
+# CREATED : 1998/03/12, Greg Ward
+# MODIFIED :
+# VERSION : $Id$
+# COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights
+# reserved.
+#
+# This file is part of the Text::BibTeX library. This
+# library is free software; you may redistribute it and/or
+# modify it under the same terms as Perl itself.
+# ----------------------------------------------------------------------
+
+package Text::BibTeX::Value;
+
+use strict;
+use Scalar::Util 'blessed';
+use Carp;
+
+use vars qw'$VERSION';
+$VERSION = 0.85;
+
+=head1 NAME
+
+Text::BibTeX::Value - interfaces to BibTeX values and simple values
+
+=head1 SYNOPSIS
+
+ use Text::BibTeX;
+
+ $entry = Text::BibTeX::Entry->new;
+
+ # set the 'preserve_values' flag to 1 for this parse
+ $entry->parse ($filename, $filehandle, 1);
+
+ # 'get' method now returns a Text::BibTeX::Value object
+ # rather than a string
+ $value = $entry->get ($field);
+
+ # query the `Value' object (list of SimpleValue objects)
+ @all_values = $value->values;
+ $first_value = $value->value (0);
+ $last_value = $value->value (-1);
+
+ # query the simple value objects -- type will be one of BTAST_STRING,
+ # BTAST_MACRO, or BTAST_NUMBER
+ use Text::BibTeX (':nodetypes'); # import "node type" constants
+ $is_macro = ($first_value->type == BTAST_MACRO);
+ $text = $first_value->text;
+
+=head1 DESCRIPTION
+
+The C<Text::BibTeX::Value> module provides two classes,
+C<Text::BibTeX::Value> and C<Text::BibTeX::SimpleValue>, which respectively
+give you access to BibTeX "compound values" and "simple values". Recall
+that every field value in a BibTeX entry is the concatenation of one or
+more simple values, and that each of those simple values may be a literal
+string, a macro (abbreviation), or a number. Normally with
+C<Text::BibTeX>, field values are "fully processed," so that you only have
+access to the string that results from expanding macros, converting numbers
+to strings, concatenating all sub-strings, and collapsing whitespace in the
+resulting string.
+
+For example, in the following entry:
+
+ @article{homer97,
+ author = "Homer Simpson" # and # "Ned Flanders",
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = jss,
+ year = 1997
+ }
+
+we see the full range of options. The C<author> field consists of three
+simple values: a string, a macro (C<and>), and another string. The
+C<title> field is a single string, and the C<journal> and C<year> fields
+are, respectively, a single macro and a single number. If you parse
+this entry in the usual way:
+
+ $entry = Text::BibTeX::Entry->new($entry_text);
+
+then the C<get> method on C<$entry> would return simple strings.
+Assuming that the C<and> macro is defined as C<" and ">, then
+
+ $entry->get ('author')
+
+would return the Perl string C<"Homer Simpson and Ned Flanders">.
+
+However, you can also request that the library preserve the input values
+in your entries, i.e. not lose the information about which values use
+macros, which values are composed of multiple simple values, and so on.
+There are two ways to make this request: per-file and per-entry. For a
+per-file request, use the C<preserve_values> method on your C<File>
+object:
+
+ $bibfile = Text::BibTeX::File->new($filename);
+ $bibfile->preserve_values (1);
+
+ $entry = Text::BibTeX::Entry->new($bibfile);
+ $entry->get ($field); # returns a Value object
+
+ $bibfile->preserve_values (0);
+ $entry = Text::BibTeX::Entry->new($bibfile);
+ $entry->get ($field); # returns a string
+
+If you're not using a C<File> object, or want to control things at a
+finer scale, then you have to pass in the C<preserve_values> flag when
+invoking C<read>, C<parse>, or C<parse_s> on your C<Entry> objects:
+
+ # no File object, parsing from a string
+ $entry = Text::BibTeX::Entry->new;
+ $entry->parse_s ($entry_text, 0); # preserve_values=0 (default)
+ $entry->get ($field); # returns a string
+
+ $entry->parse_s ($entry_text, 1);
+ $entry->get ($field); # returns a Value object
+
+ # using a File object, but want finer control
+ $entry->read ($bibfile, 0); # now get will return strings (default)
+ $entry->read ($bibfile, 1); # now get will return Value objects
+
+A compound value, usually just called a value, is simply a list of
+simple values. The C<Text::BibTeX::Value> class (hereinafter
+abbreviated as C<Value>) provides a simple interface to this list; you
+can request the whole list, or an individual member of the list. The
+C<SimpleValue> class gives you access to the innards of each simple
+value, which consist of the I<type> and the I<text>. The type just
+tells you if this simple value is a string, macro, or number; it is
+represented using the Perl translation of the "node type" enumeration
+from C. The possible types are C<BTAST_STRING>, C<BTAST_NUMBER>, and
+C<BTAST_MACRO>. The text is just what appears in the original entry
+text, be it a string, number, or macro.
+
+For example, we could parse the above entry in "preserve values" mode as
+follows:
+
+ $entry->parse_s ($entry_text, 1); # preserve_values is 1
+
+Then, using the C<get> method on C<$entry> would return not a string,
+but a C<Value> object. We can get the list of all simple values using
+the C<values> method, or a single value using C<value>:
+
+ $author = $entry->get ('author'); # now a Text::BibTeX::Value object
+ @all_values = $author->values; # array of Text::BibTeX::SimpleValue
+ $second = $author->value (1); # same as $all_values[1]
+
+The simple values may be queried using the C<Text::BibTeX::SimpleValue>
+methods, C<type> and C<text>:
+
+ $all_values[0]->type; # returns BTAST_STRING
+ $second->type; # returns BTAST_MACRO
+
+ $all_values[0]->text; # "Homer Simpson"
+ $second->text; # "and" (NOT the macro expansion!)
+
+ $entry->get ('year')->value (0)->text; # "1997"
+
+=head1 METHODS
+
+Normally, you won't need to create C<Value> or C<SimpleValue>
+objects---they'll be created for you when an entry is parsed, and
+returned to you by the C<get> method in the C<Entry> class. Thus, the
+query methods (C<values> and C<value> for the C<Value> class, C<type>
+and C<text> for C<SimpleValue>) are probably all you need to worry
+about. If you wish, though, you can create new values and simple values
+using the two classes' respective constructors. You can also put
+newly-created C<Value> objects back into an existing C<Entry> object
+using the C<set> entry method; it doesn't matter how the entry was
+parsed, this is acceptable anytime.
+
+=head2 Text::BibTeX::Value methods
+
+=over 4
+
+=item new (SVAL, ...)
+
+Creates a new C<Value> object from a list of simple values. Each simple
+value, SVAL, may be either a C<SimpleValue> object or a reference to a
+two-element list containing the type and text of the simple value. For
+example, one way to recreate the C<author> field of the example entry in
+L<"DESCRIPTION"> would be:
+
+ $and_macro = Text::BibTeX::SimpleValue->new (BTAST_MACRO, 'and');
+ $value = Text::BibTeX::Value->new
+ ([BTAST_STRING, 'Homer Simpson'],
+ $and_macro,
+ [BTAST_STRING, 'Ned Flanders']);
+
+The resulting C<Value> object could then be installed into an entry
+using the C<set> method of the C<Entry> class.
+
+=cut
+
+sub new
+{
+   # Build a Value (a blessed array of SimpleValue objects) from a list
+   # of simple values.  Each argument is either a SimpleValue object or a
+   # two-element array ref, which is expanded into the arguments of
+   # Text::BibTeX::SimpleValue->new.  Croaks on anything else.
+   my ($class, @svals) = @_;
+
+   $class = ref $class || $class;
+   my $self = bless [], $class;
+
+   # Iterate over the arguments themselves rather than looping "while
+   # shift is true": that form stopped at the first false argument
+   # (undef, 0, '') and silently dropped everything after it, instead of
+   # rejecting the bad value.
+   foreach my $sval (@svals)
+   {
+      my $simple = (ref $sval eq 'ARRAY' && @$sval == 2)
+         ? Text::BibTeX::SimpleValue->new (@$sval)
+         : $sval;
+      croak "simple value is neither a two-element array ref " .
+            "nor a Text::BibTeX::SimpleValue object"
+         unless blessed ($simple) && $simple->isa ('Text::BibTeX::SimpleValue');
+      push (@$self, $simple);
+   }
+
+   $self;
+}
+
+=item values ()
+
+Returns the list of C<SimpleValue> objects that make up a C<Value> object.
+
+=item value (NUM)
+
+Returns the NUM'th C<SimpleValue> object from the list of C<SimpleValue>
+objects that make up a C<Value> object. This is just like a Perl array
+reference: NUM is zero-based, and negative numbers count from the end of
+the array.
+
+=back
+
+=cut
+
+# A Text::BibTeX::Value object is just an array ref; that array is a list
+# of Text::BibTeX::SimpleValue objects. Most of the real work for Value
+# and SimpleValue is done behind the scenes when an entry is parsed, in
+# BibTeX.xs and btxs_support.c.
+
+# values: all SimpleValue objects held by this Value (the object is
+# itself the array ref).
+sub values
+{
+   my ($self) = @_;
+   return @$self;
+}
+
+# value: the NUM'th SimpleValue, using ordinary Perl array indexing
+# (zero-based, negative counts from the end).
+sub value
+{
+   my ($self, $num) = @_;
+   return $self->[$num];
+}
+
+
+package Text::BibTeX::SimpleValue;
+
+use strict;
+use Carp;
+use Text::BibTeX qw(:nodetypes);
+
+use vars qw($VERSION);
+$VERSION = '0.85';
+
+
+=head2 Text::BibTeX::SimpleValue methods
+
+=over
+
+=item new (TYPE, TEXT)
+
+Creates a new C<SimpleValue> object with the specified TYPE and TEXT.
+TYPE must be one of the allowed types for BibTeX simple values,
+i.e. C<BTAST_STRING>, C<BTAST_NUMBER>, or C<BTAST_MACRO>. You'll
+probably want to import these constants from C<Text::BibTeX> using the
+C<nodetypes> export tag:
+
+ use Text::BibTeX qw(:nodetypes);
+
+TEXT may be any string. Note that if TYPE is C<BTAST_NUMBER> and TEXT
+is not a string of digits, the C<SimpleValue> object will be created
+anyways, but a warning will be issued. No warning is issued about
+non-existent macros.
+
+=cut
+
+sub new
+{
+   # Build a SimpleValue: a blessed two-element array of [TYPE, TEXT].
+   # Croaks if TYPE is not one of BTAST_STRING / BTAST_NUMBER /
+   # BTAST_MACRO, or if TEXT is undefined or a reference.  Warns (but
+   # still constructs the object) for a number whose text is not all
+   # digits.
+   my ($class, $type, $text) = @_;
+
+   # Test definedness first so an undefined TYPE is rejected cleanly
+   # rather than producing "uninitialized value" warnings on the way to
+   # the croak.
+   croak "invalid simple value type (" . (defined $type ? $type : '') . ")"
+      unless (defined $type &&
+              ($type == &BTAST_STRING ||
+               $type == &BTAST_NUMBER ||
+               $type == &BTAST_MACRO));
+   croak "invalid simple value text (must be a simple string or number)"
+      unless defined $text && ! ref $text;
+
+   # \A...\z rather than ^...$: "$" would also accept digits followed by
+   # a trailing newline as numeric.
+   carp "warning: creating a 'number' simple value with non-numeric text"
+      if $type == &BTAST_NUMBER && $text !~ /\A\d+\z/;
+
+   $class = ref $class || $class;
+   my $self = bless [undef, undef], $class;
+   $self->[0] = $type;
+   $self->[1] = $text;
+   $self;
+}
+
+
+=item type ()
+
+Returns the type of a simple value. This will be one of the allowed
+"node types" as described under L</new> above.
+
+=item text ()
+
+Returns the text of a simple value. This is just the text that appears
+in the original entry---unexpanded macro name, or unconverted number.
+(Of course, converting numbers doesn't make any difference from Perl; in
+fact, it's all the same in C too, since the C code just keeps numbers as
+strings of digits. It's simply a matter of whether the string of digits
+is represented as a string or a number, which you might be interested in
+knowing if you want to preserve the structure of the input as much as
+possible.)
+
+=back
+
+=cut
+
+# type: element 0 of the object, as stored by the constructor.
+sub type
+{
+   my ($self) = @_;
+   return $self->[0];
+}
+
+# text: element 1 of the object, as stored by the constructor.
+sub text
+{
+   my ($self) = @_;
+   return $self->[1];
+}
+
+1;
+
+=head1 SEE ALSO
+
+L<Text::BibTeX>, L<Text::BibTeX::File>, L<Text::BibTeX::Entry>
+
+=head1 AUTHOR
+
+Greg Ward <gward@python.net>
+
+=head1 COPYRIGHT
+
+Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved. This file
+is part of the Text::BibTeX library. This library is free software; you
+may redistribute it and/or modify it under the same terms as Perl itself.
+
+=cut
diff --git a/scripts/btcheck b/scripts/btcheck
new file mode 100755
index 0000000..d3a3eb0
--- /dev/null
+++ b/scripts/btcheck
@@ -0,0 +1,31 @@
+#!/usr/bin/perl -w
+
+#
+# btcheck
+#
+# Check the syntax and structure of a single BibTeX database file.
+# Currently hardcoded to use the "Bib" structure, which implements
+# exactly the structure of BibTeX 0.99.
+#
+# $Id$
+#
+
+use strict;
+use Text::BibTeX (':metatypes');
+
+die "usage: btcheck file [structure]\n" unless @ARGV == 1 || @ARGV == 2;
+my ($filename, $structure) = @ARGV;
+$structure ||= 'Bib';
+
+my $bibfile = Text::BibTeX::File->new ($filename) or die "$filename: $!\n";
+$bibfile->set_structure ($structure);
+
+# Check every successfully parsed regular entry, flagging repeated keys.
+my %seen_key;
+while (my $entry = Text::BibTeX::Entry->new ($bibfile))
+{
+   next unless $entry->parse_ok and $entry->metatype == BTE_REGULAR;
+   my $key = $entry->key;
+   $entry->warn ("repeated entry key \"$key\"") if $seen_key{$key}++;
+   $entry->check;
+}
diff --git a/scripts/btformat b/scripts/btformat
new file mode 100755
index 0000000..f6b6303
--- /dev/null
+++ b/scripts/btformat
@@ -0,0 +1,128 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Text::BibTeX;
+use Getopt::Tabular;
+
+# ----------------------------------------------------------------------
+# Command-line options and option table
+
+my @select; # list of citation keys
+my $markup = 'latex';
+my $open_bib = 0;
+
+# Default markups -- should be customizable
+my %markup =
+ (pre_entry => { latex => '\bibitem{%KEY%}' . "\n",
+ latex2e => '\bibitem{%KEY%}' . "\n",
+ html => '"[%LABEL%]"' },
+ inter_block => { latex => "\n\\newblock ",
+ latex2e => "\n\\newblock ",
+ html => $open_bib ? "<br>\n" : " " },
+ atitle => { latex => ['{\em ', '}'],
+ latex2e => ['\emph{', '}'],
+ html => ['<emph>', '</emph>'] },
+ btitle => { latex => ['{\em ', '}'],
+ latex2e => ['\emph{', '}'],
+ html => ['<emph>', '</emph>'] },
+ journal => { latex => ['{\em ', '}'],
+ latex2e => ['\emph{', '}'],
+ html => ['<emph>', '</emph>'] },
+ );
+
+my @opt_table =
+ (['-select', 'call', undef, sub { &get_list_arg (@_, \@select) },
+ 'list of entries to format (selected by citation key)',
+ 'key1 ...'],
+ ['-latex', 'const', 'latex', \$markup,
+ 'add LaTeX 2.09 markup to the bibliography entries'],
+ ['-latex2e', 'const', 'latex2e', \$markup,
+ 'add LaTeX 2e markup to the bibliography entries'],
+ ['-html', 'const', 'html', \$markup,
+ 'add HTML markup to the bibliography entries'],
+ ['-openbib|-closedbib', 'boolean', 0, \$open_bib,
+ 'use "open" bibliography format'],
+ );
+
+
+
+# ----------------------------------------------------------------------
+# Main program
+
+# First, parse the command line and make sure there's exactly one
+# argument (the .bib file to format) left.
+
+my $usage = "usage: btformat [options] bibfile\n";
+Getopt::Tabular::SetHelp ($usage, undef);
+GetOptions (\@opt_table, \@ARGV) || exit 1;
+
+die "$usage\nIncorrect number of arguments\n" unless (@ARGV == 1);
+
+$markup{inter_block}{html} = "<br>\n" if $open_bib;   # %markup was built before -openbib was parsed
+# OK, we're happy with the command-line -- let's start working for real
+my ($filename, $bibfile, $entry, %select);
+
+$filename = shift;
+$bibfile = Text::BibTeX::File->new( $filename) or die "$filename: $!\n";
+$bibfile->set_structure ('Bib', namestyle => 'nopunct', nameorder => 'first');
+
+%select = map { ($_ => 1) } @select
+ if @select;
+
+my $entry_num = 0;
+while ($entry = Text::BibTeX::Entry->new( $bibfile))
+{
+   next unless $entry->parse_ok && $entry->metatype == BTE_REGULAR;
+   next if (@select && ! $select{$entry->key});
+   $entry_num++;
+
+#  printf "formatting entry >%s<\n", $entry->key;
+   # Each block is a list of sentences; a sentence is either a plain
+   # string or a list of clauses.  Empty parts are filtered on the way.
+   my @blocks = grep ($_, $entry->format);      # strip empty blocks
+
+   BLOCK:
+   for my $block (@blocks)
+   {
+      SENTENCE:
+      for my $sentence (@$block)
+      {
+         # A missing sentence may be undef: normalise it to '' so the
+         # string tests below do not warn under -w, and move on.
+         ($sentence = '', next SENTENCE) unless defined $sentence;
+
+         # If sentence has multiple clauses, process them: first, strip
+         # out empties, and jump to the next sentence if it turns out
+         # this one is empty (ie. just a bunch of empty clauses).  Then
+         # join the left-over clauses with commas.
+         if (ref $sentence eq 'ARRAY')
+         {
+            @$sentence = grep ($_, @$sentence);
+            ($sentence = '', next SENTENCE) unless @$sentence;
+            $sentence = join (', ', @$sentence);
+         }
+
+         # finish sentence with a period if it's not already punctuated
+         $sentence .= '.' unless $sentence eq '' || $sentence =~ /[.!?]$/;
+      }
+
+      # Now join together all the sentences in the block, first stripping
+      # any empties.  A block with nothing left becomes the empty string
+      # rather than staying an array ref (which would print as "ARRAY(0x...)").
+      $block = join (' ', grep ($_, @$block));
+   }
+   @blocks = grep ($_ ne '', @blocks);          # drop blocks that emptied out
+
+   if (@blocks)
+   {
+      my $key    = $entry->key;
+      my $label  = $entry_num;                  # for now!
+      my $header = $markup{pre_entry}{$markup};
+      $header =~ s/%KEY%/$key/g;
+      $header =~ s/%LABEL%/$label/g;
+
+      print $header;
+      print join ($markup{inter_block}{$markup}, @blocks);
+      print "\n\n";
+   }
+}
diff --git a/scripts/btsort b/scripts/btsort
new file mode 100755
index 0000000..e2c02f8
--- /dev/null
+++ b/scripts/btsort
@@ -0,0 +1,33 @@
+#!/usr/bin/perl -w
+
+#
+# btsort
+#
+# Reads an entire BibTeX file, sorts the entries, and spits them back out
+# again.
+#
+# $Id$
+#
+
+use strict;
+use Text::BibTeX (':metatypes');
+
+my ($filename, $structure, @options, $bibfile, $entry, %sortkey, @entries);
+die "usage: btsort file [structure [options]]\n" unless @ARGV >= 1;
+($filename, $structure, @options) = @ARGV;
+$structure ||= 'Bib';
+
+$bibfile = Text::BibTeX::File->new( $filename) or die "$filename: $!\n";
+$bibfile->set_structure ($structure, @options);   # use the structure named on the command line
+# Collect every well-formed regular entry together with its sort key.
+while ($entry = Text::BibTeX::Entry->new( $bibfile))
+{
+   next unless $entry->parse_ok && $entry->metatype == BTE_REGULAR;
+   $entry->check;
+   $sortkey{$entry} = $entry->sort_key;            # keyed by the stringified entry reference
+   push (@entries, $entry);
+}
+$bibfile->close;
+# Print the entries back out in sort-key order.
+@entries = sort { $sortkey{$a} cmp $sortkey{$b} } @entries;
+$entry->print while $entry = shift @entries;
diff --git a/t/00_system_info.t b/t/00_system_info.t
new file mode 100644
index 0000000..4131599
--- /dev/null
+++ b/t/00_system_info.t
@@ -0,0 +1,12 @@
+#!perl
+
+use strict;
+use warnings;
+use Test::More tests => 1;
+
+if (open my $f, '<', "btparse/src/bt_config.h") {
+   while (<$f>) { diag $_ if /#define/; }       # report each #define line of the config header
+   close $f;
+}
+else { diag "cannot read btparse/src/bt_config.h: $!"; }   # informational only, never fails
+ok(1);
diff --git a/t/bib.t b/t/bib.t
new file mode 100644
index 0000000..9df8caa
--- /dev/null
+++ b/t/bib.t
@@ -0,0 +1,157 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use vars qw($DEBUG);
+use IO::Handle;
+use File::Temp qw(tempfile);
+
+use Test::More tests => 42;
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ use_ok('Text::BibTeX::Bib');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+$DEBUG = 1;
+
+# Basic test of the BibEntry classes (really, its base classes
+# BibFormat and BibSort)
+
+my $entries = <<'ENTRIES';
+@article{homer97,
+ author = {Simpson, Homer J. and Andr{\'e} de la Poobah},
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = {Journal of Suburban Studies},
+ volume = 4,
+ pages = "125--130",
+ year = 1997
+}
+
+@book{george98,
+ author = "George Simpson",
+ title = "How to Found a Big Department Store",
+ year = 1998,
+ month = feb
+}
+ENTRIES
+
+# (Currently) we have to go through a Text::BibTeX::File object to get
+# Entry objects blessed into a structured entry class, so start
+# by creating the file to parse.
+my ($fh, $fn) = tempfile("tmpXXXXX", SUFFIX => '.bib', UNLINK => 1);
+print {$fh} $entries;
+close $fh;
+
+# Open it as a Text::BibTeX::File object, set the structure class (which
+# controls the structured entry class of all entries parsed from that
+# file), and get the structure class (so we can set options on it).
+my $file = Text::BibTeX::File->new ($fn);
+$file->set_structure ('Bib');
+my $structure = $file->structure;
+
+# Read the two entries
+my $entry1 = Text::BibTeX::BibEntry->new( $file );
+my $entry2 = Text::BibTeX::BibEntry->new( $file );
+
+$file->close;
+#unlink ($fn) || warn "couldn't delete temporary file $fn: $!\n";
+
+# The default options of BibStructure are:
+# namestyle => 'full'
+# nameorder => 'first'
+# atitle_lower => 1 (true)
+# sortby => 'name'
+# Let's make sure these are respected.
+
+my @blocks = $entry1->format;
+is(scalar @blocks, 4); # 4 blocks:
+ok( defined $blocks[0] ); # author
+ok( defined $blocks[1] ); # title
+ok( defined $blocks[2] ); # journal
+ok(!defined $blocks[3] ); # note (there is no note!)
+
+is(ref $blocks[0], 'ARRAY'); # 1 sentence, 1 clauses (2 authors)
+is(scalar @{$blocks[0]}, 1);
+
+is($blocks[0][0], "Homer~J. Simpson and Andr{\\'e} de~la Poobah");
+is(ref $blocks[1], 'ARRAY'); # 1 sentence, 1 clause for title
+is(scalar @{$blocks[1]}, 1);
+is($blocks[1][0], "Territorial imperatives in modern suburbia");
+
+is(ref $blocks[2], 'ARRAY'); # 1 sentence for journal
+is(scalar @{$blocks[2]}, 1);
+
+is(ref $blocks[2][0] , 'ARRAY'); # 3 clauses in that 1 sentence
+is(scalar @{$blocks[2][0]}, 3);
+
+is($blocks[2][0][0] , 'Journal of Suburban Studies');
+is($blocks[2][0][1] , '4:125--130');
+is($blocks[2][0][2] , '1997');
+
+# Tweak options, one at a time, testing the result of each tweak
+$structure->set_options (nameorder => 'last');
+@blocks = $entry1->format;
+is($blocks[0][0], "Simpson, Homer~J. and de~la Poobah, Andr{\\'e}");
+
+$structure->set_options (namestyle => 'abbrev',
+ nameorder => 'first');
+@blocks = $entry1->format;
+is($blocks[0][0] , "H.~J. Simpson and A. de~la Poobah");
+
+$structure->set_options (nameorder => 'last');
+@blocks = $entry1->format;
+is($blocks[0][0] , "Simpson, H.~J. and de~la Poobah, A.");
+
+$structure->set_options (namestyle => 'nopunct');
+@blocks = $entry1->format;
+is($blocks[0][0] , "Simpson, H~J and de~la Poobah, A");
+
+$structure->set_options (namestyle => 'nospace');
+@blocks = $entry1->format;
+is($blocks[0][0] , "Simpson, HJ and de~la Poobah, A");
+
+$structure->set_options (atitle_lower => 0);
+@blocks = $entry1->format;
+is($blocks[1][0] , "Territorial Imperatives in Modern Suburbia");
+
+# Now some formatting tests on the second entry (a book). Note that the
+# two entries share a structure object, so the last-set options apply
+# here!
+
+@blocks = $entry2->format;
+is(scalar @blocks, 4); # again, 4 blocks:
+ok(defined $blocks[0]); # name (authors or editors)
+ok(defined $blocks[1]); # title (and volume no.)
+ok(defined $blocks[2]); # no/series/publisher/date
+ok(! defined $blocks[3]); # note (again none)
+
+is($blocks[0][0], "Simpson, G");
+
+is($blocks[1][0][0], "How to Found a Big Department Store");
+ok(! $blocks[1][0][1]); # no volume number
+
+ok(! $blocks[2][0]); # no number/series
+ok(! $blocks[2][1][0]); # no publisher
+ok(! $blocks[2][1][1]); # no publisher address
+ok(! $blocks[2][1][2]); # no edition
+
+is($blocks[2][1][3], 'February 1998'); # but we do at least have a date!
+
+# fiddle a bit more with name-generation options just to make sure
+# everything's in working order
+$structure->set_options (namestyle => 'full',
+ nameorder => 'first');
+@blocks = $entry2->format;
+is($blocks[0][0], "George Simpson");
+
+# Now test sorting: by default, the book (G. Simpson 1998) should come
+# before the article (H. J. Simpson 1997) because the default sort
+# order is (name, year).
+ok($entry2->sort_key lt $entry1->sort_key);
+
+# But if we change to sort by year, the article comes first
+$structure->set_options (sortby => 'year');
+ok($entry1->sort_key lt $entry2->sort_key);
diff --git a/t/common.pl b/t/common.pl
new file mode 100644
index 0000000..c15c2b1
--- /dev/null
+++ b/t/common.pl
@@ -0,0 +1,68 @@
+use Carp;
+use Capture::Tiny 'capture';
+
+sub no_err {
+   return err_like (shift, qr/^$/);   # the code ref's captured stderr must match /^$/
+}
+
+sub err_like {
+   my ($code, $pattern) = @_;
+   # Run the code ref with its output captured; only what it wrote to
+   # stderr is compared against the pattern.
+   my (undef, $stderr) = capture \&{$code};
+# SKIP: {
+# skip "STDERR not available under Win32", 1 if $^O =~ /mswin32/i;
+   return like ($stderr, $pattern);
+# }
+}
+
+sub list_equal {
+   my ($eq, $a, $b) = @_;
+   # Returns 1 if array refs $a and $b have the same length and the
+   # comparator $eq is true for every pair of elements, otherwise 0.
+   croak "list_equal: \$a and \$b not lists"
+      unless ref $a eq 'ARRAY' && ref $b eq 'ARRAY';
+   return 0 unless @$a == @$b;          # compare lengths
+   my @unequal = grep { ! $eq->($a->[$_], $b->[$_]) } (0 .. $#$a);
+   return @unequal ? 0 : 1;             # explicit result instead of falling off the end
+}
+
+sub slist_equal {
+   my ($left, $right) = @_;
+   # Two elements match when both are undef, or both are defined and
+   # string-equal.
+   my $same = sub { my ($x, $y) = @_;
+                    defined $x ? (defined $y && $x eq $y) : ! defined $y };
+   return list_equal ($same, $left, $right);
+}
+
+sub test_entry {
+   my ($entry, $type, $key, $fields, $values, $test) = @_;
+   $test ||= "";
+   # Expected field names and expected values must pair up one-to-one.
+   croak "test_entry: num fields != num values"
+      if $#$fields != $#$values;
+
+   # Entry-level checks: parse status, entry type, citation key.
+   ok($entry->parse_ok, "Parse ok for $test");
+   is($entry->type, $type, "Type ok for $test");
+   if (! defined $key) {
+      ok(!defined $entry->key, "Key ok for $test");
+   } else {
+      is($entry->key, $key, "Key ok for $test");
+   }
+
+   # Field-level checks: the field list, then each field on its own ...
+   ok(slist_equal ([$entry->fieldlist], $fields));
+   foreach my $idx (0 .. $#$fields) {
+      my $name = $fields->[$idx];
+      my $got  = $entry->get ($name) || '';
+      ok($entry->exists ($name));
+      is($got, $values->[$idx]);
+   }
+   # ... and finally all fields fetched with a single get() call.
+   my @got_all = map ($_ || '', $entry->get (@$fields));
+   ok (slist_equal (\@got_all, $values));
+}
+
+1;
diff --git a/t/corpora.bib b/t/corpora.bib
new file mode 100644
index 0000000..2544e0d
--- /dev/null
+++ b/t/corpora.bib
@@ -0,0 +1,264 @@
+
+@Article{linguamatica:6:2:Laboreiroetal,
+ author = {Gustavo Laboreiro and Eugénio Oliveira},
+ title = {Avaliação de métodos de desofuscação de palavrões},
+ journal = {Linguamática},
+ year = {2014},
+ volume = {6},
+ number = {2},
+ pages = {25--43},
+ month = {Dezembro},
+ editor = {Alberto Simões and José João Almeida and Xavier Gómez Guinovart}
+}
+
+@Article{Arbelatz13,
+ Title = {An extensive comparative study of cluster validity indicess},
+ Author = {Arbelaitz, Olatz and Gurrutxaga, Ibai and Muguerza, Javier and Pérez, Jesús M and Perona, Iñigo},
+ Journal = {Pattern Recognition},
+ Year = {2013},
+ Number = {1},
+ Pages = {243--256},
+ Volume = {46},
+
+ Publisher = {Elsevier}
+}
+
+@InProceedings{ester1996density,
+ Title = {A density-based algorithm for discovering clusters in large spatial databases with noise},
+ Author = {Ester, Martin and Kriegel, Hans-Peter and Sander, Jörg and Xu, Xiaowei},
+ Booktitle = {Proceedings of Knowledge Discovery and Data Mining},
+ Year = {1996},
+ Number = {34},
+ Pages = {226--231},
+ Volume = {96}
+}
+
+@Article{frey2007clustering,
+ Title = {Clustering by passing messages between data points},
+ Author = {Frey, Brendan J and Dueck, Delbert},
+ Journal = {Science},
+ Year = {2007},
+ Number = {5814},
+ Pages = {972--976},
+ Volume = {315},
+
+ Publisher = {American Association for the Advancement of Science}
+}
+
+@Article{fukunaga1975estimation,
+ Title = {The estimation of the gradient of a density function, with applications in pattern recognition},
+ Author = {Fukunaga, Keinosuke and Hostetler, Larry D},
+ Journal = {IEEE Transactions on Information Theory},
+ Year = {1975},
+ Number = {1},
+ Pages = {32--40},
+ Volume = {21},
+
+ Publisher = {IEEE}
+}
+
+@Article{grvcar2012methodology,
+ Title = {A methodology for mining document-enriched heterogeneous information networks},
+ Author = {Gr{\v{c}}ar, Miha and Trdin, Nejc and Lavra{\v{c}}, Nada},
+ Journal = {The Computer Journal},
+ Year = {2012},
+
+ Publisher = {Br Computer Soc}
+}
+
+@Article{hartigan1979algorithm,
+ Title = {Algorithm {AS} 136: {A} k-means clustering algorithm},
+ Author = {Hartigan, John A and Wong, Manchek A},
+ Journal = {Applied Statistics},
+ Year = {1979},
+ Pages = {100--108},
+
+ Publisher = {JSTOR}
+}
+
+@InProceedings{huynh2012scientific,
+ Title = {Scientific publication recommendations based on collaborative citation networks},
+ Author = {Huynh, Tin and Hoang, Kiem and Do, Loc and Tran, Huong and Luong, Hiep and Gauch, Susan},
+ Booktitle = {International Conference on Collaboration Technologies and Systems (CTS)},
+ Year = {2012},
+ Organization = {IEEE},
+ Pages = {316--321}
+}
+
+@Article{johnson1967hierarchical,
+ Title = {Hierarchical clustering schemes},
+ Author = {Johnson, Stephen C},
+ Journal = {Psychometrika},
+ Year = {1967},
+ Number = {3},
+ Pages = {241--254},
+ Volume = {32},
+
+ Publisher = {Springer}
+}
+
+@InCollection{liang2011finding,
+ Title = {Finding relevant papers based on citation relations},
+ Author = {Liang, Yicong and Li, Qing and Qian, Tieyun},
+ Booktitle = {Web-age Information Management},
+ Publisher = {Springer},
+ Year = {2011},
+ Pages = {403--414}
+}
+
+@TechReport{ilprints422,
+ Title = {The {PageRank Citation Ranking: Bringing Order to the Web.}},
+ Author = {Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd},
+ Institution = {Stanford InfoLab},
+ Year = {1999},
+
+ Address = {Stanford, CA},
+ Month = {November},
+ Number = {1999-66},
+
+ Publisher = {Stanford InfoLab}
+}
+
+@Article{pedregosa2011scikit,
+ Title = {Scikit-learn: Machine learning in {P}ython},
+ Author = {Fabian Pedregosa and Gaël Varoquaux and Alexandre Gramfort and Vincent Michel and Bertrand Thirion and Olivier Grisel and Mathieu Blondel and Peter Prettenhofer and Ron Weiss and Vincent Dubourg and Jake Vanderplas and Alexandre Passos and David Cournapeau and Matthieu Brucher and Matthieu Perrot and Édouard Duchesnay},
+ Journal = {The Journal of Machine Learning Research},
+ Year = {2011},
+ Pages = {2825--2830},
+ Volume = {12},
+
+ Publisher = {JMLR. org}
+}
+
+@InProceedings{pham2012enhancing,
+ Title = {Enhancing academic event participation with context-aware and social recommendations},
+ Author = {Pham, Manh Cuong and Kovachev, Dejan and Cao, Yiwei and Mbogos, Ghislain Manib and Klamma, Ralf},
+ Booktitle = {Proceedings of IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM)},
+ Year = {2012},
+ Organization = {IEEE},
+ Pages = {464--471}
+}
+
+@Article{rousseeuw1987silhouettes,
+ Title = {Silhouettes: a graphical aid to the interpretation and validation of cluster analysis},
+ Author = {Rousseeuw, Peter J},
+ Journal = {Journal of Computational and Applied Mathematics},
+ Year = {1987},
+ Pages = {53--65},
+ Volume = {20},
+
+ Publisher = {Elsevier}
+}
+
+@Article{salton1975vector,
+ Title = {A vector space model for automatic indexing},
+ Author = {Salton, Gerard and Wong, Anita and Yang, Chung-Shu},
+ Journal = {Communications of the ACM},
+ Year = {1975},
+ Number = {11},
+ Pages = {613--620},
+ Volume = {18},
+
+ Publisher = {ACM}
+}
+
+@InProceedings{sculley2010web,
+ Title = {Web-scale k-means clustering},
+ Author = {Sculley, David},
+ Booktitle = {Proceedings of the 19th International Conference on World Wide Web},
+ Year = {2010},
+ Organization = {ACM},
+ Pages = {1177--1178}
+}
+
+@Article{van2008visualizing,
+ Title = {Visualizing data using t-{SNE}},
+ Author = {Van der Maaten, Laurens and Hinton, Geoffrey},
+ Journal = {Journal of Machine Learning Research},
+ Year = {2008},
+ Number = {2579-2605},
+ Pages = {85},
+ Volume = {9}
+}
+
+@InProceedings{wagstaff2001constrained,
+ Title = {Constrained k-means clustering with background knowledge},
+ Author = {Wagstaff, Kiri and Cardie, Claire and Rogers, Seth and Schr\"{o}dl, Stefan},
+ Booktitle = {Proceedings of the International Conference on Machine Learning},
+ Year = {2001},
+ Pages = {577--584},
+ Volume = {1}
+}
+
+@InProceedings{xia2014folksonomy,
+ Title = {Folksonomy based socially-aware recommendation of scholarly papers for conference participants},
+ Author = {Xia, Feng and Asabere, Nana Yaw and Liu, Haifeng and Deonauth, Nakema and Li, Fengqi},
+ Booktitle = {Proceedings of the Companion Publication of the 23rd International Conference on World Wide Web Companion},
+ Year = {2014},
+ Organization = {International World Wide Web Conferences Steering Committee},
+ Pages = {781--786}
+}
+
+@InProceedings{xia2013socially,
+ Title = {Socially-aware venue recommendation for conference participants},
+ Author = {Xia, Feng and Asabere, Nana Yaw and Rodrigues, Joel JPC and Basso, Filippo and Deonauth, Nakema and Wang, Wei},
+ Booktitle = {Proceedings of the 10th International Conference on Autonomic and Trusted Computing (UIC/ATC)},
+ Year = {2013},
+ Organization = {IEEE},
+ Pages = {134--141}
+}
+
+@Article{zhu2010data,
+ Title = {Data clustering with size constraints},
+ Author = {Zhu, Shunzhi and Wang, Dingding and Li, Tao},
+ Journal = {Knowledge-Based Systems},
+ Year = {2010},
+ Number = {8},
+ Pages = {883--889},
+ Volume = {23},
+
+ Publisher = {Elsevier}
+}
+
+@Proceedings{aime,
+ Title = {Artificial Intelligence in Medicine: 14th Conference on Artificial Intelligence in Medicine, AIME 2013, Murcia, Spain},
+ Year = {2013},
+ Editor = {Niels Peek and Roque Marin Morales and Mor Peleg },
+ Publisher = {Springer},
+ Series = {Lecture Notes in Artificial Intelligence},
+ Volume = {7885},
+
+ Booktitle = {Proceedings of 14th Conference on Artificial Intelligence in Medicine}
+}
+
+@Proceedings{aime2,
+ Title = {13th Conference on Artificial Intelligence in Medicine},
+ Booktitle = {Artificial Intelligence in Mediclne},
+ Year = {2011},
+ Editor = {Peleg, Mor and Lavra\v{c}, Nada and Combi, Carlo},
+ Publisher = {Springer}
+}
+
+@article{spasic2005text,
+ title ={Text mining and ontologies in biomedicine: making sense of raw text},
+ author ={Spasic, Irena and Ananiadou, Sophia and McNaught, John and Kumar, Anand},
+ journal ={Briefings in bioinformatics},
+ volume ={6},
+ number ={3},
+ pages ={239--251},
+ year ={2005},
+ publisher ={Oxford University Press}
+}
+
+@article{blei2012probabilistic,
+ title ={Probabilistic topic models},
+ author ={Blei, David M.},
+ journal ={Communications of the ACM},
+ volume ={55},
+ number ={4},
+ pages ={77--84},
+ year ={2012},
+ publisher ={ACM}
+}
+
+
diff --git a/t/errors.bib b/t/errors.bib
new file mode 100644
index 0000000..59dc277
--- /dev/null
+++ b/t/errors.bib
@@ -0,0 +1,7 @@
+@Article{error1,
+ author {error},
+ title = {title},
+}
+
+article{error2,
+}
diff --git a/t/from_file.t b/t/from_file.t
new file mode 100644
index 0000000..f4fffbc
--- /dev/null
+++ b/t/from_file.t
@@ -0,0 +1,82 @@
+use strict;
+use warnings;
+
+use Test::More tests => 12;
+use utf8;
+
+use Cwd;
+use Text::BibTeX;
+
+BEGIN {
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+##### parse t/corpora.bib #####
+
+my $bibtex = Text::BibTeX::File->new("t/corpora.bib", { binmode => 'utf-8'});
+
+is ref($bibtex), "Text::BibTeX::File";
+
+my @entries;
+while (my $entry = Text::BibTeX::Entry->new($bibtex)) {
+ push @entries, $entry;
+}
+
+is scalar(@entries), 25;
+
+# @Article{linguamatica:6:2:Laboreiroetal,
+# author = {Gustavo Laboreiro and Eugénio Oliveira},
+# title = {Avaliação de métodos de desofuscação de palavrões},
+# journal = {Linguamática},
+# year = {2014},
+# volume = {6},
+# number = {2},
+# pages = {25--43},
+# month = {Dezembro},
+# editor = {Alberto Simões and José João Almeida and Xavier Gómez Guinovart}
+# }
+is $entries[0]->get("title"), "Avaliação de métodos de desofuscação de palavrões";
+is $entries[0]->get("author"), "Gustavo Laboreiro and Eugénio Oliveira";
+
+my @editors = $entries[0]->names("editor");
+
+is $editors[0]->part("last"), "Simões";
+
+##### parse t/corpora.bib again to check whether bt_parse_entry() state has been reset #####
+
+$bibtex = Text::BibTeX::File->new("t/corpora.bib", { binmode => 'utf-8'});
+is ref($bibtex), "Text::BibTeX::File";
+
+@entries = ();
+while (my $entry = Text::BibTeX::Entry->new($bibtex)) {
+ push @entries, $entry;
+}
+
+is scalar(@entries), 25;
+
+##### parse t/errors.bib to check whether bt_parse_entry() state can reset after error #####
+
+$bibtex = Text::BibTeX::File->new("t/errors.bib", { binmode => 'utf-8'});
+is ref($bibtex), "Text::BibTeX::File";
+
+@entries = ();
+err_like sub {
+ while (my $entry = Text::BibTeX::Entry->new($bibtex)) {
+ push @entries, $entry;
+ }
+}, qr!syntax error: found "\{error\}", expected "="!;
+
+is scalar(@entries), 1;
+
+##### parse t/corpora.bib again to check whether bt_parse_entry() state has been reset #####
+
+$bibtex = Text::BibTeX::File->new("t/corpora.bib", { binmode => 'utf-8'});
+is ref($bibtex), "Text::BibTeX::File";
+
+@entries = ();
+while (my $entry = Text::BibTeX::Entry->new($bibtex)) {
+ push @entries, $entry;
+}
+
+is scalar(@entries), 25;
diff --git a/t/macro.t b/t/macro.t
new file mode 100644
index 0000000..b04937e
--- /dev/null
+++ b/t/macro.t
@@ -0,0 +1,135 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use Test::More tests => 67;
+
+use vars ('$DEBUG');
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX', qw(:macrosubs));
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+$DEBUG = 1;
+
+# setup_stderr;
+
+# ----------------------------------------------------------------------
+# test macro parsing and expansion
+
+my ($macrodef, $regular, $other, $entry, @warnings);
+
+$macrodef = <<'TEXT';
+@string ( foo = " The Foo
+ Journal",
+ sons = " \& Sons",
+ bar
+= {Bar } # sons,
+
+)
+TEXT
+
+$regular = <<'TEXT';
+@article { my_article,
+ author = { Us and Them },
+ journal = foo,
+ publisher = "Fu" # bar
+ }
+TEXT
+
+$other = <<'EOT';
+@article { xxx, institution = ugh }
+EOT
+
+# Direct access to macro table, part 1: make sure the macros we're going to
+# define aren't defined yet
+
+print "testing that none of our macros are defined yet\n" if $DEBUG;
+
+is(macro_length('foo') , 0 );
+is(macro_length('sons'), 0 );
+is(macro_length('bar') , 0 );
+
+err_like( sub{ ok(! defined macro_text('foo') ); }, qr/undefined macro "foo"/);
+err_like( sub{ ok(! defined macro_text('sons')); }, qr/undefined macro "sons"/);
+err_like( sub{ ok(! defined macro_text('bar') ); }, qr/undefined macro "bar"/);
+
+# Now parse the macro-definition entry; this should put the three
+# macros we're interested in into the macro table so we can
+# successfully parse the regular entry
+print "parsing macro-definition entry to define 3 macros\n" if $DEBUG;
+$entry = Text::BibTeX::Entry->new();;
+
+no_err( sub{ $entry->parse_s($macrodef); } );
+
+test_entry($entry, 'string', undef,
+ [qw(foo sons bar)],
+ [' The Foo Journal', ' \& Sons', 'Bar \& Sons'], "test 1");
+
+# Direct access to macro table, part 2: make sure the macros we've just
+# defined now have the correct values
+print "checking macro table to ensure that the macros were properly defined\n"
+ if $DEBUG;
+
+no_err( sub {
+ is(macro_length('foo') ,19);
+ is(macro_length('sons'), 8);
+ is(macro_length('bar') ,14);
+
+ is(macro_text('foo') , ' The Foo Journal');
+ is(macro_text('sons'), ' \& Sons');
+ is(macro_text('bar') , 'Bar \& Sons');
+ } );
+
+# Parse the regular entry -- there should be no warnings, because
+# we've just defined the 'foo' and 'bar' macros on which it depends
+
+# calling a parse or read method on an existing object isn't documented
+# as an "ok thing to do", but it is (at least as the XS code currently
+# is!) -- hence I can leave the "new" commented out
+# $entry = Text::BibTeX::Entry->new();
+print "parsing the regular entry which uses those 2 of those macros\n"
+ if $DEBUG;
+
+no_err( sub { $entry->parse_s ($regular); });
+
+test_entry ($entry, 'article', 'my_article',
+ [qw(author journal publisher)],
+ ['Us and Them', 'The Foo Journal', 'FuBar \& Sons'], "test 2");
+
+
+# Delete the 'bar' macro and change 'foo' -- this should result in
+# one warning about the macro value being overridden
+
+delete_macro ('bar');
+is(macro_length ('bar'), 0);
+
+err_like( sub { ok(! defined macro_text ('bar')); }, qr/undefined macro "bar"/);
+
+err_like ( sub { add_macro_text ('foo', 'The Journal of Fooology'); },
+ qr/overriding existing definition of macro "foo"/);
+
+# Now re-parse our regular entry; we should get a warning about the deleted
+# "bar" macro, and the "journal" field (which relies on "foo") should have
+# a different value
+
+err_like( sub { $entry->parse_s ($regular); }, qr/undefined macro "bar"/);
+
+test_entry ($entry, 'article', 'my_article',
+ [qw(author journal publisher)],
+ ['Us and Them', 'The Journal of Fooology', 'Fu'], "test 3");
+
+my $ugh = 'University of Good Heavens';
+add_macro_text('ugh', $ugh);
+is macro_length('ugh'), length($ugh), "ugh got defined";
+no_err( sub { $entry->parse_s ($other); } );   # 'ugh' is defined now, so no warning is expected
+test_entry($entry, 'article', 'xxx', ['institution'], [$ugh], "Macro replaced");
+
+my $string = 'wednesday';
+add_macro_text(substr($string, 0, 3), $string);
+is macro_length('wed'), 9;
+
+
+
+
diff --git a/t/modify.t b/t/modify.t
new file mode 100644
index 0000000..efbc569
--- /dev/null
+++ b/t/modify.t
@@ -0,0 +1,88 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use IO::Handle;
+use Test::More tests => 29;
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+# ----------------------------------------------------------------------
+# entry modification methods
+
+my ($text, $entry, @warnings, @fieldlist);
+
+$text = <<'TEXT';
+@article{homer97,
+ author = {Homer Simpson and Ned Flanders},
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = {Journal of Suburban Studies},
+ year = 1997
+}
+TEXT
+
+ok($entry = Text::BibTeX::Entry->new);
+ok($entry->parse_s ($text));
+
+ok($entry->type eq 'article');
+$entry->set_type ('book');
+ok($entry->type eq 'book');
+
+ok($entry->key eq 'homer97');
+$entry->set_key ($entry->key . 'a');
+ok($entry->key eq 'homer97a');
+
+my @names = $entry->names ('author');
+$names[0] = $names[0]->{'last'}[0] . ', ' . $names[0]->{'first'}[0];
+$names[1] = $names[1]->{'last'}[0] . ', ' . $names[1]->{'first'}[0];
+$entry->set ('author', join (' and ', @names));
+
+my $author;
+no_err( sub {
+ $author = $entry->get ('author');
+ is($author, 'Simpson, Homer and Flanders, Ned');
+ });
+
+no_err(
+ sub {
+ $entry->set (author => 'Foo Bar {and} Co.',
+ title => 'This is a new title');
+ ok($entry->get ('author') eq 'Foo Bar {and} Co.');
+ ok($entry->get ('title') eq 'This is a new title');
+ ok(slist_equal ([$entry->get ('author', 'title')],
+ ['Foo Bar {and} Co.', 'This is a new title']));
+ }
+ );
+
+ok(slist_equal ([$entry->fieldlist], [qw(author title journal year)]));
+ok($entry->exists ('journal'));
+
+$entry->delete ('journal');
+no_err sub {
+ @fieldlist = $entry->fieldlist;
+ ok(! $entry->exists ('journal'));
+ ok(slist_equal (\@fieldlist, [qw(author title year)]));
+};
+
+err_like sub { $entry->set_fieldlist ([qw(author title journal year)]); },
+ qr/implicitly adding undefined field \"journal\"/i;
+
+no_err sub {
+ @fieldlist = $entry->fieldlist;
+ ok($entry->exists ('journal'));
+ ok(! defined $entry->get ('journal'));
+ ok(slist_equal (\@fieldlist, [qw(author title journal year)]));
+};
+
+$entry->delete ('journal', 'author', 'year');
+no_err sub { @fieldlist = $entry->fieldlist; };
+ok(! $entry->exists ('journal'));
+ok(! $entry->exists ('author'));
+ok(! $entry->exists ('year'));
+is(scalar @fieldlist, 1);
+is($fieldlist[0] ,'title');
+
diff --git a/t/nameformat.t b/t/nameformat.t
new file mode 100644
index 0000000..74fc9c9
--- /dev/null
+++ b/t/nameformat.t
@@ -0,0 +1,197 @@
+# -*- cperl -*-
+use strict;
+use vars qw($DEBUG);
+use IO::Handle;
+use Test::More tests=>27;
+use utf8;
+use Encode 'decode';
+use Unicode::Normalize;
+
+use Cwd;
+my $common = getcwd()."/t/common.pl";
+require $common;
+
+
+use Text::BibTeX qw(:nameparts :joinmethods);
+
+
+$DEBUG = 1;
+
+{
+ # tests 1..3
+ # Get a name to work with (and just a quick check that the Name class
+ # is in working order)
+ my $name = Text::BibTeX::Name->new
+ ("Charles Louis Xavier Joseph de la Vall{\\'e}e Poussin");
+
+ my @first = $name->part('first');
+ my @von = $name->part('von');
+ my @last = $name->part('last');
+
+ is_deeply \@first, [qw(Charles Louis Xavier Joseph)],
+ "First name is 'Charles Louis Xavier Joseph'";
+ is_deeply \@von, [qw(de la)],
+ "von part is 'de la'";
+ is_deeply \@last, ["Vall{\\'e}e", 'Poussin'],
+ "Last name is 'Vall{\\'e}e Poussin'";
+}
+
+{
+ # tests 4..5..
+ my $name1 = Text::BibTeX::Name->new('{John Henry} Ford');
+ my $format1 = Text::BibTeX::NameFormat->new('f', 1);
+ is $format1->apply($name1), 'J.', "first name is abbreviated correctly [1]";
+
+ my $name2 = Text::BibTeX::Name->new('{John} Ford');
+ my $format2 = Text::BibTeX::NameFormat->new('f', 1);
+ is $format2->apply($name2), 'J.', "first name is abbreviated correctly [2]";
+}
+
+{
+ # tests 6..
+ my $name3 = Text::BibTeX::Name->new
+ ('{U.S. Department of Health and Human Services, National Institute of Mental Health,'.
+ 'National Heart, Lung and Blood Institute}');
+
+ my $format3 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format3->set_text(BTN_LAST, undef, undef, undef, '.');
+ $format3->set_options(BTN_LAST, 1, BTJ_NOTHING, BTJ_NOTHING);
+
+ is $format3->apply($name3), 'U.', 'big institution';
+}
+
+{
+ # tests 7..8..
+ my $name4 = Text::BibTeX::Name->new("{\\'E}mile Zola");
+ my $format4 = Text::BibTeX::NameFormat->new('f', 1);
+ is $format4->apply($name4), "{\\'E}.", "accented first letter";
+
+ my $name5 = Text::BibTeX::Name->new('St John-Mollusc, Oliver');
+ my $format5 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format5->set_text(BTN_LAST, undef, undef, undef, '.');
+ $format5->set_options(BTN_LAST, 1, BTJ_MAYTIE, BTJ_NOTHING);
+
+ is $format5->apply($name5), 'S.~J.-M.', "abbreviated surname";
+}
+
+{
+ # tests 9..
+ my $name6 = Text::BibTeX::Name->new("St John-{\\'E}mile Mollusc, Oliver");
+ my $format6 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format6->set_text (BTN_LAST, undef, undef, undef, '.');
+ $format6->set_options (BTN_LAST, 1, BTJ_MAYTIE, BTJ_NOTHING);
+
+ is $format6->apply($name6), "S.~J.-{\\'E}.~M.", "Abbreviated accented surname";
+}
+
+{
+ # test 10...
+ my $name7 = Text::BibTeX::Name->new('St {John-Mollusc}, Oliver');
+ my $format7 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format7->set_text (BTN_LAST, undef, undef, undef, '.');
+ $format7->set_options (BTN_LAST, 1, BTJ_MAYTIE, BTJ_NOTHING);
+
+ is $format7->apply($name7), 'S.~J.';
+}
+
+{
+ # test 11... to 16
+
+ ## This in raw mode
+ my $name8 = Text::BibTeX::Name->new('Šomeone Smith');
+ my $formatter = Text::BibTeX::NameFormat->new('f', 1);
+ is NFC(decode('UTF-8',$formatter->apply($name8))), 'Š.', "raw test 1";
+
+ my $name9 = Text::BibTeX::Name->new('Šomeone-Šomething Smith');
+ is NFC(decode('UTF-8',$formatter->apply($name9))), 'Š.-Š.', "raw test 2";
+
+ $formatter = Text::BibTeX::NameFormat->new('f', 1);
+ my $name10 = Text::BibTeX::Name->new({binmode=>'utf-8'},'{Šomeone-Šomething} Smith');
+ is $formatter->apply($name10), 'Š.', "utf-8 [1]";
+
+ # Initial is 2 bytes long in UTF8
+ my $formatterlast = Text::BibTeX::NameFormat->new('f', 1);
+ my $name11 = Text::BibTeX::Name->new({binmode=>'utf-8'},'Żaa Smith');
+ is $formatterlast->apply($name11), 'Ż.', "utf-8 [2]";
+
+ # Initial is 3 bytes long in UTF8 (Z + 2 byte combining mark)
+ $formatterlast = Text::BibTeX::NameFormat->new('f', 1);
+ my $name12 = Text::BibTeX::Name->new({binmode=>'utf-8'},'Z̃ Smith');
+ is $formatterlast->apply($name12), 'Z̃.', "utf-8 [3]";
+
+ # Initial is 7 bytes long in UTF8 (A + 3 * 2 byte combining marks)
+ $formatterlast = Text::BibTeX::NameFormat->new('f', 1);
+ my $name13 = Text::BibTeX::Name->new({binmode=>'utf-8'},'A̧̦̓ Smith');
+ is $formatterlast->apply($name13), 'A̧̦̓.', "utf-8 [4]";
+
+}
+
+{
+ # test 17... and 18
+ my $name14 = Text::BibTeX::Name->new('Harold {K}ent-{B}arrow');
+ my $format11 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format11->set_text(BTN_LAST, undef, undef, undef, '.');
+ $format11->set_options(BTN_LAST, 1, BTJ_MAYTIE, BTJ_NOTHING);
+
+ is $format11->apply($name14), 'K.-B.';
+
+ my $name15 = Text::BibTeX::Name->new('Mirian Neuser-Hoffman');
+ my $format12 = Text::BibTeX::NameFormat->new('l', 1);
+
+ $format12->set_text(BTN_LAST, undef, undef, undef, '');
+ $format12->set_options(BTN_LAST, 1, BTJ_MAYTIE, BTJ_NOTHING);
+
+ is $format12->apply($name15), 'N-H';
+}
+
+{
+ # test 19 to 26
+
+ my $name = Text::BibTeX::Name->new
+ ("Charles Louis Xavier Joseph de la Vall{\\'e}e Poussin");
+
+ # Start with a basic "von last, jr, first" formatter
+ my $format = Text::BibTeX::NameFormat->new('vljf', 1);
+
+ is $format->apply($name), "de~la Vall{\\'e}e~Poussin, C.~L. X.~J.";
+ is $format->apply($name), $name->format($format);
+
+ # Tweak options: force ties between tokens of the first name
+ $format->set_options(BTN_FIRST, 1, BTJ_FORCETIE, BTJ_NOTHING);
+ is $format->apply($name), "de~la Vall{\\'e}e~Poussin, C.~L.~X.~J.";
+
+ # And no ties in the "von" part
+ $format->set_options(BTN_VON, 0, BTJ_SPACE, BTJ_SPACE);
+ is $format->apply($name), "de la Vall{\\'e}e~Poussin, C.~L.~X.~J.";
+
+ # No punctuation in the first name
+ $format->set_text(BTN_FIRST, undef, undef, undef, '');
+ is $format->apply($name), "de la Vall{\\'e}e~Poussin, C~L~X~J";
+
+ # And drop the first name inter-token separation entirely
+ $format->set_options(BTN_FIRST, 1, BTJ_NOTHING, BTJ_NOTHING);
+ is $format->apply($name), "de la Vall{\\'e}e~Poussin, CLXJ";
+
+ # Now we get silly: keep the first name tokens jammed together, but
+ # don't abbreviate them any more
+ $format->set_options(BTN_FIRST, 0, BTJ_NOTHING, BTJ_NOTHING);
+ is $format->apply($name), "de la Vall{\\'e}e~Poussin, CharlesLouisXavierJoseph";
+
+ # OK, put spaces back in to the first name
+ $format->set_options (BTN_FIRST, 0, BTJ_SPACE, BTJ_NOTHING);
+ is $format->apply($name), "de la Vall{\\'e}e~Poussin, Charles Louis Xavier Joseph";
+}
+
+{
+ # test 27: format the first author of an "and others" list with a "von last" format
+ my $entry = Text::BibTeX::Entry->new;
+ $entry->parse_s('@' . "article{key,\n author = {Firstlastname Secondlastname, Firstname and others},\n}");
+ my @authors = $entry->names("author");
+ my $format = Text::BibTeX::NameFormat->new("vl");
+ is $format->apply($authors[0]), "Firstlastname~Secondlastname";
+}
diff --git a/t/namelist.t b/t/namelist.t
new file mode 100644
index 0000000..91349ac
--- /dev/null
+++ b/t/namelist.t
@@ -0,0 +1,58 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use IO::Handle;
+use Test::More tests => 13;
+
+use vars qw($DEBUG);
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+$DEBUG = 0;
+
+# ----------------------------------------------------------------------
+# make sure we can split up lists of names
+
+my (@names);
+
+@names =
+ ('J. Smith and N. D. Andrews' => ['J. Smith', 'N. D. Andrews'],
+ 'J. Smith and A. Jones' => ['J. Smith', 'A. Jones'],
+ 'J. Smith and A. Jones and J. Random' => ['J. Smith', 'A. Jones', 'J. Random'],
+ 'A. Smith and J. Jones' => ['A. Smith', 'J. Jones'],
+ 'A. Smith and A. Jones' => ['A. Smith', 'A. Jones'],
+ 'Amy Smith and Andrew Jones' => ['Amy Smith', 'Andrew Jones'],
+ 'Amy Smith and And y Jones' => ['Amy Smith', undef, 'y Jones'],
+ 'K. Herterich and S. Determann and B. Grieger and I. Hansen and P. Helbig and S. Lorenz and A. Manschke' => ['K. Herterich', 'S. Determann', 'B. Grieger', 'I. Hansen', 'P. Helbig', 'S. Lorenz', 'A. Manschke'],
+ 'A. Manschke and M. Matthies and A. Paul and R. Schlotte and U. Wyputta' => ['A. Manschke', 'M. Matthies', 'A. Paul', 'R. Schlotte', 'U. Wyputta'],
+ 'S. Lorenz and A. Manschke and M. Matthies' => ['S. Lorenz', 'A. Manschke', 'M. Matthies'],
+ 'K. Herterich and S. Determann and B. Grieger and I. Hansen and P. Helbig and S. Lorenz and A. Manschke and M. Matthies and A. Paul and R. Schlotte and U. Wyputta' => ['K. Herterich', 'S. Determann', 'B. Grieger', 'I. Hansen', 'P. Helbig', 'S. Lorenz', 'A. Manschke', 'M. Matthies', 'A. Paul', 'R. Schlotte', 'U. Wyputta'],
+ );
+
+while (@names) {
+ my ($name, $should_split) = (shift @names, shift @names);
+
+ my $actual_split;
+ if (!$should_split->[1]) {
+ # these should issue a warning
+ err_like sub { $actual_split = [Text::BibTeX::split_list ($name, 'and')] },
+ qr!empty substring!;
+ } else {
+ $actual_split = [Text::BibTeX::split_list ($name, 'and')];
+ }
+
+ if ($DEBUG) {
+ printf "name = >%s<\n", $name;
+ print "should split to:\n ";
+ print join ("\n ", @$should_split) . "\n";
+ print "actually split to:\n ";
+ print join ("\n ", @$actual_split) . "\n";
+ }
+
+ ok(slist_equal ($should_split, $actual_split));
+}
diff --git a/t/names.t b/t/names.t
new file mode 100644
index 0000000..4c77d60
--- /dev/null
+++ b/t/names.t
@@ -0,0 +1,138 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+use vars qw($DEBUG);
+
+use IO::Handle;
+use Test::More tests => 61;
+use utf8;
+use Encode 'encode';
+use Text::BibTeX;
+
+use Cwd;
+my $common = getcwd()."/t/common.pl";
+require $common;
+
+$DEBUG = 0;
+
+#setup_stderr;
+
+sub test_name { # emits one ok(): do the parts of $name agree with $parts (arrayref ordered first, von, last, jr)?
+ my ($name, $parts) = @_;
+ my $ok = 1;
+ my @partnames = qw(first von last jr);
+ my $i;
+
+ for $i (0 .. $#partnames) {
+ if (defined $parts->[$i]) { # expectation given: part must be true and pass slist_equal (from t/common.pl, presumably)
+ $ok &= ($name->part ($partnames[$i]))
+ && slist_equal ($parts->[$i], [$name->part ($partnames[$i])]);
+ }
+ else { # expectation is undef/absent: part must evaluate false
+ $ok &= ! $name->part ($partnames[$i]);
+ }
+ }
+ # Only 6 keys max: first, von, last, jr AND encoding, normalization
+ ok(keys %$name <= 6 && $ok);
+}
+
+
+# ----------------------------------------------------------------------
+# processing of author names
+
+my (@names, @unames, @pnames, %names, @orig_namelist, $namelist, @namelist);
+my ($text, $entry, $pentry, $uentry);
+
+# first just a big ol' list of names, not attached to any entry
+%names =
+ ('van der Graaf' => '|van+der|Graaf|',
+ 'Jones' => '||Jones|',
+ 'van' => '||van|',
+ 'John Smith' => 'John||Smith|',
+ 'John van Smith' => 'John|van|Smith|',
+ 'John van Smith Jr.' => 'John|van|Smith+Jr.|',
+ 'John Smith Jr.' => 'John+Smith||Jr.|',
+ 'John van' => 'John||van|',
+ 'John van der' => 'John|van|der|',
+ 'John van der Graaf' => 'John|van+der|Graaf|',
+ 'John van der Graaf foo' => 'John|van+der|Graaf+foo|',
+ 'foo Foo foo' => '|foo|Foo+foo|',
+ 'Foo foo' => 'Foo||foo|',
+ 'foo Foo' => '|foo|Foo|'
+ );
+
+@orig_namelist = keys %names;
+$namelist = join (' and ', @orig_namelist);
+@namelist = Text::BibTeX::split_list
+ ($namelist, 'and', 'test', 0, 'name');
+is_deeply(\@orig_namelist, \@namelist, "same lists...");
+
+my $i;
+foreach $i (0 .. $#namelist)
+{
+ is($namelist[$i], $orig_namelist[$i]);
+ my %parts;
+ Text::BibTeX::Name::_split (\%parts, $namelist[$i], 'test', 0, $i, 0);
+ ok (keys %parts <= 4, "number keys is OK");
+
+ my @name = map { join ('+', ref $_ ? @$_ : ()) }
+ @parts{'first','von','last','jr'};
+ is (join ('|', @name), $names{$orig_namelist[$i]});
+}
+
+# now an entry with some names in it
+
+$text = <<'TEXT';
+@article{homer97,
+ author = { Homer Simpson and
+ Flanders, Jr., Ned Q. and
+ {Foo Bar and Co.}},
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = {Journal of Suburban Studies},
+ year = 1997
+}
+TEXT
+
+my $protected_test = <<'PROT';
+@article{prot1,
+ author = {{U.S. Department of Health and Human Services, National Institute of Mental Health, National Heart, Lung and Blood Institute}}
+}
+PROT
+
+my $uname = Text::BibTeX::Name->new({binmode => 'utf-8'},'фон дер Иванов, И. И.');
+is (join('', $uname->part('last')), 'Иванов', "Testing unicode...");
+is (join('', $uname->part('first')), 'И.И.');
+is (join(' ', $uname->part('von')), 'фон дер');# 2-byte UTF-8 lowercase
+
+$uname = Text::BibTeX::Name->new({binmode => 'utf-8'},'ꝥaa Smith, John');
+is (join('', $uname->part('von')), 'ꝥaa');# 3-byte UTF-8 lowercase (U+A765)
+$uname = Text::BibTeX::Name->new({binmode => 'utf-8'},'𝓺aa Smith, John');
+is (join('', $uname->part('von')), '𝓺aa');# 4-byte UTF-8 lowercase (U+1D4FA)
+
+$uname = Text::BibTeX::Name->new({binmode => 'raw'},'𝓺aa Smith, John');
+is (join('', $uname->part('von')), encode('UTF-8','𝓺aa'), "check raw mode");# 4-byte UTF-8 lowercase (U+1D4FA)
+
+
+ok ($pentry = Text::BibTeX::Entry->new($protected_test));
+my $pauthor = $pentry->get ('author');
+is ($pauthor, '{U.S. Department of Health and Human Services, National Institute of Mental Health, National Heart, Lung and Blood Institute}');
+@pnames = $pentry->split ('author');
+ok (@pnames == 1 && $pnames[0] eq '{U.S. Department of Health and Human Services, National Institute of Mental Health, National Heart, Lung and Blood Institute}');
+@pnames = $pentry->names ('author');
+ok (@pnames == 1);
+test_name ($pnames[0], [undef, undef, ['{U.S. Department of Health and Human Services, National Institute of Mental Health, National Heart, Lung and Blood Institute}'], undef]);
+
+
+ok ($entry = Text::BibTeX::Entry->new($text));
+my $author = $entry->get ('author');
+is ($author, 'Homer Simpson and Flanders, Jr., Ned Q. and {Foo Bar and Co.}');
+@names = $entry->split ('author');
+ok (@names == 3 &&
+ $names[0] eq 'Homer Simpson' &&
+ $names[1] eq 'Flanders, Jr., Ned Q.' &&
+ $names[2] eq '{Foo Bar and Co.}');
+@names = $entry->names ('author');
+ok (@names == 3);
+test_name ($names[0], [['Homer'], undef, ['Simpson'], undef]);
+test_name ($names[1], [['Ned', 'Q.'], undef, ['Flanders'], ['Jr.']]);
+test_name ($names[2], [undef, undef, ['{Foo Bar and Co.}']]);
diff --git a/t/output.t b/t/output.t
new file mode 100644
index 0000000..a9c88a7
--- /dev/null
+++ b/t/output.t
@@ -0,0 +1,96 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use IO::Handle;
+use Test::More tests => 20;
+
+use vars qw($DEBUG);
+
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+use Fcntl;
+
+# ----------------------------------------------------------------------
+# entry output methods
+
+my ($text, $entry, @warnings, @fields);
+my ($new_text, $new_entry);
+
+$text = <<'TEXT';
+@article{homer97,
+ author = "H{\"o}mer Simpson" # { \"und } # "Ned Flanders",
+ title = {Territorial Imperatives in Modern Suburbia},
+ journal = {Journal of Suburban Studies},
+ year = 1997
+}
+TEXT
+ok($entry = Text::BibTeX::Entry->new($text), "new entry is defined");
+ok($entry->parse_ok, "new entry parsed correctly");
+
+$new_text = $entry->print_s;
+
+like $new_text => qr/^\@article\{homer97,\s*$/m, 'we have type and key';
+like $new_text =>
+ qr/^\s*author\s*=\s*\{H\{\\"o\}mer Simpson \\"und Ned Flanders\},\s*$/m,
+ 'we have author';
+like $new_text => qr/^\s*title\s*=\s*[{"]Territorial[^}"]*Suburbia[}"],\s*$/m,
+ 'we have title';
+like $new_text => qr/^\s*journal\s*=\s*[{"]Journal[^\}]*Studies[}"],\s*$/m,
+ 'we have journal';
+like $new_text => qr/^\s*year\s*=\s*[{"]1997[}"],\s*$/m, 'we have year'
+;
+
+$new_entry = Text::BibTeX::Entry->new($new_text);
+ok($new_entry->parse_ok, "second entry parsed correctly");
+
+is $entry->type => $new_entry->type, "entry type is correct";
+is $entry->key => $new_entry->key, "entry key is correct";
+
+ok(slist_equal ([sort $entry->fieldlist], [sort $new_entry->fieldlist]), "same field list");
+
+@fields = $entry->fieldlist;
+ok(slist_equal ([$entry->get (@fields)], [$new_entry->get (@fields)]));
+
+my @test = map { "t/test$_.bib" } 1..3;
+my ($bib);
+
+END { unlink @test }
+
+open (BIB, ">$test[0]") || die "couldn't create $test[0]: $!\n";
+$entry->print (\*BIB);
+close (BIB);
+
+$bib = IO::File->new($test[1], O_CREAT|O_WRONLY)
+ or die "couldn't create $test[1]: $!\n";
+$entry->print ($bib);
+$bib->close;
+
+$bib = Text::BibTeX::File->new($test[2], {MODE => O_CREAT|O_WRONLY})
+ or die "couldn't create $test[2]: $!\n";
+$entry->write ($bib);
+$bib->close;
+
+my (@contents, $i);
+for $i (0 .. 2)
+{
+ open (BIB, $test[$i]) || die "couldn't open $test[$i]: $!\n";
+ $contents[$i] = join ('', <BIB>);
+ close (BIB);
+}
+
+is $new_text => $contents[0], "Contents [0]";
+is $new_text => $contents[1], "Contents [1]";
+is $new_text => $contents[2], "Contents [2]";
+
+my $clone = $entry->clone;
+is ref($clone) => 'Text::BibTeX::Entry';
+is $clone->get('title') => 'Territorial Imperatives in Modern Suburbia';
+$clone->set('title', 'Changed title');
+is $clone->get('title') => 'Changed title';
+is $entry->get('title') => 'Territorial Imperatives in Modern Suburbia';
diff --git a/t/parse.t b/t/parse.t
new file mode 100644
index 0000000..aba0f5b
--- /dev/null
+++ b/t/parse.t
@@ -0,0 +1,52 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use Capture::Tiny 'capture';
+use IO::Handle;
+use Test::More tests => 32;
+
+use vars qw($DEBUG);
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+$DEBUG = 0;
+
+
+# ----------------------------------------------------------------------
+# entry creation and parsing from a Text::BibTeX::File object
+
+my ($bibfile, $entry);
+my $multiple_file = 'btparse/tests/data/simple.bib';
+
+ok($bibfile = Text::BibTeX::File->new( $multiple_file));
+err_like sub { ok($entry = Text::BibTeX::Entry->new( $bibfile)); },
+ qr!$multiple_file, line 5, warning: undefined macro "junk"!;
+
+test_entry ($entry, 'book', 'abook',
+ [qw(title editor publisher year)],
+ ['A Book', 'John Q. Random', 'Foo Bar \& Sons', '1922']);
+
+ok($entry->read ($bibfile));
+test_entry ($entry, 'string', undef,
+ ['macro', 'foo'],
+ ['macro text ', 'blah blah ding dong ']);
+
+
+ok($entry->read ($bibfile));
+ok($entry->parse_ok &&
+ $entry->type eq 'comment' &&
+ $entry->metatype == BTE_COMMENT &&
+ $entry->value eq 'this is a comment entry, anything at all can go in it (as long as parentheses are balanced), even {braces}');
+
+ok($entry->read ($bibfile));
+ok($entry->parse_ok &&
+ $entry->type eq 'preamble' &&
+ $entry->metatype == BTE_PREAMBLE &&
+ $entry->value eq 'This is a preamble---the concatenation of several strings');
+
+ok(! $entry->read ($bibfile));
diff --git a/t/parse_f.t b/t/parse_f.t
new file mode 100644
index 0000000..a8af38c
--- /dev/null
+++ b/t/parse_f.t
@@ -0,0 +1,87 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use IO::Handle;
+use Test::More tests => 73;
+
+use vars qw($DEBUG);
+use Cwd;
+
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+
+}
+
+
+# ----------------------------------------------------------------------
+# entry creation and parsing from files
+
+my ($fh, $entry);
+
+my $regular_file = 'btparse/tests/data/regular.bib';
+
+# first, from a regular ol' Perl filehandle, with 'new' and 'parse'
+# bundled into one call
+open (BIB, $regular_file) || die "couldn't open $regular_file: $!\n";
+
+err_like sub { ok($entry = Text::BibTeX::Entry->new($regular_file, \*BIB)); },
+ qr!$regular_file, line 5, warning: undefined macro "junk"!;
+
+test_entry ($entry, 'book', 'abook',
+ [qw(title editor publisher year)],
+ ['A Book', 'John Q. Random', 'Foo Bar \& Sons', '1922']);
+ok(!Text::BibTeX::Entry->new($regular_file, \*BIB));
+
+
+# An interesting note: if I forget the 'seek' here, a bug is exposed in
+# btparse -- it crashes with an internal error if it hits eof twice in a
+# row. Should add a test for that bug to the official suite, once
+# it's fixed of course. ;-)
+
+seek (BIB, 0, 0);
+
+# now the same, separating the 'new' and 'parse' calls -- also a test
+# to see if we can pass undef for filename and get no filename in the
+# error message (and suffer no other consequences!)
+err_like sub { ok($entry->parse (undef, \*BIB)); },
+ qr!line 5, warning: undefined macro "junk"!;
+
+test_entry ($entry, 'book', 'abook',
+ [qw(title editor publisher year)],
+ ['A Book', 'John Q. Random', 'Foo Bar \& Sons', '1922']);
+ok(! $entry->parse (undef, \*BIB));
+
+close (BIB);
+
+# this is so I can stop checking the damned 'undefined macro' warning
+# -- guess I really do need a "set macro value" interface at some level...
+# (problem is that there's just one macro table for the whole process)
+
+ok($entry->parse_s ('@string(junk={, III})'));
+test_entry ($entry, 'string', undef, ['junk'], [', III']);
+
+# Now open that same file using IO::File, and pass in the resulting object
+# instead of a glob ref; everything else here is just the same
+
+$fh = IO::File->new($regular_file)
+ or die "couldn't open $regular_file: $!\n";
+no_err sub { ok($entry = Text::BibTeX::Entry->new($regular_file, $fh)); };
+
+test_entry ($entry, 'book', 'abook',
+ [qw(title editor publisher year)],
+ ['A Book', 'John Q. Random, III', 'Foo Bar \& Sons', '1922']);
+ok(! Text::BibTeX::Entry->new( $regular_file, $fh));
+$fh->seek (0, 0);
+
+# and again, with unbundled 'parse' call
+no_err sub { ok($entry->parse ($regular_file, $fh)); };
+
+test_entry ($entry, 'book', 'abook',
+ [qw(title editor publisher year)],
+ ['A Book', 'John Q. Random, III', 'Foo Bar \& Sons', '1922']);
+ok(! $entry->parse ($regular_file, $fh));
+
+$fh->close;
diff --git a/t/parse_s.t b/t/parse_s.t
new file mode 100644
index 0000000..70e8e52
--- /dev/null
+++ b/t/parse_s.t
@@ -0,0 +1,119 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+use utf8;
+use IO::Handle;
+use Test::More tests => 54;
+
+use vars qw($DEBUG);
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+
+# ----------------------------------------------------------------------
+# entry creation and parsing from a string
+
+my ($text, $entry, @warnings, $result, $text_uck, $entry_uck);
+
+$text = <<'TEXT';
+@foo { mykey,
+ f1 = {hello } # { there},
+ f2 = "fancy " # "that!" # foo # 1991,
+ f3 = foo
+ }
+TEXT
+
+# Test with a Unicode key
+$text_uck = <<'TEXT';
+@foo { mykeyŠ,
+ f1 = {f1val},
+ f2 = {f1val}
+ }
+TEXT
+
+ok($entry_uck = Text::BibTeX::Entry->new());
+ok($entry_uck->parse_s($text_uck));
+
+
+
+ok($entry = Text::BibTeX::Entry->new());
+
+err_like
+ sub { ok($entry->parse_s ($text)); },
+ qr/line 3, warning: undefined macro "foo".*line 4, warning: undefined macro "foo"/s;
+
+# First, low-level tests: make sure the data structure itself looks right
+ok($entry->{'status'});
+ok($entry->{'type'} eq 'foo');
+ok($entry->{'key'} eq 'mykey');
+ok(scalar @{$entry->{fields}} == 3);
+ok($entry->{fields}[0] eq 'f1' &&
+ $entry->{fields}[1] eq 'f2' &&
+ $entry->{fields}[2] eq 'f3');
+ok(scalar keys %{$entry->{'values'}} == 3);
+ok($entry->{'values'}{f1} eq 'hello there');
+
+
+
+# Now the same tests again, but using the object's methods
+test_entry ($entry, 'foo', 'mykey',
+ ['f1', 'f2', 'f3'],
+ ['hello there', 'fancy that!1991', '']);
+
+# Repeat with "bundled" form (new and parse_s in one go)
+
+err_like
+ sub { ok($entry = Text::BibTeX::Entry->new($text)); },
+ qr/line 3, warning: undefined macro "foo".*line 4, warning: undefined macro "foo"/s;
+
+# Repeat tests of entry contents
+test_entry ($entry, 'foo', 'mykey',
+ ['f1', 'f2', 'f3'],
+ ['hello there', 'fancy that!1991', '']);
+
+# Make sure parsing an empty string, or string with no entry in it,
+# just returns false
+
+$entry = Text::BibTeX::Entry->new();
+
+err_like sub {
+ $result = $entry->parse_s ('');
+ ok(! $result);
+}, qr!expected "@"!;
+ok(! $result);
+
+err_like sub {
+ $result = $entry->parse_s (undef);
+ $result = $entry->parse_s ('top-level junk that is not caught');
+}, qr!expected "@"!;
+ok(! $result);
+
+err_like sub {
+ $result = $entry->parse_s ('top-level junk that is not caught');
+}, qr!expected "@"!;
+ok(! $result);
+
+$result = $entry->parse_s (undef);
+ok(! $result);
+
+
+# Test the "proper noun at both ends" bug (the bt_get_text() call in
+# BibTeX.xs stripped off the leading and trailing braces; has since
+# been changed to bt_next_value(), under the assumption that compound
+# values will have been collapsed to a single simple value)
+
+# (thanks to Reiner Schotte for reporting this bug)
+
+$text = <<'TEXT';
+@foo{key, title = "{System}- und {Signaltheorie}"}
+TEXT
+
+no_err sub { $entry = Text::BibTeX::Entry->new($text); };
+
+ok($entry->parse_ok);
+test_entry ($entry, 'foo', 'key',
+ ['title'], ['{System}- und {Signaltheorie}']);
diff --git a/t/purify.t b/t/purify.t
new file mode 100644
index 0000000..1b9b3d6
--- /dev/null
+++ b/t/purify.t
@@ -0,0 +1,142 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use IO::Handle;
+use Test::More tests => 110;
+
+use vars qw($DEBUG);
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX', qw(purify_string));
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+#
+# purify.t
+#
+# Text::BibTeX test program -- compare my purify routine with known
+# results from BibTeX 0.99.
+#
+# $Id$
+#
+
+$DEBUG = 1;
+
+# make sure that purify_string doesn't modify its input string
+# (at least while it's *supposed* to act this way!)
+my ($in1, $in2, $out);
+$in1 = 'f{\"o}o';
+$in2 = $in1;
+$out = 'clobber me';
+$out = purify_string ($in2);
+is($in1, $in2);
+is($out, 'foo');
+
+is(length $in1, 7);
+is(length $in2, 7);
+is(length $out, 3);
+
+# These two *don't* come from BibTeX -- just borderline cases
+# that should be checked
+is(purify_string (''), '');
+ok(! defined purify_string (undef));
+
+
+# The "expected" results here are all taken directly from BibTeX, using
+# a special .bst file of my own devising. One problem is that BibTeX
+# strips trailing spaces from each line on output, which means that
+# "purified" strings ending with a space are not delivered exactly as
+# I expect them. However, BibTeX's text.length$ function does give the
+# correct length (including those trailing spaces), so at least I can
+# indirectly check that things are as I expect them to be.
+#
+# The upshot of all this is that the "expected purified strings" in the
+# table below are shorn of trailing spaces, but have accurate lengths.
+# My reasoning for doing things this way is that although it is (apparently)
+# BibTeX's output routines that do the space-stripping, there is no
+# way to get data out of BibTeX other than through its output routines.
+# Thus, if I'm going to compare my results with BibTeX's, I'd better be
+# prepared to deal with the stripped-spaces problem...so I am!
+
+my @tests =
+ (q[Bl{\"o}w, Jo{\'{e}} Q. and J.~R. R. Tolk{\u e}in and {Fo{\'o} Bar ~ {\aa}nd {\SS}on{\v{s}}, Ltd.}] =>
+ [58, 'Blow Joe Q and J R R Tolkein and Foo Bar aand SSonvs Ltd'],
+ q[] => [0, ''],
+ q[G{\"o}del] => [5, 'Godel'],
+ q[G{\" o}del] => [5, 'Godel'],
+ q[G{\" o }del] => [5, 'Godel'],
+ q[G{\"o }del] => [5, 'Godel'],
+ q[G{\"{o}}del] => [5, 'Godel'],
+ q[G{\" {o}}del] => [5, 'Godel'],
+ q[G{\" { o}}del] => [5, 'Godel'],
+ q[G{\" {o }}del] => [5, 'Godel'],
+ q[G{\" { o }}del] => [5, 'Godel'],
+ q[G{\" { o } }del] => [5, 'Godel'],
+ q[G{\"{o} }del] => [5, 'Godel'],
+ q[G{\" {o} }del] => [5, 'Godel'],
+ q[G{\"o foo}del] => [8, 'Gofoodel'],
+ q[G{\"foo}del] => [7, 'Gfoodel'],
+ q[G{\"{foo}}del] => [7, 'Gfoodel'],
+ q[{G\"odel}] => [5, 'Godel'],
+ q[G{\"o}del] => [5, 'Godel'],
+ q[G{\"{o}}del] => [5, 'Godel'],
+ q[{\ss}uper-duper] => [12, 'ssuper duper'],
+ q[{\ss }uper-duper] => [12, 'ssuper duper'],
+ q[{ \ss}uper-duper] => [13, ' ssuper duper'],
+ q[{\ss{}}uper-duper] => [12, 'ssuper duper'],
+ q[{\ss foo}uper-duper] => [15, 'ssfoouper duper'],
+ q[{\ss { }}uper-duper] => [12, 'ssuper duper'],
+ q[{\ss {foo}}uper-duper] => [15, 'ssfoouper duper'],
+ q[{\ss{foo}}uper-duper] => [15, 'ssfoouper duper'],
+ q[Tom{\`a}{\v s}] => [5, 'Tomas'],
+ q[Tom{\`a}{\v{s}}] => [5, 'Tomas'],
+ q[Tom{\`a}{{\v s}}] => [7, 'Tomav s'],
+ q[{Tom{\`a}{\v s}}] => [7, 'Tomav s'],
+ q[{Tom{\`a}{\v{s}}}] => [6, 'Tomavs'],
+ q[{Tom{\`a}{\v{ s}}}] => [7, 'Tomav s'],
+ q[{Tom{\`a}{\v{ s }}}] => [8, 'Tomav s'],
+ q[{\v s}] => [1, 's'],
+ q[{\x s}] => [1, 's'],
+ q[{\r s}] => [1, 's'],
+ q[{\foo s}] => [1, 's'],
+ q[{\oe}] => [2, 'oe'],
+ q[{\ae}] => [2, 'ae'],
+
+ # Handling of \aa is a bit problematic -- BibTeX 0.99 converts this
+ # special char. to "a", but my understanding of the Nordic languages
+ # leads me to believe it ought to be converted to "aa". (E.g.
+ # \AArhus is usually written "Aarhus" in English, not "Arhus".)
+ # Neither way will result in proper sorting (at least for Danish,
+ # where \aa comes at the end of the alphabet), but at least my way
+ # is consistent with the normal English rendering of \aa.
+# q[{\aa}] => [1, 'a'], # BibTeX 0.99's behaviour
+ q[{\aa}] => [2, 'aa'], # btparse's behaviour
+ q[{\AA}] => [2, 'Aa'],
+ q[{\o}] => [1, 'o'],
+ q[{\l}] => [1, 'l'],
+ q[{\ss}] => [2, 'ss'],
+ q[{\ae s}] => [3, 'aes'],
+ q[\TeX] => [3, 'TeX'],
+ q[{\TeX}] => [0, ''],
+ q[{{\TeX}}] => [3, 'TeX'],
+ q[{\foobar}] => [0, '']
+ );
+
+while (@tests)
+{
+ my $str = shift @tests;
+ my ($exp_length, $exp_purified) = @{shift @tests};
+
+ my $purified = purify_string ($str);
+ my $length = length $purified; # length before stripping
+ printf "[%s] -> [%s] (length %d) (expected [%s], length %d)\n",
+ $str, $purified, $length, $exp_purified, $exp_length
+ if $DEBUG;
+
+ $purified =~ s/ +$//; # strip trailing spaces
+ is($purified, $exp_purified);
+ is($length, $exp_length);
+}
+
diff --git a/t/split_names b/t/split_names
new file mode 100644
index 0000000..4f81d07
--- /dev/null
+++ b/t/split_names
@@ -0,0 +1,28 @@
+# not a real test suite! just an interactive "you give me da
+# name, I show ya how it splits up" driver.
+
+use strict;
+use Term::ReadLine;
+use Text::BibTeX;
+use Text::BibTeX::Name;
+
+sub show_name # print, for the name string given as first argument, each name part and its tokens
+{
+ my $str = shift;
+ my $name = Text::BibTeX::Name->new($str);
+ my $part;
+
+ foreach $part (qw(first last von jr)) # one output line per part, in this order
+ {
+ my @tokens = $name->part ($part);
+ printf " %-5s => (%s)\n",
+ $part,
+ join (", ", map (qq["$_"], @tokens)); # each token wrapped in double quotes, comma-separated
+ }
+}
+
+my $rl = Term::ReadLine->new('BibTeX name splitter');
+while (defined ($_ = $rl->readline (">> ")))
+{
+ show_name ($_);
+}
diff --git a/t/unlimited.bib b/t/unlimited.bib
new file mode 100644
index 0000000..14aa3f5
--- /dev/null
+++ b/t/unlimited.bib
@@ -0,0 +1,3 @@
+@AAAAA{
+@BBBBB{
+CCCCCC={}}
diff --git a/t/unlimited.t b/t/unlimited.t
new file mode 100644
index 0000000..fb96171
--- /dev/null
+++ b/t/unlimited.t
@@ -0,0 +1,29 @@
+# -*- cperl -*-
+use strict;
+use warnings;
+
+use Capture::Tiny 'capture';
+use IO::Handle;
+use Test::More tests => 4;
+
+use vars qw($DEBUG);
+use Cwd;
+BEGIN {
+ use_ok('Text::BibTeX');
+ my $common = getcwd()."/t/common.pl";
+ require $common;
+}
+
+$DEBUG = 0;
+
+
+# ----------------------------------------------------------------------
+# entry creation and parsing from a Text::BibTeX::File object
+
+my ($bibfile, $entry);
+my $multiple_file = 't/unlimited.bib';
+
+ok($bibfile = Text::BibTeX::File->new( $multiple_file));
+err_like sub { ok($entry = Text::BibTeX::Entry->new( $bibfile)) },
+ qr!warning: possible runaway string started at line!;
+
diff --git a/typemap b/typemap
new file mode 100644
index 0000000..3298aca
--- /dev/null
+++ b/typemap
@@ -0,0 +1,31 @@
+bt_name * T_NAME
+bt_name_format * T_NAME_FORMAT
+bt_namepart T_IV
+bt_joinmethod T_IV
+boolean T_BOOL
+
+# ----------------------------------------------------------------------
+INPUT
+
+# this is needed so we can pass 'undef' for the filename to
+# Text::BibTeX::Entry::parse and have it wind up as NULL
+# in bt_parse_entry()
+T_PV
+ SvGETMAGIC($arg);
+ $var = SvOK ($arg) ? ($type) SvPV_nomg($arg, PL_na) : NULL;
+
+T_NAME
+ $var = (bt_name *) SvIV ($arg)
+
+T_NAME_FORMAT
+ $var = (bt_name_format *) SvIV ($arg)
+
+T_BOOL
+ $var = (SvOK ($arg)) ? (int) SvIV ($arg) : 0
+
+
+# ----------------------------------------------------------------------
+OUTPUT
+
+#T_NAME_FORMAT
+# $arg = (IV) $var
diff --git a/xscode/BibTeX.xs b/xscode/BibTeX.xs
new file mode 100644
index 0000000..4f24192
--- /dev/null
+++ b/xscode/BibTeX.xs
@@ -0,0 +1,572 @@
+/* ------------------------------------------------------------------------
+@NAME : BibTeX.xs
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Glue between my `btparse' library and the Perl module
+ Text::BibTeX. Provides the following functions to Perl:
+ Text::BibTeX::constant
+ Text::BibTeX::initialize
+ Text::BibTeX::cleanup
+ Text::BibTeX::split_list
+ Text::BibTeX::purify_string
+ Text::BibTeX::Entry::_parse_s
+ Text::BibTeX::Entry::_parse
+ Text::BibTeX::Name::split
+ Text::BibTeX::Name::free
+ Text::BibTeX::add_macro_text
+ Text::BibTeX::delete_macro
+ Text::BibTeX::delete_all_macros
+ Text::BibTeX::macro_length
+ Text::BibTeX::macro_text
+@GLOBALS :
+@CALLS :
+@CREATED : Jan/Feb 1997, Greg Ward
+@MODIFIED :
+@VERSION : $Id: BibTeX.xs 7399 2009-06-01 21:22:51Z ambs $
+-------------------------------------------------------------------------- */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#define BT_DEBUG 0
+
+#include "btparse.h"
+#include "btxs_support.h"
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX
+
+# XSUBs with no corresponding functions in the C library (hence no prefix
+# for this section):
+# constant
+
+SV *
+constant(name)
+    char *  name
+
+    PREINIT:
+        IV  i;
+
+    CODE:
+        /*
+         * Look `name' up with the C-level constant() helper.  Returns the
+         * constant's integer value, or undef when the helper reports that
+         * the name is unknown.  The return slot is filled in directly, so
+         * there is deliberately no OUTPUT section.
+         *
+         * `i' lives in PREINIT so that its declaration precedes the
+         * statements emitted for the `name' argument conversion.
+         */
+        if (constant (name, &i))
+            ST(0) = sv_2mortal (newSViv (i));
+        else
+            ST(0) = &PL_sv_undef;
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX PREFIX = bt_
+
+# XSUBs that consist solely of calls to corresponding C functions in the
+# library:
+# initialize
+# cleanup
+
+# Neither function takes arguments or returns a value, so each XSUB is
+# nothing more than its declaration; with PREFIX = bt_ they are visible
+# from Perl without the bt_ prefix.
+void
+bt_initialize()
+
+void
+bt_cleanup()
+
+
+# XSUBs that still go right into the Text::BibTeX package (ie. they don't
+# really belong in one of the subsidiary packages), but need a bit of work
+# to convert the C data to Perl form:
+# split_list
+# purify_string
+
+void
+bt_isplit_list (string, delim, filename=NULL, line=0, description=NULL)
+    char *  string
+    char *  delim
+    char *  filename
+    int     line
+    char *  description
+
+    PREINIT:
+        bt_stringlist * pieces;
+        int             idx;
+
+    PPCODE:
+        /*
+         * Split `string' on `delim' with bt_split_list() and return the
+         * pieces to Perl as a list of mortal strings.  A NULL piece becomes
+         * undef; a NULL result from bt_split_list() becomes an empty list.
+         * `filename', `line' and `description' are passed through to the
+         * library unchanged.
+         */
+        pieces = bt_split_list (string, delim, filename, line, description);
+        if (!pieces)
+            XSRETURN_EMPTY;             /* return empty list to perl */
+
+        EXTEND (sp, pieces->num_items);
+        idx = 0;
+        while (idx < pieces->num_items)
+        {
+            char * piece = pieces->items[idx++];
+
+            if (piece)
+                PUSHs (sv_2mortal (newSVpv (piece, 0)));
+            else
+                PUSHs (&PL_sv_undef);
+        }
+
+        /* the Perl strings are copies, so the C list can go now */
+        bt_free_list (pieces);
+
+
+SV *
+bt_purify_string (instr, options=0)
+    char *  instr
+    int     options
+
+    PREINIT:
+        char *  buffer;
+
+    CODE:
+        /*
+         * Return a purified copy of `instr'; the caller's string is left
+         * untouched.  A NULL `instr' (Perl undef) produces an empty return
+         * list.
+         */
+        if (!instr)
+            XSRETURN_EMPTY;
+
+        RETVAL = newSVpv (instr, 0);
+        buffer = SvPVX (RETVAL);
+        bt_purify_string (buffer, (btshort) options);
+
+        /* the copy was rewritten in place, so resync the SV's length */
+        SvCUR_set (RETVAL, strlen (buffer));
+
+    OUTPUT:
+        RETVAL
+
+
+# Here's an alternate formulation of `purify_string' that acts more like
+# the C function (and less like nice Perl): it modifies the input string
+# in place, and returns nothing. In addition to being weird Perl,
+# this contradicts the documentation. And it would be impossible
+# to replicate this behaviour in a similar Python extension... all
+# round, a bad idea!
+
+## void
+## bt_purify_string (str, options=0)
+
+## char * str
+## int options
+
+## CODE:
+## if (str != NULL)
+## bt_purify_string (str, (btshort) options);
+## sv_setpv (ST(0), str);
+
+
+SV *
+bt_change_case (transform, string, options=0)
+    char    transform
+    char *  string
+    int     options
+
+    CODE:
+        /*
+         * Return a case-converted copy of `string'; the caller's string is
+         * left untouched.  `transform' selects the conversion and is passed
+         * straight to the library.  A NULL `string' (Perl undef) produces an
+         * empty return list.
+         */
+        if (string == NULL)
+            XSRETURN_EMPTY;
+
+        /* trace only after the NULL check, since it prints `string' */
+        DBG_ACTION
+            (1, printf ("XSUB change_case: transform=%c, string=%p (%s)\n",
+                        transform, string, string))
+
+        RETVAL = newSVpv (string, 0);
+        bt_change_case (transform, SvPVX (RETVAL), (btshort) options);
+
+        /*
+         * The copy is rewritten in place; resync the SV's length with the
+         * C string (as purify_string does) in case the conversion changed
+         * where the terminating NUL sits.
+         */
+        SvCUR_set (RETVAL, strlen (SvPVX (RETVAL)));
+
+    OUTPUT:
+        RETVAL
+
+
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX::Entry
+
+# The two XSUBs that go to the Text::BibTeX::Entry package; both rely on
+# ast_to_hash() to do the appropriate "convert to Perl form" work:
+# _parse
+# _parse_s
+# These XSUBs reset the internal parser states:
+# _reset_parse
+# _reset_parse_s
+
+int
+_parse (entry_ref, filename, file, preserve=FALSE)
+    SV *    entry_ref;
+    char *  filename;
+    FILE *  file;
+    boolean preserve;
+
+    PREINIT:
+        boolean parse_ok;
+        AST *   ast;
+
+    CODE:
+        /*
+         * Parse the next entry from `file' and load it into the hash behind
+         * `entry_ref'.  Returns false to Perl when bt_parse_entry() yields
+         * no AST (end of file) and true otherwise; the parse status itself
+         * is stored in the hash by ast_to_hash(), which also frees the AST.
+         */
+        ast = bt_parse_entry (file, filename, (btshort) 0, &parse_ok);
+        DBG_ACTION
+            (2, dump_ast ("BibTeX.xs:parse: AST from bt_parse_entry():\n", ast))
+
+        if (ast == NULL)                /* at EOF */
+            XSRETURN_NO;
+
+        ast_to_hash (entry_ref, ast, parse_ok, preserve);
+        XSRETURN_YES;
+
+
+int
+_reset_parse ()
+
+    PREINIT:
+        boolean unused_status;
+
+    CODE:
+        /*
+         * Reset the file parser's internal state by calling
+         * bt_parse_entry() with no file and no filename.  The status it
+         * reports is ignored; false is always returned to Perl.
+         */
+        bt_parse_entry (NULL, NULL, (btshort) 0, &unused_status);
+        XSRETURN_NO;
+
+
+int
+_parse_s (entry_ref, text, preserve=FALSE)
+    SV *    entry_ref;
+    char *  text;
+    boolean preserve;
+
+    PREINIT:
+        boolean parse_ok;
+        AST *   ast;
+
+    CODE:
+        /*
+         * Parse one entry from the string `text' (no filename, line 1) and
+         * load it into the hash behind `entry_ref'.  Returns false to Perl
+         * when no entry is found and true otherwise; ast_to_hash() records
+         * the parse status in the hash and frees the AST.
+         */
+        ast = bt_parse_entry_s (text, NULL, 1, (btshort) 0, &parse_ok);
+        if (ast == NULL)                /* no entry found */
+            XSRETURN_NO;
+
+        ast_to_hash (entry_ref, ast, parse_ok, preserve);
+        XSRETURN_YES;
+
+
+int
+_reset_parse_s ()
+
+    PREINIT:
+        boolean unused_status;
+
+    CODE:
+        /*
+         * Reset the string parser's internal state by calling
+         * bt_parse_entry_s() with no text.  The status it reports is
+         * ignored; false is always returned to Perl.
+         */
+        bt_parse_entry_s (NULL, NULL, 1, (btshort) 0, &unused_status);
+        XSRETURN_NO;
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX::Name
+
+# The XSUBs that go in the Text::BibTeX::Name package (ie. that operate
+# on name objects):
+# split
+# free
+
+#if BT_DEBUG
+
+void
+dump_name (hashref)
+    SV *  hashref
+
+    PREINIT:
+        SV **  slot;
+
+    CODE:
+        /*
+         * Debug builds only: fetch the C name structure stored under
+         * "_cstruct" in the hash behind `hashref' and dump it, or warn if
+         * the hash has no such member.
+         */
+        slot = hv_fetch ((HV *) SvRV (hashref), "_cstruct", 8, 0);
+        if (slot)
+            dump_name ((bt_name *) SvIV (*slot));  /* currently in format_name.c */
+        else
+            warn ("Name::dump: no _cstruct member in hash");
+
+#endif
+
+
+void
+_split (name_hashref, name, filename, line, name_num, keep_cstruct)
+    SV *    name_hashref
+    char *  name
+    char *  filename
+    int     line
+    int     name_num
+    int     keep_cstruct
+
+    PREINIT:
+        HV *      name_hash;
+        SV *      sv_old_name;
+        bt_name * old_name;
+        bt_name * name_split;
+
+    CODE:
+        /*
+         * Split `name' with bt_split_name() and store its first/von/last/jr
+         * token lists in the hash behind `name_hashref' (a part that is
+         * absent has its key deleted).  Any C structure left under
+         * "_cstruct" by an earlier split is removed and freed first.  The
+         * new structure is stored under "_cstruct" as an integer when
+         * `keep_cstruct' is true, and freed immediately otherwise.
+         * `filename', `line' and `name_num' are passed to the library
+         * unchanged.  Croaks if `name_hashref' is not a hash reference.
+         */
+        if (! (SvROK (name_hashref) &&
+               SvTYPE (SvRV (name_hashref)) == SVt_PVHV))
+            croak ("name_hashref is not a hash reference");
+        name_hash = (HV *) SvRV (name_hashref);
+
+        DBG_ACTION (1,
+        {
+            printf ("XS Name::_split:\n");
+            printf (" name_hashref=%p, name_hash=%p\n",
+                    (void *) name_hashref, (void *) name_hash);
+            printf (" name=%p (%s), filename=%p (%s)\n",
+                    name, name, filename, filename);
+            printf (" line=%d, name_num=%d, keep_cstruct=%d\n",
+                    line, name_num, keep_cstruct);
+        })
+
+        sv_old_name = hv_delete (name_hash, "_cstruct", 8, 0);
+        if (sv_old_name)
+        {
+            /* INT2PTR/PTR2IV keep the pointer<->integer round trip clean
+               on builds where an IV and a pointer differ in size */
+            old_name = INT2PTR (bt_name *, SvIV (sv_old_name));
+            DBG_ACTION
+                (1, printf ("XS Name::_split: name hash had old C structure "
+                            "(%d tokens, first was >%s<) -- freeing it\n",
+                            old_name->tokens->num_items,
+                            old_name->tokens->items[0]))
+            bt_free_name (old_name);
+        }
+
+        name_split = bt_split_name (name, filename, line, name_num);
+        DBG_ACTION (1, printf ("XS Name::_split: back from bt_split_name, "
+                               "calling store_stringlist x 4\n"))
+
+        store_stringlist (name_hash, "first",
+                          name_split->parts[BTN_FIRST],
+                          name_split->part_len[BTN_FIRST]);
+        store_stringlist (name_hash, "von",
+                          name_split->parts[BTN_VON],
+                          name_split->part_len[BTN_VON]);
+        store_stringlist (name_hash, "last",
+                          name_split->parts[BTN_LAST],
+                          name_split->part_len[BTN_LAST]);
+        store_stringlist (name_hash, "jr",
+                          name_split->parts[BTN_JR],
+                          name_split->part_len[BTN_JR]);
+
+        DBG_ACTION (1,
+        {
+            char ** last = name_split->parts[BTN_LAST];
+            char ** first = name_split->parts[BTN_FIRST];
+
+            printf ("XS Name::_split: name has %d tokens; "
+                    "last[0]=%s, first[0]=%s\n",
+                    name_split->tokens->num_items,
+                    last ? last[0] : "*no last name*",
+                    first ? first[0] : "*no first name*");
+        })
+
+        if (keep_cstruct)
+        {
+            hv_store (name_hash, "_cstruct", 8,
+                      newSViv (PTR2IV (name_split)), 0);
+            DBG_ACTION
+                (1, printf ("XS Name::_split: storing pointer to structure %p\n",
+                            name_split))
+        }
+        else
+        {
+            bt_free_name (name_split);
+        }
+
+
+void
+free (name_hashref)
+    SV *  name_hashref
+
+    PREINIT:
+        HV *      name_hash;
+        SV *      sv_name;
+        bt_name * name;
+
+    CODE:
+        /*
+         * Free the C name structure stored under "_cstruct" in the hash
+         * behind `name_hashref', if there is one.  Croaks if `name_hashref'
+         * is not a hash reference (the same check _split makes).
+         */
+        if (! (SvROK (name_hashref) &&
+               SvTYPE (SvRV (name_hashref)) == SVt_PVHV))
+            croak ("name_hashref is not a hash reference");
+        name_hash = (HV *) SvRV (name_hashref);
+
+        /*
+         * Take the entry out of the hash while fetching it, so the hash
+         * never keeps a pointer to freed memory: a second free(), or a
+         * later _split on the same hash, then finds nothing to free again.
+         */
+        sv_name = hv_delete (name_hash, "_cstruct", 8, 0);
+        if (sv_name != NULL)
+        {
+            name = INT2PTR (bt_name *, SvIV (sv_name));
+            DBG_ACTION (1, printf ("XS Name::free: freeing name %p\n", name))
+            bt_free_name (name);
+        }
+#if BT_DEBUG >= 1
+        else
+        {
+            printf ("XS Name::free: no C structure to free!\n");
+        }
+#endif
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX::NameFormat
+
+IV
+create (parts="fvlj", abbrev_first=FALSE)
+    char *  parts
+    bool    abbrev_first
+
+    CODE:
+        /*
+         * Create a name format with bt_create_name_format() and return the
+         * pointer to it as an integer.  `parts' defaults to "fvlj" and
+         * `abbrev_first' to false; both go to the library unchanged.
+         */
+        DBG_ACTION
+            (1, printf ("XS NameFormat::create: "
+                        "creating name format: parts=\"%s\", abbrev=%d\n",
+                        parts, abbrev_first));
+
+        /* PTR2IV keeps the pointer-to-integer conversion clean on builds
+           where an IV and a pointer differ in size */
+        RETVAL = PTR2IV (bt_create_name_format (parts, abbrev_first));
+
+    OUTPUT:
+        RETVAL
+
+
+void
+free (format)
+    bt_name_format *  format
+
+    CODE:
+        /* hand the format structure straight back to the library */
+        bt_free_name_format (format);
+
+
+#if BT_DEBUG
+
+void
+dump_format (hashref)
+    SV *  hashref
+
+    PREINIT:
+        SV **  slot;
+
+    CODE:
+        /*
+         * Debug builds only: fetch the C format structure stored under
+         * "_cstruct" in the hash behind `hashref' and dump it, or warn if
+         * the hash has no such member.
+         */
+        slot = hv_fetch ((HV *) SvRV (hashref), "_cstruct", 8, 0);
+        if (slot)
+            dump_format ((bt_name_format *) SvIV (*slot));  /* currently in format_name.c */
+        else
+            warn ("NameFormat::dump: no _cstruct member in hash");
+
+#endif
+
+
+# Thin wrapper around bt_set_format_text(): passes the four text strings
+# for one name part of `format' to the library without copying them.
+void
+_set_text (format, part, pre_part, post_part, pre_token, post_token)
+ bt_name_format * format
+ bt_namepart part
+ char * pre_part
+ char * post_part
+ char * pre_token
+ char * post_token
+
+ CODE:
+#if BT_DEBUG >= 2
+ {
+ /* trace the arguments and the format before the change;
+ `joinmethods' is declared here but not used by this trace */
+ static char * nameparts[] =
+ { "first", "von", "last", "jr" };
+ static char * joinmethods[] =
+ {"may tie", "space", "force tie", "nothing"};
+
+ printf ("XS NameFormat::_set_text:\n");
+ printf (" format=%p, namepart=%d (%s)\n",
+ format, part, nameparts[part]);
+ printf (" format currently is:\n");
+ dump_format (format);
+ printf (" pre_part=%s, post_part=%s\n", pre_part, post_part);
+ printf (" pre_token=%s, post_token=%s\n", pre_token, post_token);
+ }
+#endif
+
+ /*
+ * No memory leak here -- just copy the pointers. At first
+ * blush, it might seem that we're opening ourselves up to
+ * the possibility of dangling pointers if the Perl strings
+ * that these char *'s refer to ever go away. However, this
+ * is taken care of at the Perl level -- see the comment
+ * in BibTeX/NameFormat.pm, sub set_text.
+ */
+
+ bt_set_format_text (format, part,
+ pre_part, post_part, pre_token, post_token);
+#if BT_DEBUG >= 2
+ /* trace the format again so the effect of the call is visible */
+ printf ("XS NameFormat::_set_text: after call, format is:\n");
+ dump_format (format);
+#endif
+
+
+# Thin wrapper around bt_set_format_options(): passes the abbreviation flag
+# and the two join methods for one name part of `format' to the library.
+void
+_set_options (format, part, abbrev, join_tokens, join_part)
+ bt_name_format * format
+ bt_namepart part
+ bool abbrev
+ bt_joinmethod join_tokens
+ bt_joinmethod join_part
+
+ CODE:
+ /* level-2 trace of every argument; compiled out when BT_DEBUG is 0 */
+ DBG_ACTION (2,
+ printf ("XS _set_options: format=%p, part=%d, "
+ "abbrev=%d, join_tokens=%d, join_part=%d\n",
+ format, part, abbrev, join_tokens, join_part))
+ bt_set_format_options (format, part,
+ abbrev, join_tokens, join_part);
+
+
+# Formats `name' according to `format' via bt_format_name() and returns
+# the resulting string to Perl.
+#
+# NOTE(review): nothing here releases the char * that bt_format_name()
+# returns.  Should btparse allocate that string for its caller, every call
+# leaks it -- confirm ownership against the btparse documentation.
+char *
+format_name (name, format)
+ bt_name * name
+ bt_name_format * format
+
+ CODE:
+ DBG_ACTION
+ (2, printf ("XS format_name: name=%p, format=%p\n", name, format))
+ RETVAL = bt_format_name (name, format);
+ DBG_ACTION
+ (1, printf ("XS format_name: formatted name=%s\n", RETVAL))
+
+ OUTPUT:
+ RETVAL
+
+
+MODULE = Text::BibTeX PACKAGE = Text::BibTeX PREFIX = bt_
+
+# Macro-table XSUBs.  Each one is a bare declaration, so the generated glue
+# calls the btparse function of the same name directly; with PREFIX = bt_
+# they are visible from Perl without the bt_ prefix.  `filename' and `line'
+# are optional wherever they appear (defaults NULL and 0).
+void
+bt_add_macro_text (macro, text, filename=NULL, line=0)
+ char * macro
+ char * text
+ char * filename
+ int line
+
+void
+bt_delete_macro (macro)
+ char * macro
+
+void
+bt_delete_all_macros ()
+
+int
+bt_macro_length (macro)
+ char * macro
+
+char *
+bt_macro_text (macro, filename=NULL, line=0)
+ char * macro
+ char * filename
+ int line
+
+
+# This bootstrap code is used to make btparse do "minimal post-processing"
+# on all entries. That way, we can control how much is done on a per-entry
+# basis by simply calling bt_postprocess_entry() ourselves.
+#
+# The need to do this means that btparse is somewhat brain-damaged -- I
+# should be able to specify the per-entry processing options when I call
+# bt_parse_entry()! Shouldn't be too hard to fix....
+BOOT:
+ /* One call per entry metatype, run when the module is bootstrapped.
+ * The 0 presumably selects "no string-processing options", matching
+ * the comment above -- TODO confirm against btparse. */
+ bt_set_stringopts (BTE_MACRODEF, 0);
+ bt_set_stringopts (BTE_REGULAR, 0);
+ bt_set_stringopts (BTE_COMMENT, 0);
+ bt_set_stringopts (BTE_PREAMBLE, 0);
+
diff --git a/xscode/btxs_support.c b/xscode/btxs_support.c
new file mode 100644
index 0000000..39c662b
--- /dev/null
+++ b/xscode/btxs_support.c
@@ -0,0 +1,488 @@
+/* ------------------------------------------------------------------------
+@NAME : btxs_support.c
+@DESCRIPTION: Support functions needed by the XSUBs in BibTeX.xs.
+@GLOBALS :
+@CREATED : 1997/11/16, Greg Ward (from code in BibTeX.xs)
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved.
+-------------------------------------------------------------------------- */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#define BT_DEBUG 0
+
+#include "btparse.h"
+#include "btxs_support.h"
+
+
+static char *nodetype_names[] =
+{
+ "entry", "macrodef", "text", "key", "field", "string", "number", "macro"
+};
+
+
+/* ----------------------------------------------------------------------
+ * Miscellaneous stuff
+ */
+
+/*
+ * constant()
+ *
+ * Look up the btparse constant called `name' (e.g. "BTE_REGULAR").
+ *
+ *   name  constant name; NULL is treated as "unknown"
+ *   arg   receives the constant's value when the name is recognised
+ *
+ * Returns TRUE if `name' was recognised (and *arg was set), FALSE if not.
+ * Croaks if `name' is non-NULL but does not start with "BT".
+ */
+int
+constant (char * name, IV * arg)
+{
+    int ok = FALSE;
+
+    /* the XSUB's typemap turns a Perl undef into NULL; don't dereference it */
+    if (name == NULL)
+        return FALSE;
+
+    DBG_ACTION (1, printf ("constant: name=%s\n", name));
+
+    if (! (name[0] == 'B' && name[1] == 'T'))  /* should not happen! */
+        croak ("Illegal constant name \"%s\"", name);
+
+    /*
+     * name[2] is safe to read: for the two-character name "BT" it is the
+     * terminating NUL, which falls through to `default'.
+     */
+    switch (name[2])
+    {
+        case 'E':                       /* entry metatypes */
+            if      (strEQ (name, "BTE_UNKNOWN"))  { *arg = BTE_UNKNOWN;  ok = TRUE; }
+            else if (strEQ (name, "BTE_REGULAR"))  { *arg = BTE_REGULAR;  ok = TRUE; }
+            else if (strEQ (name, "BTE_COMMENT"))  { *arg = BTE_COMMENT;  ok = TRUE; }
+            else if (strEQ (name, "BTE_PREAMBLE")) { *arg = BTE_PREAMBLE; ok = TRUE; }
+            else if (strEQ (name, "BTE_MACRODEF")) { *arg = BTE_MACRODEF; ok = TRUE; }
+            break;
+        case 'A':                       /* AST nodetypes (not all of them) */
+            if      (strEQ (name, "BTAST_STRING")) { *arg = BTAST_STRING; ok = TRUE; }
+            else if (strEQ (name, "BTAST_NUMBER")) { *arg = BTAST_NUMBER; ok = TRUE; }
+            else if (strEQ (name, "BTAST_MACRO"))  { *arg = BTAST_MACRO;  ok = TRUE; }
+            break;
+        case 'N':                       /* name parts */
+            if      (strEQ (name, "BTN_FIRST"))    { *arg = BTN_FIRST;    ok = TRUE; }
+            else if (strEQ (name, "BTN_VON"))      { *arg = BTN_VON;      ok = TRUE; }
+            else if (strEQ (name, "BTN_LAST"))     { *arg = BTN_LAST;     ok = TRUE; }
+            else if (strEQ (name, "BTN_JR"))       { *arg = BTN_JR;       ok = TRUE; }
+            else if (strEQ (name, "BTN_NONE"))     { *arg = BTN_NONE;     ok = TRUE; }
+            break;
+        case 'J':                       /* token join methods */
+            if      (strEQ (name, "BTJ_MAYTIE"))   { *arg = BTJ_MAYTIE;   ok = TRUE; }
+            else if (strEQ (name, "BTJ_SPACE"))    { *arg = BTJ_SPACE;    ok = TRUE; }
+            else if (strEQ (name, "BTJ_FORCETIE")) { *arg = BTJ_FORCETIE; ok = TRUE; }
+            else if (strEQ (name, "BTJ_NOTHING"))  { *arg = BTJ_NOTHING;  ok = TRUE; }
+            break;
+        default:
+            break;
+    }
+
+    return ok;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Stuff for converting a btparse entry AST to a Perl structure:
+ * convert_value() [private]
+ * convert_assigned_entry() [private]
+ * convert_value_entry() [private]
+ * ast_to_hash()
+ */
+
+/*
+ * convert_value()
+ *
+ * Convert the value of one field (or of a whole "value" entry) into a new
+ * SV that the caller owns.
+ *
+ *   field_name  used only in the error message and debug trace; the
+ *               preserving caller for value entries passes NULL
+ *   field       AST node whose simple values are walked
+ *   preserve    true:  return a Text::BibTeX::Value object -- a blessed
+ *                      reference to a list of Text::BibTeX::SimpleValue
+ *                      objects, each a blessed [nodetype, text] pair
+ *               false: return the value's text as a plain string, or a
+ *                      fresh undefined scalar if there is no text
+ *
+ * Croaks if either class stash cannot be obtained, or if `preserve' is
+ * false and the value is not exactly one string.
+ */
+static SV *
+convert_value (char * field_name, AST * field, boolean preserve)
+{
+    AST *       value;
+    bt_nodetype nodetype;
+    char *      text = NULL;    /* stays NULL if the library sets nothing */
+    SV *        sv_field_value;
+
+    value = bt_next_value (field, NULL, &nodetype, &text);
+    if (preserve)
+    {
+        HV * val_stash;          /* stash for Text::BibTeX::Value pkg */
+        HV * sval_stash;         /* and for Text::BibTeX::SimpleValue */
+        AV * compound_value;     /* list of simple values */
+        SV * sval_contents[2];   /* type and text */
+        AV * simple_value;       /* list of (type, text) */
+        SV * simple_value_ref;   /* ref to simple_value */
+
+        /*
+         * Get the stashes for the two classes into which we'll be
+         * blessing things.
+         */
+        val_stash = gv_stashpv ("Text::BibTeX::Value", TRUE);
+        sval_stash = gv_stashpv ("Text::BibTeX::SimpleValue", TRUE);
+
+        if (val_stash == NULL || sval_stash == NULL) {
+            croak ("unable to get stash for one or both of "
+                   "Text::BibTeX::Value or Text::BibTeX::SimpleValue");
+        }
+
+        /* Start the compound value as an empty list */
+        compound_value = newAV ();
+
+        /* Walk the list of simple values */
+        while (value)
+        {
+            /*
+             * Convert the nodetype and text to SVs and save them in what
+             * will soon become a Text::BibTeX::SimpleValue object.
+             */
+            sval_contents[0] = newSViv ((IV) nodetype);
+            sval_contents[1] = newSVpv (text, 0);
+            simple_value = av_make (2, sval_contents);
+
+            /*
+             * av_make() copied the two SVs, so our originals are no longer
+             * needed -- decrement them out of existence.
+             */
+            SvREFCNT_dec (sval_contents[0]);
+            SvREFCNT_dec (sval_contents[1]);
+
+            /* Create the SimpleValue object by blessing a reference */
+            simple_value_ref = newRV_noinc ((SV *) simple_value);
+            sv_bless (simple_value_ref, sval_stash);
+
+            /* Push this SimpleValue object onto the main list */
+            av_push (compound_value, simple_value_ref);
+
+            /* And find the next simple value in this field */
+            value = bt_next_value (field, value, &nodetype, &text);
+        }
+
+        /* Make a Text::BibTeX::Value object from our list of SimpleValues */
+        sv_field_value = newRV_noinc ((SV *) compound_value);
+        sv_bless (sv_field_value, val_stash);
+    }
+    else
+    {
+        if (value &&
+            (nodetype != BTAST_STRING ||
+             bt_next_value (field, value, NULL, NULL) != NULL))
+        {
+            croak ("BibTeX.xs: internal error in entry post-processing--"
+                   "value for field %s is not a simple string",
+                   field_name);
+        }
+
+        DBG_ACTION (2, printf (" field=%s, value=\"%s\"\n",
+                               field_name, text));
+
+        /*
+         * The result ends up as a hash value in the callers.  Hand back a
+         * fresh undefined scalar rather than the shared &PL_sv_undef, which
+         * should not be stored in a hash.
+         */
+        sv_field_value = text ? newSVpv (text, 0) : newSV (0);
+    }
+
+    return sv_field_value;
+} /* convert_value () */
+
+
+/*
+ * convert_assigned_entry()
+ *
+ * Fill `entry' from an entry made of "name = value" fields (regular
+ * entries and macro definitions).  Three members are stored:
+ *
+ *   fields  reference to the list of field names, in file order
+ *   values  reference to a hash mapping field name to its value (a string,
+ *           or a Text::BibTeX::Value object when `preserve' is true)
+ *   lines   reference to a hash of line numbers: one per field name, plus
+ *           "START" (line of the entry) and "STOP"
+ *
+ * "STOP" is the line of the last field, or the entry's own line if it has
+ * no fields.  That is not the true end of the entry: by the time we see
+ * the AST the closing delimiter is long gone, so this is the best we get.
+ */
+static void
+convert_assigned_entry (AST *top, HV *entry, boolean preserve)
+{
+    AV *   flist;               /* the field list -- put into entry */
+    HV *   values;              /* the field values -- put into entry */
+    HV *   lines;               /* line numbers of entry and its fields */
+    AST *  field;
+    char * field_name;
+    int    prev_line;
+
+    lines = newHV ();
+    hv_store (lines, "START", 5, newSViv (top->line), 0);
+
+    /* an entry without fields stops where it starts */
+    prev_line = top->line;
+
+    DBG_ACTION (2, printf (" creating field list, value hash\n"));
+    flist = newAV ();
+    values = newHV ();
+
+    DBG_ACTION (2, printf (" getting fields and values\n"));
+    field = bt_next_field (top, NULL, &field_name);
+    while (field)
+    {
+        /*
+         * A field without a name shouldn't happen; if it does, skip it.
+         * The advance below still runs, so the loop always moves on.
+         */
+        if (field_name)
+        {
+            SV * sv_field_name;
+            SV * sv_field_value;
+
+            /* Convert the field name to an SV (for the field list) */
+            sv_field_name = newSVpv (field_name, 0);
+
+            /*
+             * Convert the field value to an SV; this might be just a
+             * string, or it might be a reference to a Text::BibTeX::Value
+             * object (if 'preserve' is true).
+             */
+            sv_field_value = convert_value (field_name, field, preserve);
+
+            /*
+             * Push the field name onto the field list, add the field value
+             * to the values hash, and record the field's line number.
+             */
+            av_push (flist, sv_field_name);
+            hv_store (values, field_name, strlen (field_name),
+                      sv_field_value, 0);
+            hv_store (lines, field_name, strlen (field_name),
+                      newSViv (field->line), 0);
+            prev_line = field->line;   /* so we can duplicate last line no. */
+        }
+
+        field = bt_next_field (top, field, &field_name);
+        DBG_ACTION (2, printf (" stored field/value; next will be %s\n",
+                               field_name ? field_name : "*none*"));
+    }
+
+    /*
+     * Duplicate the last line number as "STOP" (kludge until we keep track
+     * of the true end-of-entry line number).
+     */
+    hv_store (lines, "STOP", 4, newSViv (prev_line), 0);
+
+    /*
+     * Put refs to field list, value hash, and line hash into the main hash.
+     * newRV_noinc() hands our only reference to each container over to the
+     * new RV, so the containers are freed along with the entry.
+     */
+    DBG_ACTION (2, printf (" got all fields; storing list/hash refs\n"));
+    hv_store (entry, "fields", 6, newRV_noinc ((SV *) flist), 0);
+    hv_store (entry, "values", 6, newRV_noinc ((SV *) values), 0);
+    hv_store (entry, "lines", 5, newRV_noinc ((SV *) lines), 0);
+
+} /* convert_assigned_entry () */
+
+
+/*
+ * convert_value_entry()
+ *
+ * Fill `entry' from an entry that consists of a single value (comment and
+ * preamble entries).  Two members are stored:
+ *
+ *   lines  reference to a hash with "START" (line of the entry) and "STOP"
+ *          (line of its last simple value); only stored when the entry has
+ *          at least one value
+ *   value  the entry's text as a string (undefined if there is none), or a
+ *          Text::BibTeX::Value object when `preserve' is true
+ *
+ * As with field entries, "STOP" is inaccurate: it is the line of the last
+ * value, not of the closing delimiter.
+ */
+static void
+convert_value_entry (AST *top, HV *entry, boolean preserve)
+{
+    AST *  item;
+    AST *  prev_item = NULL;
+    SV *   sv_value;
+
+    /* Walk the list of values to find the last one (for its line number) */
+    item = NULL;
+    while ((item = bt_next_value (top, item, NULL, NULL)))
+        prev_item = item;
+
+    if (prev_item)
+    {
+        /*
+         * Build the line number hash only when it will be stored, and give
+         * our single reference to the new RV, so nothing is left behind.
+         */
+        HV * lines = newHV ();
+
+        hv_store (lines, "START", 5, newSViv (top->line), 0);
+        hv_store (lines, "STOP", 4, newSViv (prev_item->line), 0);
+        hv_store (entry, "lines", 5, newRV_noinc ((SV *) lines), 0);
+    }
+
+    /* And get the value of the entry as a single string (fully processed) */
+    if (preserve)
+    {
+        sv_value = convert_value (NULL, top, TRUE);
+    }
+    else
+    {
+        char * value = bt_get_text (top);
+
+        /* a fresh undefined scalar, not &PL_sv_undef, since it goes in a hash */
+        sv_value = value ? newSVpv (value, 0) : newSV (0);
+    }
+    hv_store (entry, "value", 5, sv_value, 0);
+
+} /* convert_value_entry () */
+
+
+/*
+ * ast_to_hash()
+ *
+ * Post-process the entry AST `top', copy its contents into the Perl hash
+ * behind `entry_ref', and free the AST.
+ *
+ *   entry_ref     reference to the hash to fill; must be a hash reference
+ *   top           AST of one entry; always consumed, except when one of
+ *                 the conversion helpers croaks part-way through
+ *   parse_status  stored in the hash as "status"
+ *   preserve      true: minimal post-processing, values become
+ *                 Text::BibTeX::Value objects; false: full processing
+ *                 (macro processing for macro definitions), values become
+ *                 plain strings
+ *
+ * Always stores "type", "metatype" and "status", stores "key" if the entry
+ * has one, and then the per-metatype members.  Stale "key", "fields",
+ * "lines", "values" and "value" members are removed first.
+ *
+ * Croaks if `entry_ref' is not a hash reference, if the entry has no type,
+ * or if its metatype is not one of the four handled here; the AST is freed
+ * before each of those croaks.
+ */
+void
+ast_to_hash (SV *    entry_ref,
+             AST *   top,
+             boolean parse_status,
+             boolean preserve)
+{
+    char *      type;
+    char *      key;
+    bt_metatype metatype;
+    btshort     options;        /* post-processing options */
+    HV *        entry;          /* the main hash -- build and return */
+
+    DBG_ACTION (1, printf ("ast_to_hash: entry\n"));
+
+    if (! (SvROK (entry_ref) && (SvTYPE (SvRV (entry_ref)) == SVt_PVHV)))
+    {
+        bt_free_ast (top);      /* croak never returns, so free it now */
+        croak ("entry_ref must be a hash ref");
+    }
+    entry = (HV *) SvRV (entry_ref);
+
+    /*
+     * Clear out all hash values that might not be replaced in this
+     * conversion (in case the user parses into an existing
+     * Text::BibTeX::Entry object).  (We don't blow the hash away with
+     * hv_clear() in case higher-up code has put interesting stuff into it.)
+     */
+    hv_delete (entry, "key", 3, G_DISCARD);
+    hv_delete (entry, "fields", 6, G_DISCARD);
+    hv_delete (entry, "lines", 5, G_DISCARD);
+    hv_delete (entry, "values", 6, G_DISCARD);
+    hv_delete (entry, "value", 5, G_DISCARD);
+
+    /*
+     * Perform entry post-processing.  How exactly we post-process depends
+     * on 1) the entry type, and 2) the 'preserve' flag.
+     */
+    metatype = bt_entry_metatype (top);
+    if (preserve)               /* if true, entry type doesn't matter */
+        options = BTO_MINIMAL;
+    else if (metatype == BTE_MACRODEF)
+        options = BTO_MACRO;
+    else
+        options = BTO_FULL;
+
+    /*
+     * Postprocess the entry, with the string-processing options we just
+     * determined plus "no store macros" turned on.  (That's because macros
+     * will already have been stored by the postprocessing done by
+     * bt_parse*; we don't want to do it again and generate spurious
+     * warnings!)
+     */
+    bt_postprocess_entry (top, options | BTO_NOSTORE);
+
+    /*
+     * Start filling in the hash; all entries have a type and metatype,
+     * and we'll do the key here (even though it's not in all entries)
+     * for good measure.
+     */
+    type = bt_entry_type (top);
+    key = bt_entry_key (top);
+    DBG_ACTION (2, printf (" inserting type (%s), metatype (%d)\n",
+                           type ? type : "*none*", (int) metatype));
+    DBG_ACTION (2, printf (" ... key (%s) status (%d)\n",
+                           key ? key : "*none*", parse_status));
+
+    if (!type)
+    {
+        bt_free_ast (top);
+        croak ("entry has no type");
+    }
+    hv_store (entry, "type", 4, newSVpv (type, 0), 0);
+    hv_store (entry, "metatype", 8, newSViv ((IV) metatype), 0);
+
+    if (key)
+        hv_store (entry, "key", 3, newSVpv (key, 0), 0);
+
+    hv_store (entry, "status", 6, newSViv ((IV) parse_status), 0);
+
+    switch (metatype)
+    {
+        case BTE_MACRODEF:
+        case BTE_REGULAR:
+            convert_assigned_entry (top, entry, preserve);
+            break;
+
+        case BTE_COMMENT:
+        case BTE_PREAMBLE:
+            convert_value_entry (top, entry, preserve);
+            break;
+
+        default:                /* this should never happen! */
+            bt_free_ast (top);
+            croak ("unknown entry metatype (%d)\n", (int) metatype);
+    }
+
+    /*
+     * If 'preserve' was true, then the user is going to need the
+     * Text::BibTeX::Value module!
+     *
+     * XXX requiring it from here doesn't work!  Why?!?!
+     */
+/*
+    if (preserve)
+    {
+        printf ("requiring Text::BibTeX::Value...\n");
+        perl_require_pv ("Text::BibTeX::Value");
+    }
+*/
+
+    /* And finally, free up the AST */
+
+    bt_free_ast (top);
+
+/* hv_store (entry, "ast", 3, newSViv ((IV) top), 0); */
+
+    DBG_ACTION (1, printf ("ast_to_hash: exit\n"));
+} /* ast_to_hash () */
+
+
+/* ----------------------------------------------------------------------
+ * Stuff for converting a list of C strings to Perl
+ * convert_stringlist() [private]
+ * store_stringlist()
+ */
+
+/*
+ * convert_stringlist()
+ *
+ * Copy the first `num_strings' C strings of `list' into a new Perl array
+ * and return a new reference to it.  The caller owns the reference, and
+ * the reference owns the array.
+ */
+static SV *
+convert_stringlist (char **list, int num_strings)
+{
+    int  i;
+    AV * perl_list;
+
+    perl_list = newAV ();
+    for (i = 0; i < num_strings; i++)
+        av_push (perl_list, newSVpv (list[i], 0));
+
+    /*
+     * newRV_noinc() gives our one reference to the array to the new RV.
+     * A plain newRV() would add a second reference that nothing ever
+     * drops, so the array would never be freed.
+     */
+    return newRV_noinc ((SV *) perl_list);
+
+} /* convert_stringlist() */
+
+
+/*
+ * store_stringlist()
+ *
+ * Store the `num_strings' strings of `list' in `hash' under `key', as a
+ * reference to a Perl array.  A NULL `list' deletes `key' from the hash
+ * instead.
+ */
+void
+store_stringlist (HV *hash, char *key, char **list, int num_strings)
+{
+    /* nothing to store: make sure no stale value survives under this key */
+    if (!list)
+    {
+        DBG_ACTION (2, printf ("store_stringlist(): hash=%p, key=%s: deleting\n",
+                               hash, key))
+        hv_delete (hash, key, strlen (key), G_DISCARD);
+        return;
+    }
+
+    DBG_ACTION (2,
+    {
+        int i;
+
+        printf ("store_stringlist(): hash=%p, key=%s, list=(",
+                hash, key);
+        for (i = 0; i < num_strings; i++)
+            printf ("%s%c", list[i], (i == num_strings-1) ? ')' : ',');
+        printf ("\n");
+    })
+
+    hv_store (hash, key, strlen (key),
+              convert_stringlist (list, num_strings), 0);
+
+} /* store_stringlist() */
diff --git a/xscode/btxs_support.h b/xscode/btxs_support.h
new file mode 100644
index 0000000..d083929
--- /dev/null
+++ b/xscode/btxs_support.h
@@ -0,0 +1,45 @@
+/* ------------------------------------------------------------------------
+@NAME : btxs_support.h
+@DESCRIPTION: Macros, prototypes, and whatnot needed by both btxs_support.c
+ and BibTeX.xs.
+@GLOBALS :
+@CREATED : 1997/11/16, Greg Ward
+@MODIFIED :
+@VERSION : $Id$
+@COPYRIGHT : Copyright (c) 1997-2000 by Gregory P. Ward. All rights reserved.
+-------------------------------------------------------------------------- */
+
+#ifndef BTXS_SUPPORT_H
+#define BTXS_SUPPORT_H
+
+#ifndef BT_DEBUG
+# define BT_DEBUG 0
+#endif
+
+/*
+ * DBG_ACTION(level, action): run `action' only if the compile-time
+ * BT_DEBUG setting is at least `level'.  When BT_DEBUG is 0 the macro
+ * expands to nothing at all.  The debug expansion is a bare `if'
+ * statement, so take care when using it directly before an `else'.
+ */
+#if BT_DEBUG
+# define DBG_ACTION(level,action) if (BT_DEBUG >= level) { action; }
+#else
+# define DBG_ACTION(level,action)
+#endif
+
+/* Portability hacks go here... */
+
+/*
+ * First, on SGIs, <string.h> doesn't prototype strdup() if _POSIX_SOURCE
+ * is defined -- and it usually is for Perl, because that's the default.
+ * So we workaround this by putting a prototype here. Yuck.
+ */
+#if defined(__sgi) && defined(_POSIX_SOURCE)
+extern char *strdup(const char *);
+#endif
+
+
+/*
+ * Prototypes
+ *
+ * This header includes nothing itself, so the Perl headers (for HV, SV
+ * and IV) and btparse.h must be included before it -- presumably btparse.h
+ * is what supplies AST and boolean.
+ */
+
+/* Store `list' (num_strings strings) in `hash' under `key' as an array
+ * reference, or delete `key' if `list' is NULL. */
+void store_stringlist (HV *hash, char *key, char **list, int num_strings);
+
+/* Copy the entry AST `top' into the hash behind `entry_ref', then free
+ * the AST. */
+void ast_to_hash (SV * entry_ref,
+ AST * top,
+ boolean parse_status,
+ boolean preserve);
+
+/* Look up the constant called `name'; on success store its value in *arg
+ * and return true, otherwise return false. */
+int constant (char * name, IV * arg);
+
+#endif /* BTXS_SUPPORT_H */