Merge tag 'upstream/7.20.20170828+ds'

Upstream version 7.20.20170828(+ds).
author: Aaron M. Ucko <ucko@debian.org> 2017-10-05 22:06:53 -0400
committer: Aaron M. Ucko <ucko@debian.org> 2017-10-05 22:06:53 -0400
commit: 2c3f0ede6219c7c8a98cbe6d0ee9f7e63573afe3 (patch)
tree: 1ccf78163f4d890aaa6d00b9ff927383c6e4d73b
parent: f009e40f3fc2759c477eadb276a61b39705f8e99 (diff)
parent: 57368b018e0265f06fca749d685ad69c986a0a93 (diff)
8 files changed, 307 insertions, 2550 deletions
diff --git a/debian/changelog b/debian/changelog
index 1318537..e081adf 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,11 +1,11 @@
-ncbi-entrez-direct (7.10.20170814+ds-1) UNRELEASED; urgency=medium
+ncbi-entrez-direct (7.20.20170828+ds-1) UNRELEASED; urgency=medium
 
   * New upstream release.  (NOT YET RELEASED.)
   * debian/examples: Add several new scripts for experimental local record
     storage, which may migrate to /usr/bin when they're more mature.
   * debian/man/{efilter,esearch,xtract}.1: Update for new release.
 
- -- Aaron M. Ucko <ucko@debian.org>  Thu, 05 Oct 2017 21:50:01 -0400
+ -- Aaron M. Ucko <ucko@debian.org>  Thu, 05 Oct 2017 22:06:52 -0400
 
 ncbi-entrez-direct (6.90.20170705+ds-2) unstable; urgency=medium
 
diff --git a/edirect.pl b/edirect.pl
index 8ce717c..4520e93 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -43,7 +43,7 @@ use File::Spec;
 
 # EDirect version number
 
-$version = "7.10";
+$version = "7.20";
 
 BEGIN
 {
diff --git a/setup.sh b/setup.sh
index e25ec73..5adcc67 100755
--- a/setup.sh
+++ b/setup.sh
@@ -43,10 +43,18 @@ if [ -f xtract."$osname" ]
 then
   chmod +x xtract."$osname"
 else
-  echo "Unable to download a prebuilt xtract executable; attempting to"
-  echo "build one from xtract.go.  A Perl fallback is also available, and"
-  echo "will be used if necessary, so please disregard any errors below."
-  go build -o xtract."$osname" xtract.go
+  if hash go 2>/dev/null
+  then
+    echo "Unable to download xtract executable; building from xtract.go."
+    go build -o xtract."$osname" xtract.go
+  fi
+  if [ ! -f xtract."$osname" ]
+  then
+    echo -e "Unable to download xtract executable. Please execute the following:\n\n"
+    echo -e "  ./ftp-cp ftp.ncbi.nlm.nih.gov /entrez/entrezdirect xtract.$osname.gz"
+    echo -e "  gunzip -f xtract.$osname.gz\n"
+    echo -e "  chmod +x xtract.$osname\n"
+  fi
 fi
 
 echo ""
diff --git a/stash-pubmed b/stash-pubmed
index c869975..c076ebe 100755
--- a/stash-pubmed
+++ b/stash-pubmed
@@ -1,8 +1,18 @@
 #!/bin/sh
 
 target="$1"
+flags="none"
+
+if [ "$#" -gt 1 ]
+then
+  flags="$1"
+  target="$2"
+fi
+
 for fl in *.xml
 do
+  base=${fl%.xml}
   echo "$fl"
-  xtract -input "$fl" -stash "$target" -index MedlineCitation/PMID -pattern PubmedArticle
+  xtract -flags "$flags" -stash "$target" -input "$base.xml" -unique "$base.uid" \
+    -index MedlineCitation/PMID -pattern PubmedArticle
 done
diff --git a/unpack-pubmed b/unpack-pubmed
index 46bfd39..a671512 100755
--- a/unpack-pubmed
+++ b/unpack-pubmed
@@ -1,22 +1,20 @@
 #!/bin/sh
 
+flags="none"
+
+if [ "$#" -gt 0 ]
+then
+  flags="$1"
+fi
+
 for fl in *.xml.gz
 do
   base=${fl%.xml.gz}
-  if [ -f "$base.snt" ]
-  then
-    continue
-  fi
   if [ -f "$base.xml" ]
   then
     continue
   fi
   echo "$fl"
-  gunzip -c "$fl" | xtract -strict -compress -format flush > "$base.tmp.xml"
-  xtract -input "$base.tmp.xml" -pattern PubmedArticle -element MedlineCitation/PMID > "$base.uid"
-  xtract -input "$base.tmp.xml" -unique "$base.uid" -index MedlineCitation/PMID \
+  gunzip -c "$base.xml.gz" | xtract -flags "$flags" -unique "$base.uid" -index MedlineCitation/PMID \
     -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" -pattern PubmedArticle > "$base.xml"
-  rm "$base.tmp.xml"
-  rm "$base.uid"
-  touch "$base.snt"
 done
diff --git a/xtract b/xtract
index ae44929..50e6eb9 100755
--- a/xtract
+++ b/xtract
@@ -1,10 +1,17 @@
 #!/bin/sh
 PATH=/bin:/usr/bin
 export PATH
-compiled=$0.`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
+osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
+compiled=$0."$osname"
 if [ -x "$compiled" ]
 then
-    exec "$compiled" "$@"
+  exec "$compiled" "$@"
 else
-    exec $0.pl -fallback "$@"
+  echo ""
+  echo "Unable to locate xtract executable. Please execute the following:"
+  echo ""
+  echo "  ftp-cp ftp.ncbi.nlm.nih.gov /entrez/entrezdirect xtract.$osname.gz"
+  echo "  gunzip -f xtract.$osname.gz"
+  echo "  chmod +x xtract.$osname"
+  echo ""
 fi
diff --git a/xtract.go b/xtract.go
index 46c760e..2d91bf7 100644
--- a/xtract.go
+++ b/xtract.go
@@ -93,8 +93,8 @@ Overview
 
 Processing Flags
 
-  -mixed           Allow PubMed mixed content
   -strict          Remove HTML highlight tags
+  -mixed           Allow PubMed mixed content
 
   -accent          Delete Unicode accents
   -ascii           Unicode to numeric character references
@@ -327,17 +327,15 @@ Examples
 `
 
 const xtractExtras = `
+Processing Flags
+
+  -flags      [strict|mixed|none]
+
 Local Record Indexing
 
   -stash      Base path for individual XML files
   -index      Name of element to use for identifier
-
-Processing Commands
-
-  -prepare    [release|report] Compare daily update to stash
-  -ignore     Ignore contents of object in -prepare comparisons
-  -missing    Print list of missing identifiers
-  -unique     File of UIDs for skipping all but last version
+  -unique     File of UIDs for removing intermediate records
 
 Sample File Download
 
@@ -361,11 +359,11 @@ Human Subset Extraction
 PubMed Download
 
   download-pubmed baseline updatefiles
-  unpack-pubmed
+  unpack-pubmed mixed
 
 PubMed Archive Creation
 
-  stash-pubmed /Volumes/myssd/Pubmed
+  stash-pubmed mixed /Volumes/myssd/Pubmed
 
 PubMed Archive Retrieval
 
@@ -374,6 +372,12 @@ PubMed Archive Retrieval
 `
 
 const xtractAdvanced = `
+Processing Commands
+
+  -prepare    [release|report] Compare daily update to stash
+  -ignore     Ignore contents of object in -prepare comparisons
+  -missing    Print list of missing identifiers
+
 Update Candidate Report
 
   gzcat medline*.xml.gz | xtract -strict -compress -format flush |
@@ -452,14 +456,14 @@ Performance Tuning Script
 
 Processor Titration Results
 
-  1    27748    207
-  2    51011    272
-  3    73487    700
-  4    93032    2559
-  5    92596    1549
-  6    89513    1570
-  7    84872    1145
-  8    83829    952
+  1    27622    31
+  2    51799    312
+  3    74853    593
+  4    95867    1337
+  5    97171    4019
+  6    93460    2458
+  7    87467    1030
+  8    82448    2651
 
 Execution Profiling
 
@@ -618,7 +622,7 @@ Gene Regions
   LOCUS       NC_000076               2142 bp    DNA     linear   CON 09-FEB-2015
   DEFINITION  Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
   ACCESSION   NC_000076 REGION: complement(75771233..75773374) GPC_000000783
-  VERSION     NC_000076.6  GI:372099100
+  VERSION     NC_000076.6
   ...
   FEATURES             Location/Qualifiers
        source          1..2142
@@ -2276,7 +2280,7 @@ type Tables struct {
 	DeGloss   bool
 	DoMixed   bool
 	DeAccent  bool
-	DoAscii   bool
+	DoASCII   bool
 }
 
 type Node struct {
@@ -2487,26 +2491,22 @@ func TrimPunctuation(str string) string {
 		}
 	}
 
-	if max > 0 {
-		if str[0] == '(' && !strings.Contains(str, ")") {
-			// trim isolated left parentheses
-			str = str[1:]
-			max--
-		}
+	if max > 0 && str[0] == '(' && !strings.Contains(str, ")") {
+		// trim isolated left parentheses
+		str = str[1:]
+		max--
 	}
 
-	if max > 1 {
-		if str[max-1] == ')' && !strings.Contains(str, "(") {
-			// trim isolated right parentheses
-			str = str[:max-1]
-			// max--
-		}
+	if max > 1 && str[max-1] == ')' && !strings.Contains(str, "(") {
+		// trim isolated right parentheses
+		str = str[:max-1]
+		// max--
 	}
 
 	return str
 }
 
-func HtmlAhead(text string, pos int) int {
+func HTMLAhead(text string, pos int) int {
 
 	max := len(text) - pos
 
@@ -2570,7 +2570,7 @@ func HtmlAhead(text string, pos int) int {
 	return 0
 }
 
-func HtmlBehind(bufr []byte, pos int) bool {
+func HTMLBehind(bufr []byte, pos int) bool {
 
 	if pos > 1 && bufr[pos-2] == '<' {
 		ch := bufr[pos-1]
@@ -2781,7 +2781,7 @@ var (
 	rpair *strings.Replacer
 )
 
-func DoHtmlReplace(str string) string {
+func DoHTMLReplace(str string) string {
 
 	// replacer/repairer not reentrant, protected by mutex
 	rlock.Lock()
@@ -2862,7 +2862,7 @@ func DoHtmlReplace(str string) string {
 	return str
 }
 
-func DoHtmlRepair(str string) string {
+func DoHTMLRepair(str string) string {
 
 	// replacer/repairer not reentrant, protected by mutex
 	rlock.Lock()
@@ -2923,7 +2923,7 @@ func DoHtmlRepair(str string) string {
 	return str
 }
 
-func DoTrimFlankingHtml(str string) string {
+func DoTrimFlankingHTML(str string) string {
 
 	badPrefix := [10]string{
 		"<i></i>",
@@ -3050,7 +3050,7 @@ func DoAccentTransform(str string) string {
 	return str
 }
 
-func UnicodeToAscii(str string) string {
+func UnicodeToASCII(str string) string {
 
 	var buffer bytes.Buffer
 
@@ -3874,16 +3874,16 @@ type XMLReader struct {
 	Closed     bool
 	Docompress bool
 	Docleanup  bool
-	Leavehtml  bool
+	LeaveHTML  bool
 }
 
-func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHtml bool) *XMLReader {
+func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHTML bool) *XMLReader {
 
 	if in == nil {
 		return nil
 	}
 
-	rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, Leavehtml: leaveHtml}
+	rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, LeaveHTML: leaveHTML}
 
 	// 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin
 	// increasing size of buffer when input is from a file does not improve program performance
@@ -3940,9 +3940,9 @@ func (rdr *XMLReader) NextBlock() string {
 		pos := -1
 		for pos = len(bufr) - 1; pos >= 0; pos-- {
 			if bufr[pos] == '>' {
-				if rdr.Leavehtml {
+				if rdr.LeaveHTML {
 					// optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags
-					if HtmlBehind(bufr, pos) {
+					if HTMLBehind(bufr, pos) {
 						continue
 					}
 				}
@@ -4521,7 +4521,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 		start := idx
 
-		if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) {
+		if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) {
 
 			// at start of element
 			idx++
@@ -4723,7 +4723,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 				}
 				if ch == '<' && !plainText {
 					// optionally allow HTML text formatting elements and super/subscripts
-					advance := HtmlAhead(text, idx)
+					advance := HTMLAhead(text, idx)
 					if advance > 0 {
 						idx += advance
 						ch = text[idx]
@@ -4934,7 +4934,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		startLine := 0
 
 		// warn if HTML tags are not well-formed
-		unbalancedHtml := func(text string) bool {
+		unbalancedHTML := func(text string) bool {
 
 			var arry []string
 
@@ -5037,7 +5037,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line)
 					}
 					if tbls.DeGloss || tbls.DoMixed {
-						if unbalancedHtml(name) {
+						if unbalancedHTML(name) {
 							fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line)
 						}
 					}
@@ -5393,7 +5393,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						str = RemoveUnicodeMarkup(str)
 					}
 					if HasAngleBracket(str) {
-						str = DoHtmlReplace(str)
+						str = DoHTMLReplace(str)
 					}
 				}
 				if tbls.DoMixed {
@@ -5401,18 +5401,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						str = SimulateUnicodeMarkup(str)
 					}
 					if HasAngleBracket(str) {
-						str = DoHtmlRepair(str)
+						str = DoHTMLRepair(str)
 					}
-					str = DoTrimFlankingHtml(str)
+					str = DoTrimFlankingHTML(str)
 				}
 				if tbls.DeAccent {
 					if IsNotASCII(str) {
 						str = DoAccentTransform(str)
 					}
 				}
-				if tbls.DoAscii {
+				if tbls.DoASCII {
 					if IsNotASCII(str) {
-						str = UnicodeToAscii(str)
+						str = UnicodeToASCII(str)
 					}
 				}
 
@@ -5530,9 +5530,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					attr = DoAccentTransform(attr)
 				}
 			}
-			if tbls.DoAscii {
+			if tbls.DoASCII {
 				if IsNotASCII(attr) {
-					attr = UnicodeToAscii(attr)
+					attr = UnicodeToASCII(attr)
 				}
 			}
 
@@ -5773,7 +5773,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 							name = RemoveUnicodeMarkup(name)
 						}
 						if HasAngleBracket(name) {
-							name = DoHtmlReplace(name)
+							name = DoHTMLReplace(name)
 						}
 					}
 					if tbls.DoMixed {
@@ -5781,18 +5781,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 							name = SimulateUnicodeMarkup(name)
 						}
 						if HasAngleBracket(name) {
-							name = DoHtmlRepair(name)
+							name = DoHTMLRepair(name)
 						}
-						name = DoTrimFlankingHtml(name)
+						name = DoTrimFlankingHTML(name)
 					}
 					if tbls.DeAccent {
 						if IsNotASCII(name) {
 							name = DoAccentTransform(name)
 						}
 					}
-					if tbls.DoAscii {
+					if tbls.DoASCII {
 						if IsNotASCII(name) {
-							name = UnicodeToAscii(name)
+							name = UnicodeToASCII(name)
 						}
 					}
 					if HasFlankingSpace(name) {
@@ -7168,7 +7168,7 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 					str = RemoveUnicodeMarkup(str)
 				}
 				if HasAngleBracket(str) {
-					str = DoHtmlReplace(str)
+					str = DoHTMLReplace(str)
 				}
 
 				// break terms at spaces, allowing hyphenated words
@@ -7993,7 +7993,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 
 		start := idx
 
-		if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) {
+		if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) {
 
 			// at start of element
 			idx++
@@ -8142,7 +8142,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 				}
 				if ch == '<' && !plainText {
 					// optionally allow HTML text formatting elements and super/subscripts
-					advance := HtmlAhead(text, idx)
+					advance := HTMLAhead(text, idx)
 					if advance > 0 {
 						idx += advance
 						ch = text[idx]
@@ -8218,7 +8218,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 						name = RemoveUnicodeMarkup(name)
 					}
 					if HasAngleBracket(name) {
-						name = DoHtmlReplace(name)
+						name = DoHTMLReplace(name)
 					}
 				}
 				if tbls.DoMixed {
@@ -8226,18 +8226,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 						name = SimulateUnicodeMarkup(name)
 					}
 					if HasAngleBracket(name) {
-						name = DoHtmlReplace(name)
+						name = DoHTMLReplace(name)
 					}
-					name = DoTrimFlankingHtml(name)
+					name = DoTrimFlankingHTML(name)
 				}
 				if tbls.DeAccent {
 					if IsNotASCII(name) {
 						name = DoAccentTransform(name)
 					}
 				}
-				if tbls.DoAscii {
+				if tbls.DoASCII {
 					if IsNotASCII(name) {
-						name = UnicodeToAscii(name)
+						name = UnicodeToASCII(name)
 					}
 				}
 				node.Contents = name
@@ -8546,7 +8546,7 @@ func (h *ExtractHeap) Pop() interface{} {
 // process with single goroutine calls defer close(out) so consumer(s) can range over channel
 // process with multiple instances calls defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out)
 
-func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extract {
+func CreateProducer(pat, star string, rdr *XMLReader, uidFile string, tbls *Tables) <-chan Extract {
 
 	if rdr == nil || tbls == nil {
 		return nil
@@ -8558,15 +8558,72 @@ func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extra
 		os.Exit(1)
 	}
 
+	// create map that counts instances of each UID
+	order := make(map[string]int)
+
+	checkIDs := false
+
+	if uidFile != "" {
+		checkIDs = true
+
+		// read file of identifiers to use for filtering
+		fl, err := os.Open(uidFile)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", uidFile)
+			os.Exit(1)
+		}
+
+		scanr := bufio.NewScanner(fl)
+
+		// read lines of identifiers
+		for scanr.Scan() {
+
+			id := scanr.Text()
+
+			// map records count for given identifier
+			val := order[id]
+			val++
+			order[id] = val
+		}
+
+		fl.Close()
+	}
+
 	// xmlProducer sends partitioned XML strings through channel
 	xmlProducer := func(pat, star string, rdr *XMLReader, out chan<- Extract) {
 
 		// close channel when all records have been processed
 		defer close(out)
 
+		parent := ""
+		if star == "*" {
+			parent = pat
+		}
+
 		// partition all input by pattern and send XML substring to available consumer through channel
 		PartitionPattern(pat, star, rdr,
 			func(rec int, ofs int64, str string) {
+
+				if checkIDs {
+					id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
+					if id == "" {
+						return
+					}
+
+					val, ok := order[id]
+					if !ok {
+						// not in identifier list, skip
+						return
+					}
+					// decrement count in map
+					val--
+					order[id] = val
+					if val > 0 {
+						// only write last record with a given identifier
+						return
+					}
+				}
+
 				out <- Extract{rec, "", str}
 			})
 	}
@@ -9058,7 +9115,10 @@ func main() {
 	deGloss := false
 	doMixed := false
 	deAccent := false
-	doAscii := false
+	doASCII := false
+
+	// -flags sets -strict or -mixed cleanup flags from argument
+	flgs := ""
 
 	// read data from file instead of stdin
 	fileName := ""
@@ -9076,7 +9136,7 @@ func main() {
 	// element to use as local data index
 	indx := ""
 
-	// file of index values for removing duplicates
+	// file of index values for removing duplicates (read or write, depending upon context)
 	unqe := ""
 
 	// phrase to find anywhere in XML
@@ -9165,10 +9225,10 @@ func main() {
 			fileName = args[1]
 			// skip past first of two arguments
 			args = args[1:]
-		// file with selected indexes for removing duplicates
+		// uid file for removing duplicates
 		case "-unique":
 			if len(args) < 2 {
-				fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file is missing\n")
+				fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file name is missing\n")
 				os.Exit(1)
 			}
 			unqe = args[1]
@@ -9217,7 +9277,15 @@ func main() {
 		case "-accent", "-plain":
 			deAccent = true
 		case "-ascii":
-			doAscii = true
+			doASCII = true
+		case "-flags":
+			if len(args) < 2 {
+				fmt.Fprintf(os.Stderr, "\nERROR: Flags argument is missing\n")
+				os.Exit(1)
+			}
+			flgs = args[1]
+			// skip past first of two arguments
+			args = args[1:]
 		// debugging flags
 		case "-prepare":
 			cmpr = true
@@ -9270,6 +9338,20 @@ func main() {
 		}
 	}
 
+	// -flags allows script to set -strict or -mixed from argument
+	switch flgs {
+	case "strict":
+		deGloss = true
+	case "mixed":
+		doMixed = true
+	case "none", "default":
+	default:
+		if flgs != "" {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flags value '%s'\n", flgs)
+			os.Exit(1)
+		}
+	}
+
 	// reality checks on number of processors to use
 	// performance degrades if capacity is above maximum number of partitions per second (context switching?)
 	if numProcs == 0 {
@@ -9442,7 +9524,7 @@ func main() {
 	tbls.DeGloss = deGloss
 	tbls.DoMixed = doMixed
 	tbls.DeAccent = deAccent
-	tbls.DoAscii = doAscii
+	tbls.DoASCII = doASCII
 
 	// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
 
@@ -9879,7 +9961,7 @@ func main() {
 
 	// COMPARE XML UPDATES TO LOCAL DIRECTORY, RETAIN NEW OR SUBSTANTIVELY CHANGED RECORDS
 
-	// -prepare plus -stash plus -index plus -pattern compares XML files against stash (undocumented)
+	// -prepare plus -stash plus -index plus -pattern compares XML files against stash
 	if stsh != "" && indx != "" && cmpr {
 
 		doReport := false
@@ -10022,10 +10104,10 @@ func main() {
 
 	// SAVE XML COMPONENT RECORDS TO LOCAL DIRECTORY INDEXED BY TRIE ON IDENTIFIER
 
-	// -stash plus -index plus -pattern saves XML files in trie-based directory structure
+	// -stash plus -index [plus -unique] plus -pattern saves XML files in trie-based directory structure
 	if stsh != "" && indx != "" {
 
-		xmlq := CreateProducer(topPattern, star, rdr, tbls)
+		xmlq := CreateProducer(topPattern, star, rdr, unqe, tbls)
 		idnq := CreateExaminers(tbls, parent, xmlq)
 		unsq := CreateUnshuffler(tbls, idnq)
 		unqq := CreateUniquer(tbls, unsq)
@@ -10050,40 +10132,83 @@ func main() {
 		return
 	}
 
-	// READ FILE OF IDENTIFIERS AND EXTRACT SELECTED RECORDS FROM XML INPUT FILE
+	// GENERATE UID LIST AND REMOVE LEADING SPACES FROM XML
 
-	// -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern with no other extraction arguments
-	// takes an XML input file and a file of its UIDs and keeps only the last version of each record
-	if indx != "" && unqe != "" && len(args) == 2 {
+	// -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern takes an XML input file and
+	// writes a trimmed version with leading spaces removed, also creating a file of its UIDs
+	if stsh == "" && indx != "" && unqe != "" {
 
-		// read file of identifiers to use for filtering
-		fl, err := os.Open(unqe)
+		fl, err := os.Create(unqe)
 		if err != nil {
-			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe)
+			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open uid output file '%s'\n", unqe)
 			os.Exit(1)
 		}
 
-		// create map that counts instances of each UID
-		order := make(map[string]int)
+		if head != "" {
+			os.Stdout.WriteString(head)
+			os.Stdout.WriteString("\n")
+		}
 
-		scanr := bufio.NewScanner(fl)
+		// write output, efficiently skipping leading spaces on each line
+		writeFlush := func(text string) {
 
-		// read lines of identifiers
-		for scanr.Scan() {
+			if text == "" {
+				return
+			}
 
-			id := scanr.Text()
+			var buffer bytes.Buffer
 
-			// map records count for given identifier
-			val := order[id]
-			val++
-			order[id] = val
-		}
+			max := len(text)
+			idx := 0
+			inBlank := &tbls.InBlank
 
-		fl.Close()
+			for idx < max {
 
-		if head != "" {
-			os.Stdout.WriteString(head)
-			os.Stdout.WriteString("\n")
+				// skip past leading blanks and empty lines
+				for idx < max {
+					ch := text[idx]
+					if !inBlank[ch] {
+						break
+					}
+					idx++
+				}
+
+				start := idx
+
+				// skip to next newline
+				for idx < max {
+					if text[idx] == '\n' {
+						break
+					}
+					idx++
+				}
+
+				str := text[start:idx]
+
+				if str == "" {
+					continue
+				}
+
+				// skip processing instruction
+				if strings.HasPrefix(str, "<?") && strings.HasSuffix(str, "?>") {
+					continue
+				}
+
+				// trim spaces next to angle bracket
+				for strings.Contains(str, "> ") {
+					str = strings.Replace(str, "> ", ">", 1)
+				}
+				for strings.Contains(str, " <") {
+					str = strings.Replace(str, " <", "<", 1)
+				}
+
+				buffer.WriteString(str[:])
+				buffer.WriteString("\n")
+			}
+
+			rsult := buffer.String()
+
+			os.Stdout.WriteString(rsult)
 		}
 
 		PartitionPattern(topPattern, star, rdr,
@@ -10095,27 +10220,43 @@ func main() {
 					return
 				}
 
-				val, ok := order[id]
-				if !ok {
-					// not in identifier list, skip
-					return
-				}
-				// decrement count in map
-				val--
-				order[id] = val
-				if val > 0 {
-					// only write last record with a given identifier
-					return
-				}
+				fl.WriteString(id)
+				fl.WriteString("\n")
 
 				if hd != "" {
 					os.Stdout.WriteString(hd)
 					os.Stdout.WriteString("\n")
 				}
 
-				// write selected record
-				os.Stdout.WriteString(str[:])
-				os.Stdout.WriteString("\n")
+				if tbls.DeGloss {
+					if HasMarkup(str) {
+						str = RemoveUnicodeMarkup(str)
+					}
+					if HasAngleBracket(str) {
+						str = DoHTMLReplace(str)
+					}
+				}
+				if tbls.DoMixed {
+					if HasMarkup(str) {
+						str = SimulateUnicodeMarkup(str)
+					}
+					if HasAngleBracket(str) {
+						str = DoHTMLRepair(str)
+					}
+					str = DoTrimFlankingHTML(str)
+				}
+				if tbls.DeAccent {
+					if IsNotASCII(str) {
+						str = DoAccentTransform(str)
+					}
+				}
+				if tbls.DoASCII {
+					if IsNotASCII(str) {
+						str = UnicodeToASCII(str)
+					}
+				}
+
+				writeFlush(str[:])
 
 				if tl != "" {
 					os.Stdout.WriteString(tl)
@@ -10128,6 +10269,12 @@ func main() {
 			os.Stdout.WriteString("\n")
 		}
 
+		err = fl.Sync()
+		if err != nil {
+			fmt.Println(err.Error())
+		}
+		fl.Close()
+
 		if timr {
 			printDuration("records")
 		}
@@ -10333,7 +10480,7 @@ func main() {
 					os.Exit(1)
 				}
 
-				xmlq := CreateProducer(topPattern, star, rdr, tbls)
+				xmlq := CreateProducer(topPattern, star, rdr, "", tbls)
 				tblq := CreateConsumers(cmds, tbls, parent, xmlq)
 
 				if xmlq == nil || tblq == nil {
@@ -10445,7 +10592,7 @@ func main() {
 	// LAUNCH PRODUCER, CONSUMER, AND UNSHUFFLER SERVERS
 
 	// launch producer goroutine to partition XML by pattern
-	xmlq := CreateProducer(topPattern, star, rdr, tbls)
+	xmlq := CreateProducer(topPattern, star, rdr, "", tbls)
 
 	// launch consumer goroutines to parse and explore partitioned XML objects
 	tblq := CreateConsumers(cmds, tbls, parent, xmlq)
diff --git a/xtract.pl b/xtract.pl
deleted file mode 100755
index 25d8374..0000000
--- a/xtract.pl
+++ /dev/null
@@ -1,2413 +0,0 @@
-#!/usr/bin/perl
-
-# ===========================================================================
-#
-#                            PUBLIC DOMAIN NOTICE
-#            National Center for Biotechnology Information (NCBI)
-#
-#  This software/database is a "United States Government Work" under the
-#  terms of the United States Copyright Act.  It was written as part of
-#  the author's official duties as a United States Government employee and
-#  thus cannot be copyrighted.  This software/database is freely available
-#  to the public for use. The National Library of Medicine and the U.S.
-#  Government do not place any restriction on its use or reproduction.
-#  We would, however, appreciate having the NCBI and the author cited in
-#  any work or product based on this material.
-#
-#  Although all reasonable efforts have been taken to ensure the accuracy
-#  and reliability of the software and data, the NLM and the U.S.
-#  Government do not and cannot warrant the performance or results that
-#  may be obtained by using this software or data. The NLM and the U.S.
-#  Government disclaim all warranties, express or implied, including
-#  warranties of performance, merchantability or fitness for any particular
-#  purpose.
-#
-# ===========================================================================
-#
-# File Name:  xtract
-#
-# Author:  Jonathan Kans
-#
-# Version Creation Date:   8/20/12
-#
-# ==========================================================================
-
-# Entrez Direct - EDirect
-
-# use strict;
-use warnings;
-
-# definitions
-
-use constant false => 0;
-use constant true  => 1;
-
-# xtract version number
-
-$version = "2.9999";
-
-# initialize memory hash
-
-my %memory = ();
-
-# initialize synopsis variables
-
-my %synopsis_acc = ();
-my $synopsis_max = 0;
-
-# initial level is global
-
-my $initial_level = 7;
-
-# utility subroutines
-
-sub convert_http {
-  my $str = shift (@_);
-  $str =~ s/\&amp\;/\&/g;
-  $str =~ s/\&apos\;/\'/g;
-  $str =~ s/\&gt\;/\>/g;
-  $str =~ s/\&lt\;/\</g;
-  $str =~ s/\&quot\;/\"/g;
-  return $str;
-}
-
-sub convert_slash {
-  my $str = shift (@_);
-  $str =~ s/\\t/\t/g;
-  $str =~ s/\\n/\n/g;
-  $str =~ s/\\r/\n/g;
-  return $str;
-}
-
-# xtract parses and extracts data from XML, driven by command-line arguments
-
-sub process_element {
-
-  my $line = shift (@_);
-  my $val = shift (@_);
-  my $prev = shift (@_);
-  my $pfx = shift (@_);
-  my $sfx = shift (@_);
-  my $sep = shift (@_);
-  my $cmmd = shift (@_);
-  my $variable = shift (@_);
-  my $indx = shift (@_);
-  my $sclr = shift (@_);
-
-  if ( $val eq "" ) {
-    return false;
-  }
-
-  # -element "*" writes current XML object
-
-  if ( $val eq "*" ) {
-    print "$prev";
-    print "$pfx";
-    print "$line";
-    print "$sfx";
-    return true;
-  }
-
-  my @accum = ();
-
-  # commas are used to select unrelated items to show between pfx and sfx
-
-  my @itms = split (',', $val);
-  foreach $itm (@itms) {
-
-    my @atts = ();
-    my @vals = ();
-    my @working = ();
-
-    my $do_count = false;
-    my $do_length = false;
-
-
-    # look for # or % modifiers before XML element specification
-
-    if ( $itm =~ /^#(.+)/ ) {
-
-      # items in quotations starting with # sign display count of matches
-
-      $do_count = true;
-      $itm = $1;
-
-    } elsif ( $itm =~ /^%(.+)/ ) {
-
-      # items in quotations starting with % sign display sum of matched string lengths
-
-      $do_length = true;
-      $itm = $1;
-    }
-
-
-    # INSDSeq special cases - &Feature and &Qualifier
-
-    if ( $itm =~ /^\&Feature$/ ) {
-
-      $itm = "INSDFeature_key";
-
-    } elsif ( $itm =~ /^\&Qualifier$/ ) {
-
-      $itm = "INSDQualifier_value";
-    }
-
-
-    # -element pattern recognition
-
-    if ( $itm =~ /^&([A-Z0-9]+)$/ ) {
-
-      # match stored &VARIABLE
-
-      my $idx = $1;
-      if ( exists ($memory{$idx}) and $memory{"$idx"} ne "" ) {
-        my $val = $memory{$idx};
-        $dec = convert_http($val);
-        push (@working, $dec);
-      }
-
-    } elsif ( $itm =~ /^\?([A-Z]+)\((.*)\)/ ) {
-
-      # match exploration flag (alias to print is in parentheses) (undocumented)
-
-      my $itm = $1;
-      my $rep = $2;
-
-      $dec = convert_http($rep);
-
-      # first and last elements are always present, and are the same for a one-element list
-
-      if ( $itm eq "FIRST" and $indx == 1 ) {
-        push (@working, $dec);
-      } elsif ( $itm eq "LAST" and $indx == $sclr ) {
-        push (@working, $dec);
-
-      # single, head, inner, tail are mutually exclusive situations
-
-      } elsif ( $itm eq "SINGLE" and $sclr == 1 ) {
-        push (@working, $dec);
-
-      } elsif ( $itm eq "HEAD" and $indx == 1 and $sclr > 1 ) {
-        push (@working, $dec);
-      } elsif ( $itm eq "INNER" and $indx > 1 and $indx < $sclr ) {
-        push (@working, $dec);
-      } elsif ( $itm eq "TAIL" and $indx == $sclr and $sclr > 1 ) {
-        push (@working, $dec);
-
-      # even and odd are based only on index value, not list length
-
-      } elsif ( $itm eq "EVEN" and ( $indx % 2) == 0 ) {
-        push (@working, $dec);
-      } elsif ( $itm eq "ODD" and ( $indx % 2) == 1 ) {
-        push (@working, $dec);
-      }
-
-    } elsif ( $itm =~ /^@(.+)/ ) {
-
-      # match unqualified @attribute
-
-      my $att = $1;
-      @atts = ($line =~ /<\S+ ([^>]+)>/g);
-      foreach $val (@atts) {
-        if ( $val =~ /$att=\"([^\"]+)\"/ ) {
-          $dec = convert_http($1);
-          push (@working, $dec);
-        }
-      }
-
-    } elsif ( $itm =~ /(.+)@(.+)/ ) {
-
-      # match qualified tag@attribute
-
-      my $tag = $1;
-      my $att = $2;
-      @atts = ($line =~ /<$tag ([^>]+)>/g);
-      foreach $val (@atts) {
-        if ( $val =~ /$att=\"([^\"]+)\"/ ) {
-          $dec = convert_http($1);
-          push (@working, $dec);
-        }
-      }
-
-    } elsif ( $itm =~ /(.+)\/(.+)/ ) {
-
-      # match parent/child by tracking depth level of tokens
-
-      my $tag = $1;
-      my $chd = $2;
-      @vals = ($line =~ /<$tag(?:\s+.+?)?>(.+?)<\/$tag>/g);
-      foreach $val (@vals) {
-        my $lvl = 0;
-        my @tokens = split (/(?<=>)(?=<)/, $val);
-        foreach $tkn (@tokens) {
-          if ( $lvl < 0 ) {
-            # ignore remainder if level drops below zero
-          } elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
-            # content-containing tag does not change level
-            if ( $tkn =~ /<$chd(?:\s+.+?)?>(.+?)<\/$chd>/ ) {
-              # child tag matches, only accept if immediately below parent
-              if ( $lvl == 0 ) {
-                $dec = convert_http($1);
-                push (@working, $dec);
-              }
-            }
-          } elsif ( $tkn =~ /\/>$/ ) {
-            # self-closing token does not change level
-          } elsif ( $tkn =~ /^<[^\/]/ ) {
-            # open tag increments level
-            $lvl++;
-          } elsif ( $tkn =~ /^<\// ) {
-            # close tag decrements level
-            $lvl--;
-          }
-        }
-      }
-
-    } elsif ( $itm =~ /(.+)\((.*)\)/ ) {
-
-      # match self-closing tag (alias to print is in parentheses)
-
-      my $tag = $1;
-      my $rep = $2;
-      if ( $rep eq "" ) {
-        $rep = $tag;
-      }
-
-      # [ is it possible to look for both forms in one pass ? ]
-
-      @vals = ($line =~ /(<$tag.*?><\/$tag>)/g);
-
-      if ( scalar @vals != 0 ) {
-
-        # found alternative form ( <tag></tag> ) of self-closing tag
-
-        foreach $val (@vals) {
-          $dec = convert_http($rep);
-          push (@working, $dec);
-        }
-
-      } else {
-
-        # look for compact form ( <tag/> ) if no match to alternative form
-
-        if ( scalar @vals == 0 ) {
-          @vals = ($line =~ /(<$tag.*?\/>)/g);
-          foreach $val (@vals) {
-            $dec = convert_http($rep);
-            push (@working, $dec);
-          }
-        }
-      }
-
-    } else {
-
-      # match tag with contents
-
-      my $tag = $itm;
-      @vals = ($line =~ /<$tag(?:\s+.+?)?>(.+?)<\/$tag>/g);
-      foreach $val (@vals) {
-        $dec = convert_http($val);
-        push (@working, $dec);
-      }
-    }
-
-
-    my $num = scalar @working;
-
-    # if -first, -last, etc., remove all but matching subset
-
-    if ( $num < 1 ) {
-
-      # no matches collected, skip
-
-    } elsif ( $cmmd eq "-first" ) {
-
-      @working = splice (@working, 0, 1);
-      $num = scalar @working;
-
-    } elsif ( $cmmd eq "-last" ) {
-
-      @working = splice (@working, $num - 1, 1);
-      $num = scalar @working;
-
-    } elsif ( $cmmd eq "-single" ) {
-
-      if ( $num > 1 ) {
-        $num = 0;
-      }
-
-    } elsif ( $cmmd eq "-head" ) {
-
-      if ( $num < 2 ) {
-        $num = 0;
-      } else {
-        @working = splice (@working, 0, 1);
-        $num = scalar @working;
-      }
-
-    } elsif ( $cmmd eq "-inner" ) {
-
-      if ( $num < 3 ) {
-        $num = 0;
-      } else {
-        @working = splice (@working, 1, $num - 2);
-        $num = scalar @working;
-      }
-
-    } elsif ( $cmmd eq "-tail" ) {
-
-      if ( $num < 2 ) {
-        $num = 0;
-      } else {
-        @working = splice (@working, $num - 1, 1);
-        $num = scalar @working;
-      }
-    }
-
-
-    # store current results into accumulator
-
-    if ( $num == 0 ) {
-
-      # no printable results
-
-    } elsif ( $do_count ) {
-
-      push (@accum, $num);
-
-    } elsif ( $do_length ) {
-
-      my $len = 0;
-      foreach $dec (@working) {
-        $len += length ( $dec );
-      }
-
-      push (@accum, $len);
-
-    } elsif ( $cmmd eq "-even" ) {
-
-      my $take = false;
-      foreach $dec (@working) {
-        if ( $take ) {
-          push (@accum, $dec);
-        }
-        $take = (! $take);
-      }
-
-    } elsif ( $cmmd eq "-odd" ) {
-
-      my $take = true;
-      foreach $dec (@working) {
-        if ( $take ) {
-          push (@accum, $dec);
-        }
-        $take = (! $take);
-      }
-
-    } else {
-
-      foreach $dec (@working) {
-        push (@accum, $dec);
-      }
-    }
-  }
-
-
-  # process results
-
-  if ( scalar @accum == 0 ) {
-
-    return false;
-
-  } elsif ( $variable ne "") {
-
-    # store accumulated components into variable
-
-    my $str = $pfx;
-    my $between = "";
-    foreach $dec (@accum) {
-      $str .= "$between";
-      $between = $sep;
-      $str .= "$dec";
-    }
-    $str .= "$sfx";
-
-    $memory{"$variable"} = "$str";
-
-  } else {
-
-    # format accumulated elements
-
-    my $before = $prev . $pfx;
-    my $between = "";
-    my $after = "";
-    foreach $dec (@accum) {
-      print "$before";
-      $before = "";
-      print "$between";
-      $between = $sep;
-      $after = $sfx;
-      print "$dec";
-    }
-    print "$after";
-    return true;
-  }
-
-  return false;
-}
-
-sub process_flags {
-
-  my $line = shift (@_);
-  my @args = @{shift (@_)};
-  my $tab = shift (@_);
-  my $ret = shift (@_);
-  my $indx = shift (@_);
-  my $sclr = shift (@_);
-
-  my $pfx = "";
-  my $sfx = "";
-  my $sep = "\t";
-
-  my $col = "\t";
-  my $lin = "\n";
-
-  my $okay = false;
-  my $cmmd = "";
-
-  my $max = scalar @args;
-
-  for ( my $i = 0; $i < $max; $i++ ) {
-
-    # read command-line arguments
-
-    my $val = $args[$i];
-
-    # if new argument starts with hyphen
-
-    if ( $val =~ /^\-/ ) {
-
-      # not an -element value, so set $okay flag to false
-
-      $okay = false;
-
-      # all flags should be followed by at least one value
-
-      if ( $i + 1 >= $max and $val ne "-outline" and $val ne "-synopsis" ) {
-
-        # warn that the last flag has no value
-
-        print STDERR "No argument after '$val'\n";
-
-        # break out of for loop to return from function
-
-        last;
-
-        # otherwise safe to increment with $i++ and then dereference $args [$i]
-      }
-    }
-
-    # print outline of XML structure
-
-    if ( $val eq "-outline" ) {
-
-      my $lvl = 0;
-      my $dpth = 0;
-
-      my @tokens = split (/(?<=>)(?=<)/, $line);
-      foreach $tkn (@tokens) {
-        if ( $lvl < 0 ) {
-          # ignore remainder if level drops below zero
-        } elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
-          # content-containing tag
-          if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
-            $tkn = $1;
-            $lvl++;
-            $dpth++;
-            for ( $i = 0; $i < $dpth; $i++ ) {
-              print "  ";
-            }
-            print "$tkn\n";
-            $lvl--;
-            $dpth--;
-          }
-        } elsif ( $tkn =~ /^<.+?\/>$/ ) {
-          # self-closing token
-          if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
-            $tkn = $1;
-            $lvl++;
-            $dpth++;
-            for ( $i = 0; $i < $dpth; $i++ ) {
-              print "  ";
-            }
-            print "$tkn\n";
-            $lvl--;
-            $dpth--;
-          }
-        } elsif ( $tkn =~ /^<[^\/]/ ) {
-          # open tag increments level
-          $lvl++;
-          $dpth++;
-          if ( $tkn =~ /^<(\S+).*?>$/ ) {
-            $tkn = $1;
-            if ( $tkn ne "?xml" and
-                 $tkn ne "!DOCTYPE" and
-                 $tkn ne "eSummaryResult" and
-                 $tkn ne "eLinkResult" and
-                 $tkn ne "eInfoResult" and
-                 $tkn ne "PubmedArticleSet" and
-                 $tkn ne "DocumentSummarySet" and
-                 $tkn ne "INSDSet" and
-                 $tkn ne "Entrezgene-Set" and
-                 $tkn ne "TaxaSet" ) {
-              for ( $i = 0; $i < $dpth; $i++ ) {
-                print "  ";
-              }
-              print "$tkn\n";
-            } else {
-              $dpth--;
-            }
-          }
-        } elsif ( $tkn =~ /^<\// ) {
-          # close tag decrements level
-          $lvl--;
-          $dpth--;
-        }
-      }
-
-    # collect summary of XML paths, print at end
-
-    } elsif ( $val eq "-synopsis" ) {
-
-      my @arr = ();
-      my $lvl = 0;
-
-      my @tokens = split (/(?<=>)(?=<)/, $line);
-      foreach $tkn (@tokens) {
-        my $record_level = 0;
-        if ( $lvl < 0 ) {
-          # ignore remainder if level drops below zero
-        } elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
-          # content-containing tag
-          if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
-            $tkn = $1;
-            $lvl++;
-            $arr[$lvl] = $tkn;
-            $record_level = $lvl;
-            $lvl--;
-          }
-        } elsif ( $tkn =~ /^<.+?\/>$/ ) {
-          # self-closing token
-          if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
-            $tkn = $1;
-            $lvl++;
-            $arr[$lvl] = $tkn;
-            $record_level = $lvl;
-            $lvl--;
-          }
-        } elsif ( $tkn =~ /^<[^\/]/ ) {
-          # open tag increments level
-          $lvl++;
-          if ( $tkn =~ /^<(\S+).*?>$/ ) {
-            $tkn = $1;
-            $arr[$lvl] = $tkn;
-            $record_level = $lvl;
-          }
-        } elsif ( $tkn =~ /^<\// ) {
-          # close tag decrements level
-          $lvl--;
-        }
-        if ( $record_level > 0 ) {
-          my $str = "";
-          my $pfx = "";
-          for ( $i = 1; $i <= $record_level; $i++ ) {
-            $itm = $arr[$i];
-            if ( $itm ne "?xml" and
-                 $itm ne "!DOCTYPE" and
-                 $itm ne "eSummaryResult" and
-                 $itm ne "eLinkResult" and
-                 $itm ne "eInfoResult" and
-                 $itm ne "PubmedArticleSet" and
-                 $itm ne "DocumentSummarySet" and
-                 $itm ne "INSDSet" and
-                 $itm ne "Entrezgene-Set" and
-                 $itm ne "TaxaSet" ) {
-              $str .= "$pfx";
-              $pfx = "/";
-              $str .= "$arr[$i]";
-            }
-          }
-          if ( $str ne "" ) {
-            my $val = 0;
-            if ( exists ($synopsis_acc{"$str"}) ) {
-              $val = $synopsis_acc{"$str"};
-            }
-            $val++;
-            $synopsis_acc{"$str"} = $val;
-            if ( $val > $synopsis_max ) {
-              $synopsis_max = $val;
-            }
-            print "$str\n"
-          }
-        }
-      }
-
-    # look for arguments that adjust output formatting
-
-    } elsif ( $val eq "-pfx" ) {
-      $i++;
-      $pfx = convert_slash ( $args[$i] );
-    } elsif ( $val eq "-sfx" ) {
-      $i++;
-      $sfx = convert_slash ( $args[$i] );
-    } elsif ( $val eq "-sep" ) {
-      $i++;
-      $sep = convert_slash ( $args[$i] );
-    } elsif ( $val eq "-tab" ) {
-      $i++;
-      $col = convert_slash ( $args[$i] );
-    } elsif ( $val eq "-ret" ) {
-      $i++;
-      $lin = convert_slash ( $args[$i] );
-    } elsif ( $val eq "-lbl" ) {
-      $i++;
-      print "$tab";
-      $lbl = convert_slash ( $args[$i] );
-      print "$lbl";
-      $tab = $col;
-      $ret = $lin;
-
-    # look for -element or limiting variants, non-hyphenated arguments to follow
-
-    } elsif ( $val eq "-element" or
-              $val eq "-first" or
-              $val eq "-last" or
-              $val eq "-even" or
-              $val eq "-odd" or
-              $val eq "-single" or
-              $val eq "-head" or
-              $val eq "-inner" or
-              $val eq "-tail" ) {
-
-      $okay = true;
-      $cmmd = $val;
-
-    # hyphen followed by capital letters or digits is a variable
-
-    } elsif ( $val =~ /^\-([A-Z0-9]+)([a-z]?)$/ ) {
-
-      my $variable = $1;
-      my $first_or_last = $2;
-
-      $cmmd = "-element";
-
-      # detect f, l, etc., variable suffix for -first, -last, etc. (undocumented)
-
-      if ( $first_or_last eq "f" ) {
-        $cmmd = "-first";
-      } elsif ( $first_or_last eq "l" ) {
-        $cmmd = "-last";
-      } elsif ( $first_or_last eq "e" ) {
-        $cmmd = "-even";
-      } elsif ( $first_or_last eq "o" ) {
-        $cmmd = "-odd";
-      } elsif ( $first_or_last eq "s" ) {
-        $cmmd = "-single";
-      } elsif ( $first_or_last eq "h" ) {
-        $cmmd = "-head";
-      } elsif ( $first_or_last eq "i" ) {
-        $cmmd = "-inner";
-      } elsif ( $first_or_last eq "t" ) {
-        $cmmd = "-tail";
-      }
-
-      # clear the variable to avoid old value persisting if no subsequent match
-      # commented out in favor of clearing by assigning a literal value containing an empty string
-
-      # $memory{"$variable"} = "";
-
-      $i++;
-      $val = $args[$i];
-
-      # set variable to new value
-
-      if ( $val =~ /^\((.*)\)$/ ) {
-
-        # parentheses contain literal value for variable
-
-        $val = convert_slash ( $1 );
-        $memory{"$variable"} = "$val";
-
-      } else {
-
-        # if XML tag name, record what would otherwise be printed by -element
-
-        process_element ( $line, $val, $tab, $pfx, $sfx, $sep, $cmmd, $variable, $indx, $sclr );
-
-      }
-
-    # reality check on unsupported arguments
-
-    } elsif ( $val =~ /^\-/ ) {
-
-      print STDERR "Unrecognized argument '$val'\n";
-
-    } elsif ( $okay ) {
-
-      # process -element values
-
-      if ( process_element ( $line, $val, $tab, $pfx, $sfx, $sep, $cmmd, "", $indx, $sclr )) {
-        $tab = $col;
-        $ret = $lin;
-      }
-
-    } else {
-
-      print STDERR "No -element before '$val'\n";
-
-    }
-  }
-
-  return $tab, $ret;
-}
-
-sub has_content {
-
-  my $line = shift (@_);
-  my $str = shift (@_);
-  my $indx = shift (@_);
-  my $sclr = shift (@_);
-
-  if ( $str eq "" or $str eq "&" ) {
-    return false;
-  }
-
-
-  # match by non-empty variable value (undocumented)
-
-  if ( $str =~ /^&([A-Z0-9]+)$/ ) {
-    $str = $1;
-    if ( exists ($memory{"$str"}) and $memory{"$str"} ne "" ) {
-      return true;
-    }
-    return false;
-  }
-
-  # remove leading backslash used to prevent content from being mistaken for variable
-
-  $str =~ s/^\\&/&/;
-
-
-  # match by exploration flag (undocumented)
-
-  if ( $str =~ /^\?([A-Z]+)/ ) {
-    $str = $1;
-
-    # first and last elements are always present, and are the same for a one-element list
-
-    if ( $str eq "FIRST" and $indx == 1 ) {
-      return true;
-    } elsif ( $str eq "LAST" and $indx == $sclr ) {
-      return true;
-
-    # single, head, inner, tail are mutually exclusive situations
-
-    } elsif ( $str eq "SINGLE" and $sclr == 1 ) {
-      return true;
-
-    } elsif ( $str eq "HEAD" and $indx == 1 and $sclr > 1 ) {
-      return true;
-    } elsif ( $str eq "INNER" and $indx > 1 and $indx < $sclr ) {
-      return true;
-    } elsif ( $str eq "TAIL" and $indx == $sclr and $sclr > 1 ) {
-      return true;
-
-    # even and odd are based only on index value, not list length
-
-    } elsif ( $str eq "EVEN" and ( $indx % 2) == 0 ) {
-      return true;
-    } elsif ( $str eq "ODD" and ( $indx % 2) == 1 ) {
-      return true;
-    }
-  }
-
-  # remove leading backslash used to prevent content from being mistaken for flag
-
-  $str =~ s/^\\\?/?/;
-
-
-  # exact match by data contents, not using regular expression, case insensitive
-
-  if ( index ( uc($line), uc($str) ) >= 0 ) {
-    return true;
-  }
-
-  return false;
-}
-
-sub has_element {
-
-  my $line = shift (@_);
-  my $str = shift (@_);
-
-  if ( $str eq "" ) {
-    return false;
-  }
-
-
-  # track depth level of tokens if pattern was parent/child (undocumented)
-  
-  if ( $str =~ /(.+)\/(.+)/ ) {
-    my $ptrn = $1;
-    my $chld = $2;
-
-    my @vals = ();
-    my $lvl = 0;
-    my $dpth = 0;
-    my $in_pat = false;
-
-    my @tokens = split (/(?<=>)(?=<)/, $line);
-    foreach $tkn (@tokens) {
-
-      if ( $lvl < 0 ) {
-        # ignore remainder if level drops below zero
-      } elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
-        # content-containing tag does not change level
-        if ( $in_pat and $lvl == $dpth ) {
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /\/>$/ ) {
-        # self-closing token does not change level
-        if ( $in_pat and $lvl == $dpth ) {
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /^<[^\/]/ ) {
-        # open tag increments level
-        $lvl++;
-        if ( $tkn =~ /<$ptrn(?:\s+.+?)?>/ and $dpth == 0 ) {
-          $in_pat = true;
-          $dpth = $lvl;
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /^<\// ) {
-        # close tag decrements level
-        if ( $in_pat and $tkn =~ /<\/$ptrn>/ and $lvl == $dpth ) {
-          push (@vals, $tkn);
-          $in_pat = false;
-          $dpth = 0;
-        }
-        $lvl--;
-      }
-    }
-
-    if ( scalar @vals > 0 ) {
-      my $sub = join ("", @vals);
-      if ( has_element ( $sub, $chld )) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-
-  # match by specific variable value (undocumented)
-
-  if ( $str =~ /^\&([A-Z0-9]+)\:(.+)$/ ) {
-    my $ptrn = $1;
-    my $chld = $2;
-
-    if ( exists ($memory{"$ptrn"}) and $memory{"$ptrn"} eq "$chld" ) {
-      return true;
-    }
-
-    return false;
-  }
-
-
-  # match by non-empty variable value (undocumented)
-
-  if ( $str =~ /^\&([A-Z0-9]+)$/ ) {
-    $str = $1;
-
-    if ( exists ($memory{"$str"}) and $memory{"$str"} ne "" ) {
-      return true;
-    }
-
-    return false;
-  }
-
-
-  # regular expression search for element@attribute:value
-
-  if ( $str =~ /(.+)@(.+)\:(.+)/ ) {
-    my $ptrn = $1;
-    my $attr = $2;
-    my $chld = $3;
-
-    if ( $line =~ /<$ptrn ([^>]+)>/i ) {
-      my $atts = $1;
-      if ( $atts =~ /$attr=\"$chld\"/ ) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-
-  # regular expression search for @attribute:value
-
-  if ( $str =~ /@(.+)\:(.+)/ ) {
-    my $attr = $1;
-    my $chld = $2;
-
-    if ( $line =~ /<\S+ ([^>]+)>/i ) {
-      my $atts = $1;
-      if ( $atts =~ /$attr=\"$chld\"/ ) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-
-  # regular expression search for element@attribute
-
-  if ( $str =~ /(.+)@(.+)/ ) {
-    my $ptrn = $1;
-    my $attr = $2;
-
-    if ( $line =~ /<$ptrn ([^>]+)>/i ) {
-      my $atts = $1;
-      if ( $atts =~ /$attr=\"([^\"]+)\"/ ) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-
-  # regular expression search for @attribute
-
-  if ( $str =~ /@(.+)/ ) {
-    my $attr = $1;
-
-    if ( $line =~ /<\S+ ([^>]+)>/i ) {
-      my $atts = $1;
-      if ( $atts =~ /$attr=\"([^\"]+)\"/ ) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-
-  # INSDSeq special cases - &Feature:name and &Qualifier:name
-
-  if ( $str =~ /^\&Feature\:(.+)/ ) {
-
-    $str = "INSDFeature_key:" . "$1";
-
-  } elsif ( $str =~ /^\&Qualifier\:(.+)/ ) {
-
-    $str = "INSDQualifier_name:" . "$1";
-  }
-
-
-  # regular expression search for element:value
-
-  if ( $str =~ /(.+)\:(.+)/ ) {
-    my $ptrn = $1;
-    my $chld = $2;
-
-    if ( $line =~ /<$ptrn(?:\s+.+?)?>$chld<\/$ptrn>/i ) {
-      return true;
-    }
-
-    return false;
-  }
-
-
-  # regular expression search for element tag
-
-  if ( $line =~ /<$str(?:\s+.+?)?>/i ) {
-    return true;
-  }
-
-  # also test self-closing tag
-
-  if ( $line =~ /<$str(?:\s+.+?)?\/>/i ) {
-    return true;
-  }
-
-  return false;
-}
-
-sub process_level {
-
-  my $line = shift (@_);
-  my $par = shift (@_);
-  my $chd = shift (@_);
-  my @args = @{shift (@_)};
-  my $tab = shift (@_);
-  my $ret = shift (@_);
-  my $level = shift (@_);
-  my $indx = shift (@_);
-  my $sclr = shift (@_);
-
-
-  # handle heterogeneous child blocks if pattern was parent/*
-
-  if ( $chd eq "*" and $par ne "" and $par ne "*" ) {
-
-    my @vals = ();
-    my $lvl = 0;
-    my $in_pat = false;
-    my $pfx = "";
-    my @working = ();
-
-    if ( $line =~ /^<$par(?:\s+.+?)?>(.+?)<\/$par>$/ ) {
-      $line = $1;
-    }
-
-    my @tokens = split (/(?<=>)(?=<)/, $line);
-    foreach $tkn (@tokens) {
-
-      if ( $lvl < 0 ) {
-        # ignore remainder if level drops below zero
-      } elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
-        # content-containing tag does not change level
-        if ( $lvl > 0 ) {
-          push (@working, $tkn);
-        } else {
-          # match to full object, push contents directly to final array
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /\/>$/ ) {
-        # self-closing token does not change level
-        if ( $lvl > 0 ) {
-          push (@working, $tkn);
-        } else {
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /^<[^\/]/ ) {
-        # open tag increments level
-        push (@working, $tkn);
-        $lvl++;
-      } elsif ( $tkn =~ /^<\// ) {
-        # close tag decrements level
-        push (@working, $tkn);
-        $lvl--;
-        if ( $lvl == 0 ) {
-          my $str = join ("", @working);
-          push (@vals, $str);
-          @working = ();
-        }
-      }
-    }
-    if ( scalar @working > 0 ) {
-      my $str = join ("", @working);
-      push (@vals, $str);
-      @working = ();
-    }
-
-    foreach $val (@vals) {
-      if ( $level == $initial_level ) {
-        print "$pfx";
-        %memory = ();
-      }
-      ( $tab, $ret ) = process_level ( $val, "", "", \@args, $tab, $ret, $level, $indx, $sclr );
-      if ( $level == $initial_level ) {
-        $pfx = $ret;
-        $tab = "";
-      }
-    }
-
-    return $tab, $ret;
-  }
-
-  # track depth level of tokens if pattern was parent/child
-
-  if ( $chd ne "" ) {
-
-    my @vals = ();
-    my $lvl = 0;
-    my $dpth = 0;
-    my $in_pat = false;
-    my $pfx = "";
-    my @working = ();
-
-    my @tokens = split (/(?<=>)(?=<)/, $line);
-    foreach $tkn (@tokens) {
-
-      if ( $lvl < 0 ) {
-        # ignore remainder if level drops below zero
-      } elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
-        # content-containing tag does not change level
-        if ( $in_pat ) {
-          push (@working, $tkn);
-        } elsif ( $tkn =~ /<$chd(?:\s+.+?)?>(.+?)<\/$chd>/ and $lvl < 2 ) {
-          # match to full object, push directly to final array
-          push (@vals, $tkn);
-        }
-      } elsif ( $tkn =~ /\/>$/ ) {
-        # self-closing token does not change level
-        if ( $in_pat ) {
-          push (@working, $tkn);
-        }
-      } elsif ( $tkn =~ /^<[^\/]/ ) {
-        # open tag increments level
-        if ( $in_pat ) {
-          push (@working, $tkn);
-        } elsif ( $tkn =~ /<$chd(?:\s+.+?)?>/ and $lvl < 2 ) {
-          $in_pat = true;
-          $dpth = $lvl;
-          push (@working, $tkn);
-        }
-        $lvl++;
-      } elsif ( $tkn =~ /^<\// ) {
-        # close tag decrements level
-        $lvl--;
-        if ( $in_pat ) {
-          push (@working, $tkn);
-        }
-        if ( $tkn =~ /<\/$chd>/ ) {
-          if ( $lvl <= $dpth ) {
-            $in_pat = false;
-            my $str = join ("", @working);
-            push (@vals, $str);
-            @working = ();
-          }
-        }
-      }
-    }
-    if ( scalar @working > 0 ) {
-      my $str = join ("", @working);
-      push (@vals, $str);
-      @working = ();
-    }
-
-    foreach $val (@vals) {
-      if ( $level == $initial_level ) {
-        print "$pfx";
-        %memory = ();
-      }
-      ( $tab, $ret ) = process_level ( $val, "", "", \@args, $tab, $ret, $level, $indx, $sclr );
-      if ( $level == $initial_level ) {
-        $pfx = $ret;
-        $tab = "";
-      }
-    }
-
-    return $tab, $ret;
-  }
-
-
-  my $max = scalar @args;
-
-
-  # allow conditional arguments in any order
-
-  my $go_on = true;
-  while ( $go_on ) {
-
-    if ( $max > 2 and $args[0] eq "-position" ) {
-
-      # -position command filters by object position in list
-
-      my $pstn = $args[1];
-      $max -= 2;
-      @args = splice (@args, 2, $max);
-
-      # first and last elements are always present, and are the same for a one-element list
-
-      if ( $pstn eq "first" and  $indx > 1 ) {
-        return $tab, $ret;
-      } elsif ( $pstn eq "last" and $indx < $sclr ) {
-        return $tab, $ret;
-
-      # single, head, inner, tail are mutually exclusive situations
-
-      } elsif ( $pstn eq "single" and $sclr > 1 ) {
-        return $tab, $ret;
-
-      } elsif ( $pstn eq "head" and ( $indx > 1 or $sclr == 1 ) ) {
-        return $tab, $ret;
-      } elsif ( $pstn eq "inner" and ( $indx == 1 or $indx == $sclr ) ) {
-        return $tab, $ret;
-      } elsif ( $pstn eq "tail" and ( $indx < $sclr or $sclr == 1 ) ) {
-        return $tab, $ret;
-
-      # single and multiple are mutually exclusive situations
-
-      } elsif ( $pstn eq "multiple" and $sclr == 1 ) {
-        return $tab, $ret;
-
-      # even and odd are based only on index value, not list length
-
-      } elsif ( $pstn eq "even" and ( $indx % 2) == 1 ) {
-        return $tab, $ret;
-      } elsif ( $pstn eq "odd" and ( $indx % 2) == 0 ) {
-        return $tab, $ret;
-
-      # value is actual numeric index
-
-      } elsif ( $pstn =~ /^[0-9]+$/ and $indx != $pstn ) {
-        return $tab, $ret;
-      }
-
-    } elsif ( $max > 2 and $args[0] eq "-match" ) {
-
-      # -match ... -and/-or ... command filters for indicated element tag [:value]
-
-      my $required = 1;
-      my $observed = 0;
-      my $prevbool = "";
-
-      my $mtch = $args[1];
-      $max -= 2;
-      @args = splice (@args, 2, $max);
-
-      if ( has_element ( $line, $mtch )) {
-        $observed++;
-      }
-
-      # "or" succeeds on any match, "and" requires all tests to match
-
-      while ( $max > 2 and ( $args[0] eq "-or" or $args[0] eq "-and" )) {
-
-        if ( $prevbool ne "" and $prevbool ne $args[0] ) {
-          print STDERR "A mixture of -and and -or commands cannot follow -match\n";
-          return $tab, $ret;
-        }
-
-        $prevbool = $args[0];
-        if ( $prevbool eq "-and" ) {
-          $required++;
-        }
-
-        $mtch = $args[1];
-        $max -= 2;
-        @args = splice (@args, 2, $max);
-
-        if ( has_element ( $line, $mtch )) {
-          $observed++;
-        }
-      }
-
-      if ( $observed < $required ) {
-        return $tab, $ret;
-      }
-
-    } elsif ( $max > 2 and $args[0] eq "-avoid" ) {
-
-      # -avoid ... -and ... command filters against indicated element tag [:value]
-
-      my $skip = $args[1];
-      $max -= 2;
-      @args = splice (@args, 2, $max);
-
-      if ( has_element ( $line, $skip )) {
-        return $tab, $ret;
-      }
-
-      # colloquial "and" is really logical "or" - any match bails out of function
-
-      while ( $max > 2 and $args[0] eq "-and" ) {
-
-        $skip = $args[1];
-        $max -= 2;
-        @args = splice (@args, 2, $max);
-
-        if ( has_element ( $line, $skip )) {
-          return $tab, $ret;
-        }
-      }
-
-      if ( $max > 1 and $args[0] eq "-or" ) {
-        print STDERR "The -or command cannot follow -avoid\n";
-        return $tab, $ret;
-      }
-
-    } elsif ( $max > 2 and $args[0] eq "-present" ) {
-
-      # -present ... -and/-or ... command filters for indicated data content
-
-      my $required = 1;
-      my $observed = 0;
-      my $prevbool = "";
-
-      my $mtch = $args[1];
-      $max -= 2;
-      @args = splice (@args, 2, $max);
-
-      if ( has_content ( $line, $mtch, $indx, $sclr )) {
-        $observed++;
-      }
-
-      # "or" succeeds on any match, "and" requires all tests to match
-
-      while ( $max > 2 and ( $args[0] eq "-or" or $args[0] eq "-and" )) {
-
-        if ( $prevbool ne "" and $prevbool ne $args[0] ) {
-          print STDERR "A mixture of -and and -or commands cannot follow -present\n";
-          return $tab, $ret;
-        }
-
-        $prevbool = $args[0];
-        if ( $prevbool eq "-and" ) {
-          $required++;
-        }
-
-        $mtch = $args[1];
-        $max -= 2;
-        @args = splice (@args, 2, $max);
-
-        if ( has_content ( $line, $mtch, $indx, $sclr )) {
-          $observed++;
-        }
-      }
-
-      if ( $observed < $required ) {
-        return $tab, $ret;
-      }
-
-    } elsif ( $max > 2 and $args[0] eq "-absent" ) {
-
-      # -absent ... -and ... command filters against indicated data content
-
-      my $skip = $args[1];
-      $max -= 2;
-      @args = splice (@args, 2, $max);
-
-      if ( has_content ( $line, $skip, $indx, $sclr )) {
-        return $tab, $ret;
-      }
-
-      # colloquial "and" is really logical "or" - any match bails out of function
-
-      while ( $max > 2 and $args[0] eq "-and" ) {
-
-        $skip = $args[1];
-        $max -= 2;
-        @args = splice (@args, 2, $max);
-
-        if ( has_content ( $line, $skip, $indx, $sclr )) {
-          return $tab, $ret;
-        }
-      }
-
-      if ( $max > 1 and $args[0] eq "-or" ) {
-        print STDERR "The -or command cannot follow -absent\n";
-        return $tab, $ret;
-      }
-
-    } elsif ( $max > 1 and $args[0] eq "-trim" ) {
-
-      # -trim removes first XML tag
-
-      while ( $max > 1 and $args[0] eq "-trim" ) {
-
-        $max--;
-        @args = splice (@args, 1, $max);
-
-        if ( $line =~ /^<.+?>(.+)$/ ) {
-          $line = $1;
-        }
-      }
-
-
-    } else {
-
-    # no other conditional tests or trim commands, break loop
-
-      $go_on = false;
-    }
-  }
-
-
-  # at level 0 break recursion for nested organizers, process -element
-
-  if ( $level < 1 ) {
-
-    ( $tab, $ret ) = process_flags ( $line, \@args, $tab, $ret, $indx, $sclr );
-    return $tab, $ret;
-  }
-
-
-  # get name of command for current organizer level
-
-  # initial capital signifies tokenized search matching nesting levels
-
-  # (tokenizing is significantly slower than regular expression pattern matching)
-
-  my $name = "";
-  my $capname = "";
-
-  if ( $level == 7 ) {
-    $name = "-division";
-    $capname = "-Division";
-  } elsif ( $level == 6) {
-    $name = "-group";
-    $capname = "-Group";
-  } elsif ( $level == 5) {
-    $name = "-branch";
-    $capname = "-Branch";
-  } elsif ( $level == 4) {
-    $name = "-block";
-    $capname = "-Block";
-  } elsif ( $level == 3) {
-    $name = "-section";
-    $capname = "-Section";
-  } elsif ( $level == 2) {
-    $name = "-subset";
-    $capname = "-Subset";
-  } elsif ( $level == 1) {
-    $name = "-unit";
-    $capname = "-Unit";
-  }
-
-
-  # find chains of arguments between indicated command names
-
-  my $start = 0;
-  my $stop = 0;
-
-  for ( $start = 0; $start < $max; $start = $stop ) {
-    $stop = $start + 1;
-    while ( $stop < $max and $args[$stop] ne $name and $args[$stop] ne $capname ) {
-      $stop++;
-    }
-
-    # collect arguments between -group, -block, or -subset directives
-
-    my @tmp = @args;
-
-    if ( $tmp[$start] eq $name and $stop - $start >= 2 ) {
-
-      my $pat = $tmp[$start + 1];
-      my $chd = "";
-
-      if ( $pat =~ /\*\/(.+)/ ) {
-        # -block */Taxon replaces -trim
-        $pat = $1;
-        if ( $line =~ /^<.+?>(.+)$/ ) {
-          $line = $1;
-        }
-      }
-
-      # INSDSeq special cases - &Feature and &Qualifier
-
-      if ( $pat =~ /^\&Feature$/ ) {
-        $pat = "INSDFeature";
-      } elsif ( $pat =~ /^\&Qualifier$/ ) {
-        $pat = "INSDQualifier";
-      }
-
-      if ( $pat =~ /(.+)\/(.+)/ ) {
-        $pat = $1;
-        $chd = $2;
-      }
-      $start += 2;
-      my @group = splice (@tmp, $start, $stop - $start);
-
-      # normal non-greedy pattern matching for non-nested XML
-
-      my @vals = ($line =~ /(<$pat(?:\s+.+?)?>.+?<\/$pat>)/g);
-
-      if ( scalar @vals > 0 ) {
-
-        my $curr = 0;
-        my $totl = scalar @vals;
-        foreach $val (@vals) {
-          $curr++;
-          ( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
-        }
-
-      } else {
-
-        @vals = ($line =~ /(<$pat(?:\s+.+?)\/>)/g);
-
-        my $curr = 0;
-        my $totl = scalar @vals;
-        foreach $val (@vals) {
-          $curr++;
-          ( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
-        }
-      }
-
-    } elsif ( $tmp[$start] eq $capname and $stop - $start >= 2 ) {
-
-      my $pat = $tmp[$start + 1];
-      my $chd = "";
-      if ( $pat =~ /(.+)\/(.+)/ ) {
-        $pat = $1;
-        $chd = $2;
-      }
-      $start += 2;
-      my @group = splice (@tmp, $start, $stop - $start);
-
-      # tokenized pattern matching for XML with nested tags
-
-      my @vals = ();
-      my $lvl = 0;
-      my $dpth = 0;
-      my $in_pat = false;
-      my @working = ();
-
-      my @tokens = split (/(?<=>)(?=<)/, $line);
-      foreach $tkn (@tokens) {
-
-        if ( $lvl < 0 ) {
-          # ignore remainder if level drops below zero
-        } elsif ( $tkn =~ /^<.+?>.+?<\/.+?>$/ ) {
-          # content-containing tag does not change level
-          if ( $in_pat ) {
-            push (@working, $tkn);
-          } elsif ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
-            # match to full object, push directly to final array
-            push (@vals, $tkn);
-          }
-        } elsif ( $tkn =~ /\/>$/ ) {
-          # self-closing token does not change level
-          if ( $in_pat ) {
-            push (@working, $tkn);
-          }
-        } elsif ( $tkn =~ /^<[^\/]/ ) {
-          # open tag increments level
-          if ( $in_pat ) {
-            push (@working, $tkn);
-          } elsif ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
-            $in_pat = true;
-            $dpth = $lvl;
-            push (@working, $tkn);
-          }
-          $lvl++;
-        } elsif ( $tkn =~ /^<\// ) {
-          # close tag decrements level
-          $lvl--;
-          if ( $in_pat ) {
-            push (@working, $tkn);
-          }
-          if ( $tkn =~ /<\/$pat>/ ) {
-            if ( $lvl <= $dpth ) {
-              $in_pat = false;
-              my $str = join ("", @working);
-              push (@vals, $str);
-              @working = ();
-            }
-          }
-        }
-      }
-      if ( scalar @working > 0 ) {
-        my $str = join ("", @working);
-        push (@vals, $str);
-        @working = ();
-      }
-
-      my $curr = 0;
-      my $totl = scalar @vals;
-      foreach $val (@vals) {
-        $curr++;
-        ( $tab, $ret ) = process_level ( $val, $pat, $chd, \@group, $tab, $ret, $level - 1, $curr, $totl );
-      }
-
-    } else {
-
-      my @group = splice (@tmp, $start, $stop - $start);
-      ( $tab, $ret ) = process_level ( $line, "", "", \@group, $tab, $ret, $level - 1, $indx, $sclr );
-    }
-  }
-
-  return $tab, $ret;
-}
-
-sub process_insd {
-
-  # ... | xtract -insd [+/-] mat_peptide "%peptide" product peptide
-
-  my $quot = shift (@_);
-  my @args = @{shift (@_)};
-
-  my $max = scalar @args;
-
-  my @working = ();
-
-  # print common arguments
-
-  push (@working, "-pattern");
-  push (@working, "INSDSeq");
-  push (@working, "-ACCN");
-  push (@working, "INSDSeq_accession-version");
-
-  # collect descriptors
-
-  if ( $args[1] =~ /INSD/ ) {
-
-    push (@working, "-pfx");
-    push (@working, "$quot\\n$quot");
-    push (@working, "-element");
-    push (@working, "$quot\&ACCN$quot");
-
-    for ( my $i = 1; $i < $max; $i++ ) {
-      my $val = $args[$i];
-
-      push (@working, "-block");
-      push (@working, "INSDSeq");
-      push (@working, "-element");
-      push (@working, "$quot$val$quot");
-    }
-
-    return @working;
-  }
-
-  # collect qualifiers
-
-  my @qualifiers = ();
-  my @required = ();
-
-  my $feat = "";
-  my $partial = 0;
-
-  for ( my $i = 1; $i < $max; $i++ ) {
-    my $val = $args[$i];
-
-    if ( $feat eq "" ) {
-      if ( $val eq "+" or $val eq "complete" ) {
-        $partial = 1;
-      } elsif ( $val eq "-" or $val eq "partial" ) {
-        $partial = -1;
-      } else {
-        $feat = $val;
-      }
-    } else {
-      if ( $val =~ /^\+(.+)/ ) {
-        $val = $1;
-        push (@required, $val);
-      }
-      push (@qualifiers, $val);
-    }
-  }
-
-  push (@working, "-group");
-  push (@working, "INSDFeature");
-
-  my @itms = split (',', $feat);
-  my $fcmd = "-match";
-  foreach $itm (@itms) {
-    push (@working, "$fcmd");
-    push (@working, "INSDFeature_key:$itm");
-    $fcmd = "-or";
-  }
-
-  if ( $partial == 1 ) {
-    push (@working, "-avoid");
-    push (@working, "INSDFeature_partial5");
-    push (@working, "-and");
-    push (@working, "INSDFeature_partial3");
-  } elsif ( $partial == -1 ) {
-    push (@working, "-match");
-    push (@working, "INSDFeature_partial5");
-    push (@working, "-or");
-    push (@working, "INSDFeature_partial3");
-  }
-
-  my $rcmd = "-match";
-  foreach my $val (@required) {
-    push (@working, "$rcmd");
-    push (@working, "INSDQualifier_name:$val");
-    $rcmd = "-and";
-  }
-
-  push (@working, "-pfx");
-  push (@working, "$quot\\n$quot");
-  push (@working, "-element");
-  push (@working, "$quot&ACCN$quot");
-
-  foreach my $val (@qualifiers) {
-    if ( $val =~ /INSD/ ) {
-      push (@working, "-block");
-      push (@working, "INSDFeature");
-      push (@working, "-element");
-      push (@working, "$quot$val$quot");
-
-    } else {
-
-      push (@working, "-block");
-      push (@working, "INSDQualifier");
-
-      if ( $val =~ /^%(.+)/ ) {
-        $val = $1;
-        push (@working, "-match");
-        push (@working, "INSDQualifier_name:$val");
-        push (@working, "-element");
-        push (@working, "$quot\%INSDQualifier_value$quot");
-      } else {
-        push (@working, "-match");
-        push (@working, "INSDQualifier_name:$val");
-        push (@working, "-element");
-        push (@working, "INSDQualifier_value");
-      }
-    }
-  }
-
-  return @working;
-}
-
-my $xtract_help = qq{
-This Perl version of xtract is obsolete, and is no longer maintained or
-supported. The new version, which is written in the Go language, is more
-than an order of magnitude faster. To build the new xtract, download and
-install the open source Go compiler on your computer, then run:
-
-  go build xtract.go
-
-};
-
-sub xtract {
-
-  # ... | xtract -pattern ... -group ... -block ... -subset ... -element ...
-
-  my $compress_spaces = true;
-
-  my $max = scalar @ARGV;
-
-  if ( $max < 1 ) {
-    print STDERR "No arguments were supplied to xtract.pl\n";
-    return;
-  }
-
-  my $timer = false;
-
-  if ( $ARGV[0] eq "-timer" ) {
-
-    # report execution time of original Perl implementation for comparison to new Go compiled version
-    $timer = true;
-
-  } elsif ( 0 && $ARGV[0] ne "-fallback" ) {
-
-    # ensure that platform-specific Go compiled version was used if available
-    print STDERR "\nPLEASE REWRITE YOUR SCRIPT TO CALL XTRACT INSTEAD OF XTRACT.PL.\n";
-    print STDERR "XTRACT WILL RUN AN IMPROVED, COMPILED EXECUTABLE THAT CAN USE\n";
-    print STDERR "MULTIPLE CPU CORES TO PROCESS RECORDS BETWEEN ONE AND TWO ORDERS\n";
-    print STDERR "OF MAGNITUDE FASTER THAN THE ORIGINAL PERL IMPLEMENTATION.\n\n";
-    return;
-  }
-
-  # skip past -timer or -fallback command
-  shift @ARGV;
-  $max = scalar @ARGV;
-
-  if ( $max > 0 and $ARGV[0] eq "-help" ) {
-    print "xtract $version (Perl version)\n";
-    print $xtract_help;
-    return;
-  }
-
-  if ( $max > 0 and $ARGV[0] eq "-version" ) {
-    print "$version\n";
-    return;
-  }
-
-  # report deprecated commands
-
-  if ( $max > 0 ) {
-    foreach $argument (@ARGV) {
-      if ( $argument eq "-present" ) {
-        print STDERR "Argument -present is deprecated, use -match Object:Value\n";
-      }
-      if ( $argument eq "-absent" ) {
-        print STDERR "Argument -absent is deprecated, use -avoid Object:Value\n";
-      }
-      if ( $argument eq "-trim" ) {
-        print STDERR "Argument -trim is deprecated, use -block */Object\n";
-      }
-      if ( $argument eq "-make" or $argument eq "-fuse" or $argument eq "-split" or $argument eq "-pipe" or $argument eq "-repeat" ) {
-        print STDERR "Argument '$argument' is deprecated\n";
-      }
-    }
-  }
-
-  # -nocompress allows reading of BLAST XML match data with internal runs of spaces (undocumented)
-
-  if ( $max > 0 and $ARGV[0] eq "-nocompress" ) {
-    shift @ARGV;
-    $max = scalar @ARGV;
-    $compress_spaces = false;
-  }
-
-  # -insd to simplify extraction of INSDSeq qualifiers
-
-  if ( $max > 0 and $ARGV[0] eq "-insd" ) {
-    if ( $max < 3 and $ARGV[1] !~ /INSD/) {
-      print STDERR "Must supply a feature key and at least one qualifier name\n";
-      return;
-
-    } elsif ( -t STDIN ) {
-
-      # -insd without piped input will generate an extraction script
-
-      @ARGV = process_insd ( "\"", \@ARGV );
-      print "xtract ";
-      foreach $itm (@ARGV) {
-        print "$itm ";
-      }
-      print "| \\\n";
-      return;
-
-    } else {
-
-      # -insd with piped input will dynamically execute the extraction
-
-      @ARGV = process_insd ( "", \@ARGV );
-      $max = scalar @ARGV;
-    }
-  }
-
-  # recommended nested organizer levels are -group, -block, and -subset
-
-  # intermediate levels (-division, -branch, -section, and -unit) kept in reserve
-
-  # initial capitalized versions (-Group, -Block, etc.) use tokenized pattern matching
-
-  if ( $max > 1 and $ARGV[0] eq "-split" ) {
-
-    # -split reads data stream and writes chunks on separate lines
-
-    my $pat = $ARGV[1];
-    my $in_pat = false;
-    my @working = ();
-
-    while ( defined($thisline = <STDIN>) ) {
-      $thisline =~ s/\r//g;
-      $thisline =~ s/\n//g;
-      $thisline =~ s/\t//g;
-      if ( $compress_spaces ) {
-        $thisline =~ s/ +/ /g;
-      }
-      $thisline =~ s/> +</></g;
-
-      my @tokens = split (/(?<=>)(?=<)/, $thisline);
-      foreach $tkn (@tokens) {
-        if ( $tkn =~ /^ +(.+)$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /^(.+) +$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
-          if ( scalar @working > 0 ) {
-            my $str = join ("", @working);
-            print "$str\n";
-            @working = ();
-          }
-          $in_pat = false;
-          print "$tkn\n";
-        } elsif ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
-          push (@working, $tkn);
-          $in_pat = true;
-        } elsif ( $tkn =~ /<\/$pat>/ ) {
-          push (@working, $tkn);
-          $in_pat = false;
-          my $str = join ("", @working);
-          print "$str\n";
-          @working = ();
-        } elsif ( $in_pat) {
-          push (@working, $tkn);
-        }
-      }
-    }
-    if ( scalar @working > 0 ) {
-      my $str = join ("", @working);
-      print "$str\n";
-    }
-
-    return;
-  }
-
-  if ( $max > 0 and $ARGV[0] eq "-format" ) {
-
-    my $lvl = 0;
-
-    my $xml_ok = true;
-    my $doctype_ok = true;
-
-    my $root_token = "";
-    my $print_head = false;
-    my $print_tail = false;
-
-    my $xml = "";
-    my $doctype = "";
-    my $pfx = "";
-    my $sfx = "";
-
-    my $go_on = true;
-
-    # support -pfx, -sfx, -xml, and -doctype arguments
-
-    for ( my $i = 1; $i < $max; $i++ ) {
-
-      if ( $ARGV[$i] eq "-pfx" ) {
-        $i++;
-        $pfx = convert_slash ( $ARGV[$i] );
-      } elsif ( $ARGV[$i] eq "-sfx" ) {
-        $i++;
-        $sfx = convert_slash ( $ARGV[$i] );
-      } elsif ( $ARGV[$i] eq "-xml" ) {
-        $i++;
-        $xml = convert_slash ( $ARGV[$i] );
-      } elsif ( $ARGV[$i] eq "-doctype" ) {
-        $i++;
-        $doctype = convert_slash ( $ARGV[$i] );
-      }
-    }
-
-    while ( $go_on ) {
-
-      $thisline = "";
-      if ( $xml ne "" ) {
-        $thisline = $xml;
-        $xml = "";
-      } elsif ( $doctype ne "" ) {
-        $thisline = $doctype;
-        $doctype = "";
-      } elsif ( $pfx ne "" ) {
-        $thisline = $pfx;
-        $pfx = "";
-      } elsif ( defined($thisline = <STDIN>) ) {
-      } elsif ( $sfx ne "" ) {
-        $thisline = $sfx;
-        $sfx = "";
-      } else {
-        $go_on = false;
-      }
-
-      if ( $go_on and $thisline ne "" ) {
-        $thisline =~ s/\r//g;
-        $thisline =~ s/\n//g;
-        $thisline =~ s/\t//g;
-        if ( $compress_spaces ) {
-          $thisline =~ s/ +/ /g;
-        }
-        $thisline =~ s/> +</></g;
-
-        my @tokens = split (/(?<=>)(?=<)/, $thisline);
-        foreach $tkn (@tokens) {
-          if ( $tkn =~ /^ +(.+)$/ ) {
-            $tkn = $1;
-          }
-          if ( $tkn =~ /^(.+) +$/ ) {
-            $tkn = $1;
-          }
-          if ( $lvl < 0 ) {
-            # ignore remainder if level drops below zero
-          } elsif ( $tkn =~ /^<\?xml/ ) {
-            if ( $xml_ok ) {
-              # only print first xml line
-              print "$tkn\n";
-              $xml_ok = false;
-            }
-          } elsif ( $tkn =~ /^<!DOCTYPE (\S+)/ ) {
-            if ( $doctype_ok ) {
-              # only print first DOCTYPE line, extract root token
-              $root_token = $1;
-              print "$tkn\n";
-              $doctype_ok = false;
-              $print_head = true;
-            }
-          } elsif ( $tkn =~ /^<\?.+\?>$/ ) {
-            # processing instruction
-            $lvl++;
-            for ( $i = 1; $i < $lvl; $i++ ) {
-              print "  ";
-            }
-            print "$tkn\n";
-            $lvl--;
-          } elsif ( $tkn =~ /^<\!--.+-->$/ ) {
-            # comment
-            $lvl++;
-            for ( $i = 1; $i < $lvl; $i++ ) {
-              print "  ";
-            }
-            print "$tkn\n";
-            $lvl--;
-          } elsif ( $tkn =~ /^<.+?>.+?<\/(.+?)>$/ ) {
-            # content-containing tag
-            if ( $tkn =~ /^<(\S+).*?>.*>$/ ) {
-              $lvl++;
-              for ( $i = 1; $i < $lvl; $i++ ) {
-                print "  ";
-              }
-              print "$tkn\n";
-              $lvl--;
-            }
-          } elsif ( $tkn =~ /^<.+?\/>$/ ) {
-            # self-closing token
-            if ( $tkn =~ /^<(\S+).*?\/>$/ ) {
-              $lvl++;
-              for ( $i = 1; $i < $lvl; $i++ ) {
-                print "  ";
-              }
-              print "$tkn\n";
-              $lvl--;
-            }
-          } elsif ( $tkn =~ /^<[^\/]/ ) {
-            # open tag increments level
-            $lvl++;
-            if ( $tkn eq "<$root_token>" ) {
-              if ( $print_head ) {
-                print "<$root_token>\n";
-                $print_head = false;
-                $print_tail = true;
-              }
-            } else {
-              if ( $tkn =~ /^<(\S+).*?>$/ ) {
-                for ( $i = 1; $i < $lvl; $i++ ) {
-                  print "  ";
-                }
-                print "$tkn\n";
-              } elsif ( $tkn =~ /^<(\S+).*?$/ ) {
-                for ( $i = 1; $i < $lvl; $i++ ) {
-                  print "  ";
-                }
-                print "$tkn";
-              }
-            }
-          } elsif ( $tkn =~ /^<\// ) {
-            # close tag decrements level
-            if ( $tkn ne "</$root_token>" ) {
-              for ( $i = 1; $i < $lvl; $i++ ) {
-                print "  ";
-              }
-              print "$tkn\n";
-            }
-            $lvl--;
-          } elsif ( $tkn =~ /^>$/ ) {
-            print "$tkn\n";
-          } elsif ( $tkn =~ />$/ ) {
-            print " $tkn\n";
-          } else {
-            print " $tkn";
-          }
-        }
-      }
-    }
-
-    if ( $print_tail ) {
-      print "</$root_token>\n";
-    }
-
-    return;
-  }
-
-  my $ret = "";
-  my $tab = "";
-
-  if ( $max > 2 and $ARGV[0] eq "-pipe" ) {
-
-    # -pipe reads data that has already been compressed by xtract -split
-
-    shift @ARGV;
-    while ( defined($thisline = <STDIN>) ) {
-      $thisline =~ s/\r//g;
-      $thisline =~ s/\n//g;
-      $thisline =~ s/\t//g;
-      if ( $compress_spaces ) {
-        $thisline =~ s/ +/ /g;
-      }
-      $thisline =~ s/> +</></g;
-
-      my @tmp = @ARGV;
-      %memory = ();
-      ( $tab, $ret ) = process_level ( $thisline, "", "", \@tmp, "", "", $initial_level, 1, 1 );
-      print "$ret";
-    }
-
-    return;
-  }
-
-  my $begin_time = time();
-
-  if ( $max > 2 and $ARGV[0] eq "-pattern" ) {
-
-    # simple -pattern reads data stream and processes one pattern at a time
-
-    my $pat = $ARGV[1];
-    my $chd = "";
-    if ( $pat =~ /(.+)\/(.+)/ ) {
-      $pat = $1;
-      $chd = $2;
-    }
-
-    @ARGV = splice (@ARGV, 2, $max - 2);
-
-    my $in_pat = false;
-    my @working = ();
-
-    while ( defined($thisline = <STDIN>) ) {
-      $thisline =~ s/\r//g;
-      $thisline =~ s/\n//g;
-      $thisline =~ s/\t//g;
-      if ( $compress_spaces ) {
-        $thisline =~ s/ +/ /g;
-      }
-      $thisline =~ s/> +</></g;
-
-      my @tokens = split (/(?<=>)(?=<)/, $thisline);
-      foreach $tkn (@tokens) {
-        if ( $tkn =~ /^ +(.+)$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /^(.+) +$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
-          if ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
-            if ( scalar @working > 0 ) {
-              my $str = join ("", @working);
-              @working = ();
-              my @tmp = @ARGV;
-              %memory = ();
-              ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-              print "$ret";
-            }
-            $in_pat = false;
-            my @tmp = @ARGV;
-            %memory = ();
-            ( $tab, $ret ) = process_level ( $tkn, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-            print "$ret";
-          } else {
-            push (@working, $tkn);
-            $in_pat = true;
-          }
-        } elsif ( $tkn =~ /<\/$pat>/ ) {
-          push (@working, $tkn);
-          $in_pat = false;
-          my $str = join ("", @working);
-          @working = ();
-          my @tmp = @ARGV;
-          %memory = ();
-          ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-          print "$ret";
-        } elsif ( $in_pat) {
-          push (@working, $tkn);
-        }
-      }
-    }
-    if ( scalar @working > 0 ) {
-      my $str = join ("", @working);
-      @working = ();
-      my @tmp = @ARGV;
-      %memory = ();
-      ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-      print "$ret";
-    }
-
-    my $end_time = time();
-    my $elapsed = $end_time - $begin_time;
-    if ( $timer ) {
-      if ( $elapsed > 1 ) {
-        print STDERR "\nXtract.pl ran in $elapsed seconds\n\n";
-      } elsif ( $elapsed > 0 ) {
-        print STDERR "\nXtract.pl ran in $elapsed second\n\n";
-      }
-    }
-
-    return;
-  }
-
-  if ( $max > 2 and $ARGV[0] eq "-Pattern" ) {
-
-    # capitalized -Pattern tracks depth of top-level pattern (e.g., for nested Taxon structure)
-
-    my $pat = $ARGV[1];
-    my $chd = "";
-    if ( $pat =~ /(.+)\/(.+)/ ) {
-      $pat = $1;
-      $chd = $2;
-    }
-
-    @ARGV = splice (@ARGV, 2, $max - 2);
-
-    my $in_pat = false;
-    my $lvl = 0;
-    my @working = ();
-
-    while ( defined($thisline = <STDIN>) ) {
-      $thisline =~ s/\r//g;
-      $thisline =~ s/\n//g;
-      $thisline =~ s/\t//g;
-      if ( $compress_spaces ) {
-        $thisline =~ s/ +/ /g;
-      }
-      $thisline =~ s/> +</></g;
-
-      my @tokens = split (/(?<=>)(?=<)/, $thisline);
-      foreach $tkn (@tokens) {
-        if ( $tkn =~ /^ +(.+)$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /^(.+) +$/ ) {
-          $tkn = $1;
-        }
-        if ( $tkn =~ /<$pat(?:\s+.+?)?>/ ) {
-          if ( $tkn =~ /<$pat(?:\s+.+?)?>(.+?)<\/$pat>/ ) {
-            if ( scalar @working > 0 ) {
-              my $str = join ("", @working);
-              @working = ();
-              my @tmp = @ARGV;
-              %memory = ();
-              ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-              print "$ret";
-            }
-            $in_pat = false;
-            my @tmp = @ARGV;
-            %memory = ();
-            ( $tab, $ret ) = process_level ( $tkn, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-            print "$ret";
-          } else {
-            $lvl++;
-            push (@working, $tkn);
-            $in_pat = true;
-          }
-        } elsif ( $tkn =~ /<\/$pat>/ ) {
-          $lvl--;
-          if ($lvl < 1) {
-            push (@working, $tkn);
-            $in_pat = false;
-            my $str = join ("", @working);
-            @working = ();
-            my @tmp = @ARGV;
-            %memory = ();
-            ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-            print "$ret";
-          } elsif ( $in_pat) {
-            push (@working, $tkn);
-          }
-        } elsif ( $in_pat) {
-          push (@working, $tkn);
-        }
-      }
-    }
-    if ( scalar @working > 0 ) {
-      my $str = join ("", @working);
-      @working = ();
-      my @tmp = @ARGV;
-      %memory = ();
-      ( $tab, $ret ) = process_level ( $str, $pat, $chd, \@tmp, "", "", $initial_level, 1, 1 );
-      print "$ret";
-    }
-
-    return;
-  }
-
-  my $rpt = 1;
-
-  # read entire XML input stream into a single string
-
-  $holdTerminator = $/;
-  undef $/;
-  $data = <STDIN>;
-  $/ = $holdTerminator;
-
-  # remove newlines, tabs, space between tokens, compress runs of spaces,
-
-  $data =~ s/\r//g;
-  $data =~ s/\n//g;
-  $data =~ s/\t//g;
-  if ( $compress_spaces ) {
-   $data =~ s/ +/ /g;
-  }
-  $data =~ s/> +</></g;
-
-  if ( $max == 0 or ( $max == 1 and $ARGV[0] ne "-outline" and $ARGV[0] ne "-synopsis" )) {
-
-    # no useful arguments, just print entire line of compressed XML
-
-    print "$data\n";
-
-    return;
-  }
-
-  if ( $max > 2 and $ARGV[0] eq "-repeat" ) {
-
-    # first look for -repeat count for performance testing
-
-    $rpt = $ARGV[1];
-    @ARGV = splice (@ARGV, 2, $max - 2);
-    $max = scalar @ARGV;
-  }
-
-  if ( $ARGV[0] ne "-pattern" ) {
-
-    # no -pattern, process entire XML without looking for pattern
-
-    my @args = @ARGV;
-    %memory = ();
-    ( $tab, $ret ) = process_level ( $data, "", "", \@args, "", "", $initial_level, 1, 1 );
-    print "$ret";
-
-    return;
-  }
-
-  # split by -pattern - can handle single term or parent/child (including heterogeneous parent/*)
-
-  my $ptrn = $ARGV[1];
-  my $chld = "";
-  if ( $ptrn =~ /(.+)\/(.+)/ ) {
-    $ptrn = $1;
-    $chld = $2;
-  }
-  @vals = ($data =~ /(<$ptrn(?:\s+.+?)?>.+?<\/$ptrn>)/g);
-
-  if ( $max == 2 ) {
-    foreach $val (@vals) {
-      print "$val\n";
-    }
-
-    return;
-  }
-
-
-  # process remaining arguments
-
-  my @args = splice (@ARGV, 2, $max - 2);
-
-  my $curr = 0;
-  my $totl = scalar @vals;
-
-  for ( my $idx = 0; $idx < $rpt; $idx++ ) {
-    foreach $val (@vals) {
-      %memory = ();
-      ( $tab, $ret ) = process_level ( $val, $ptrn, $chld, \@args, "", "", $initial_level, $curr, $totl );
-      print "$ret";
-    }
-  }
-}
-
-# execute XML extraction
-
-xtract ();
-
-# if -synopsis, print cumulative summary of XML paths
-
-# if ( scalar (keys %synopsis_acc) > 0 ) {
-#   my @keys;
-#   @keys = keys %synopsis_acc;
-#   for my $ky (sort @keys) {
-#     $vl = $synopsis_acc{$ky};
-#     for ( $i = 0; $i < $vl; $i++ ) {
-#     	print "$ky\n";
-#     }
-#   }
-# }
-
-# close input and output files
-
-close (STDIN);
-close (STDOUT);
-close (STDERR);
author	Aaron M. Ucko <ucko@debian.org>	2017-10-05 22:06:53 -0400
committer	Aaron M. Ucko <ucko@debian.org>	2017-10-05 22:06:53 -0400
commit	2c3f0ede6219c7c8a98cbe6d0ee9f7e63573afe3 (patch)
tree	1ccf78163f4d890aaa6d00b9ff927383c6e4d73b
parent	f009e40f3fc2759c477eadb276a61b39705f8e99 (diff)
parent	57368b018e0265f06fca749d685ad69c986a0a93 (diff)