Merge tag 'upstream/7.40.20170926+ds'

Upstream version 7.40.20170926(+ds).
author: Aaron M. Ucko <ucko@debian.org> 2017-10-06 17:43:37 -0400
committer: Aaron M. Ucko <ucko@debian.org> 2017-10-06 17:44:16 -0400
commit: 0d510bdeee8747dd089294a7210944e8c236cdad (patch)
tree: fa4d2c46f995c8f7eaab9a3c6428710e8fc488fb
parent: 91f37314681b612fec14016b237f144f67d81a49 (diff)
parent: a0990267f337037396f8665df411d0b8bc641a66 (diff)
3 files changed, 254 insertions, 135 deletions
diff --git a/debian/changelog b/debian/changelog
index e5072b2..6bc8fc0 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-ncbi-entrez-direct (7.30.20170918+ds-1) UNRELEASED; urgency=medium
+ncbi-entrez-direct (7.40.20170926+ds-1) UNRELEASED; urgency=medium
 
   * New upstream release.  (NOT YET RELEASED.)
   * debian/control: Unconditionalize Go build dependencies.
@@ -8,7 +8,7 @@ ncbi-entrez-direct (7.30.20170918+ds-1) UNRELEASED; urgency=medium
   * debian/rules: Remove fallback logic to install the old Perl
     implementation of xtract, now retired upstream.
 
- -- Aaron M. Ucko <ucko@debian.org>  Fri, 06 Oct 2017 17:43:21 -0400
+ -- Aaron M. Ucko <ucko@debian.org>  Fri, 06 Oct 2017 17:43:37 -0400
 
 ncbi-entrez-direct (6.90.20170705+ds-2) unstable; urgency=medium
 
diff --git a/edirect.pl b/edirect.pl
index a755fd2..5326a3f 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -43,7 +43,7 @@ use File::Spec;
 
 # EDirect version number
 
-$version = "7.30";
+$version = "7.40";
 
 BEGIN
 {
@@ -197,6 +197,11 @@ sub clearflags {
   "shows#significantly#since#so#some#such#than#that#the#their#theirs#them#" .
   "then#there#therefore#these#they#this#those#through#thus#to#upon#use#used#" .
   "using#various#very#was#we#were#what#when#which#while#with#within#without#would#";
+
+  $os = "$^O";
+
+  $api_key = "";
+  $api_key = $ENV{NCBI_API_KEY} if defined $ENV{NCBI_API_KEY};
 }
 
 # gets a live UID for any database
@@ -432,27 +437,6 @@ sub get_email {
   return $addr;
 }
 
-# correct misspellings in query
-
-sub spell_check_query {
-
-  my $db = shift (@_);
-  my $qury = shift (@_);
-
-  my $url = $base . $espell;
-
-  my $enc = uri_escape($query);
-  $arg = "db=$db&term=$enc";
-
-  my $data = do_post ($url, $arg, $tool, $email, true);
-
-  Encode::_utf8_on($data);
-
-  $qury = $1 if ( $data =~ /<CorrectedQuery>(.+)<\/CorrectedQuery>/ );
-
-  return $qury;
-}
-
 # elink and epost currently need a separate ESearch to get the correct result count
 
 sub get_count {
@@ -471,6 +455,14 @@ sub get_count {
 
   $url .= "&edirect=$version";
 
+  if ( $os ne "" ) {
+    $url .= "&os=$os";
+  }
+
+  if ( $api_key ne "" ) {
+    $url .= "&api_key=$api_key";
+  }
+
   if ( $tulx eq "" ) {
     $tulx = "entrez-direct";
   }
@@ -545,6 +537,14 @@ sub get_uids {
 
   $url .= "&edirect=$version";
 
+  if ( $os ne "" ) {
+    $url .= "&os=$os";
+  }
+
+  if ( $api_key ne "" ) {
+    $url .= "&api_key=$api_key";
+  }
+
   if ( $tulx eq "" ) {
     $tulx = "edirect";
   }
@@ -593,6 +593,14 @@ sub do_post_yielding_ref {
   my $emlx = shift (@_);
   my $intr = shift (@_);
 
+  if ( $os ne "" ) {
+    $argx .= "&os=$os";
+  }
+
+  if ( $api_key ne "" ) {
+    $argx .= "&api_key=$api_key";
+  }
+
   $argx .= "&edirect=$version";
 
   if ( $intr ) {
@@ -1356,6 +1364,27 @@ sub process_extras {
   return $xtras;
 }
 
+# correct misspellings in query
+
+sub spell_check_query {
+  # Posts db and query to $base . $espell; returns the <CorrectedQuery> text
+  # from the reply, or the query unchanged when no such element is matched.
+  my $db = shift (@_);
+  my $qury = shift (@_);
+
+  my $url = $base . $espell;
+
+  # encode the query passed in, not whatever the global $query happens to hold
+  my $enc = uri_escape($qury);
+  $arg = "db=$db&term=$enc";
+
+  my $data = do_post ($url, $arg, $tool, $email, true);
+  Encode::_utf8_on($data);
+
+  $qury = $1 if ( $data =~ /<CorrectedQuery>(.+)<\/CorrectedQuery>/ );
+  return $qury;
+}
+
 sub efilt {
 
   # ... | edirect.pl -filter -query "bacteria [ORGN]" -days 365 | ...
@@ -1382,6 +1411,7 @@ sub efilt {
     "source=s" => \$source,
     "status=s" => \$status,
     "type=s" => \$gtype,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -2122,6 +2152,7 @@ sub eftch {
     "extrafeat=i" => \$extrafeat,
     "start=i" => \$min,
     "stop=i" => \$max,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "pipe" => \$pipe,
@@ -2669,6 +2700,7 @@ sub einfo {
     "dbs" => \$dbs,
     "fields" => \$fields,
     "links" => \$links,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -2726,6 +2758,16 @@ sub einfo {
     $prefix = "&";
   }
 
+  if ( $os ne "" ) {
+    $url .= "$prefix" . "os=$os";
+    $prefix = "&";
+  }
+
+  if ( $api_key ne "" ) {
+    $url .= "$prefix" . "api_key=$api_key";
+    $prefix = "&";
+  }
+
   $url .= "$prefix" . "edirect=$version";
   $prefix = "&";
 
@@ -3182,6 +3224,7 @@ sub elink {
     "batch" => \$batch,
     "holding=s" => \$holding,
     "label=s" => \$lbl,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -3524,6 +3567,7 @@ sub entfy {
 
   MyGetOptions(
     $ntfy_help,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -3695,6 +3739,7 @@ sub epost {
     "format=s" => \$field,
     "input=s" => \$input,
     "label=s" => \$lbl,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -3958,6 +4003,7 @@ sub espel {
     $spell_help,
     "db=s" => \$db,
     "query=s" => \$query,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -4430,6 +4476,7 @@ sub esrch {
     "split=s" => \$split,
     "merge=s" => \$meadow,
     "pairs=s" => \$pair,
+    "api_key=s" => \$api_key,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
diff --git a/xtract.go b/xtract.go
index 10e3210..2825215 100644
--- a/xtract.go
+++ b/xtract.go
@@ -80,7 +80,7 @@ import (
 
 // VERSION AND HELP MESSAGE TEXT
 
-const xtractVersion = "7.30"
+const xtractVersion = "7.40"
 
 const xtractHelp = `
 Overview
@@ -337,6 +337,7 @@ Local Record Indexing
   -flag       [strict|mixed|none]
   -gzip       Use compression for local XML files
   -hash       Print UIDs and checksum values to stdout
+  -skip       File of UIDs to skip
 
 Sample File Download
 
@@ -422,7 +423,7 @@ Reconstruct Release Files
 Experimental Postings File Creation
 
   efetch -db pubmed -id 12857958,2981625 -format xml |
-  xtract -e2index |
+  xtract -e2index PubmedArticle MedlineCitation/PMID ArticleTitle,AbstractText,Keyword |
   xtract -pattern IdxDocument -UID IdxUid \
     -block NORM -pfc "\n" -element "&UID",NORM |
   LC_ALL='C' sort -k 2f -k 1n |
@@ -431,51 +432,6 @@ Experimental Postings File Creation
 DISABLE ANTI-VIRUS FILE SCANNING FOR LOCAL ARCHIVES OR MOVE TO TRUSTED FILES
 
 DISABLE SPOTLIGHT INDEXING FOR EXTERNAL DISKS CONTAINING LOCAL ARCHIVES
-
-APFS Disk Creation
-
-  diskutil list
-
-  diskutil apfs createContainer /dev/disk1s2
-  diskutil apfs addVolume disk1s2 APFS myssd
-
-  diskutil mountDisk /dev/disk1
-
-  sudo mdutil -i off /Volumes/myssd
-  sudo mdutil -E /Volumes/myssd
-  sudo rm -rf /Volumes/myssd/.Spotlight*
-  sudo rm -rf /Volumes/myssd/.fseventsd
-
-  touch /Volumes/myssd/.metadata_never_index
-  chmod 444 /Volumes/myssd/.metadata_never_index
-  mkdir /Volumes/myssd/.fseventsd
-  touch /Volumes/myssd/.fseventsd/no_log
-
-  Apple->System Preferences
-    Spotlight
-      Privacy
-        Add: /Volumes/myssd
-
-FAT Disk Creation
-
-  diskutil eraseDisk FAT32 BACKUP /dev/disk1
-
-Ramdisk Creation
-
-  RAMDISK_SIZE_GB=4
-  RAMDISK_SECTORS=$((2097152 * $RAMDISK_SIZE_GB))
-  DISK_ID=$(hdiutil attach -nomount ram://$RAMDISK_SECTORS)
-  echo "Disk ID is :" $DISK_ID
-  diskutil erasevolume HFS+ myssd ${DISK_ID}
-
-Ramdisk Deletion
-
-  diskutil list
-
-  umount -f ${DISK_ID}
-  hdiutil detach ${DISK_ID}
-
-  (OR EJECT BY DRAGGING DISK IMAGE TO TRASH)
 `
 
 const xtractInternal = `
@@ -2505,11 +2461,6 @@ func TrimPunctuation(str string) string {
 
 	max := len(str)
 
-	hasLeftP := strings.Contains(str, "(")
-	hasRightP := strings.Contains(str, ")")
-	hasLeftB := strings.Contains(str, "[")
-	hasRightB := strings.Contains(str, "]")
-
 	doOneTrim := func() {
 
 		if max > 0 {
@@ -2558,6 +2509,9 @@ func TrimPunctuation(str string) string {
 			max -= 2
 		}
 
+		hasLeftP := strings.Contains(str, "(")
+		hasRightP := strings.Contains(str, ")")
+
 		if max > 1 && str[0] == '(' && str[1] == '(' && !hasRightP {
 			// trim leading double parentheses
 			str = str[2:]
@@ -2582,6 +2536,9 @@ func TrimPunctuation(str string) string {
 			max--
 		}
 
+		hasLeftB := strings.Contains(str, "[")
+		hasRightB := strings.Contains(str, "]")
+
 		if max > 0 && str[0] == '[' && !hasRightB {
 			// trim isolated left bracket
 			str = str[1:]
@@ -3272,7 +3229,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 	// parseCommands does initial parsing of exploration command structure
 	parseCommands = func(parent *Block, startLevel LevelType) {
 
-		// function to find next highest level exploration argument
+		// find next highest level exploration argument
 		findNextLevel := func(args []string, level LevelType) (LevelType, string, string) {
 
 			if len(args) > 1 {
@@ -3309,7 +3266,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 			return
 		}
 
-		// function to group arguments at a given exploration level
+		// group arguments at a given exploration level
 		subsetCommands := func(args []string) *Block {
 
 			max := len(args)
@@ -3404,7 +3361,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 
 		status := UNSET
 
-		// function to parse conditional clause into execution step
+		// parse conditional clause into execution step
 		parseStep := func(op *Operation, elementColonValue bool) {
 
 			if op == nil {
@@ -3624,7 +3581,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 
 		status := UNSET
 
-		// function to parse next argument
+		// parse next argument
 		nextStatus := func(str string) OpType {
 
 			status = ParseFlag(str)
@@ -3655,7 +3612,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 			return status
 		}
 
-		// function to parse extraction clause into individual steps
+		// parse extraction clause into individual steps
 		parseSteps := func(op *Operation, pttrn string) {
 
 			if op == nil {
@@ -4124,7 +4081,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, st
 		CharSkip  [256]int
 	}
 
-	// function to initialize <pattern> to </pattern> scanner
+	// initialize <pattern> to </pattern> scanner
 	newScanner := func(pattern string) *Scanner {
 
 		if pattern == "" {
@@ -4151,7 +4108,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, st
 		return scr
 	}
 
-	// function check surroundings of match candidate
+	// check surroundings of match candidate
 	isAnElement := func(text string, lf, rt, mx int) bool {
 
 		if (lf >= 0 && text[lf] == '<') || (lf > 0 && text[lf] == '/' && text[lf-1] == '<') {
@@ -4218,7 +4175,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, st
 		STOPPATTERN
 	)
 
-	// function to find next element with pattern name
+	// find next element with pattern name
 	nextPattern := func(scr *Scanner, text string, pos int) (PatternType, int, int) {
 
 		if scr == nil || text == "" {
@@ -4364,7 +4321,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, st
 			return
 		}
 
-		// function to find next element in XML
+		// find next element in XML
 		nextElement := func(text string, pos int) string {
 
 			txtlen := len(text)
@@ -4525,7 +4482,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 	plainText := (!tbls.DeGloss && !tbls.DoMixed)
 
-	// function to get next XML token
+	// get next XML token
 	nextToken := func(idx int) (TagType, string, string, int, int) {
 
 		if Text == "" {
@@ -5587,7 +5544,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		justStartName := ""
 		justStartIndent := 0
 
-		// function to indent a specified number of spaces
+		// indent a specified number of spaces
 		doIndent := func(indt int) {
 			if compRecrd || flushLeft {
 				return
@@ -5603,7 +5560,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 			buffer.WriteString(indentSpaces[i])
 		}
 
-		// function to handle delayed start tag
+		// handle delayed start tag
 		doDelayedName := func() {
 			if needsRightBracket != "" {
 				buffer.WriteString(">")
@@ -5620,7 +5577,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 		closingTag := ""
 
-		// function to print attributes
+		// print attributes
 		printAttributes := func(attr string) {
 
 			attr = strings.TrimSpace(attr)
@@ -6598,29 +6555,56 @@ func ProcessHydra(isPipe bool) []string {
 // ENTREZ2INDEX COMMAND GENERATOR
 
 // ProcessE2Index generates extraction commands to create input for Entrez2Index (undocumented)
-func ProcessE2Index(isPipe bool) []string {
+func ProcessE2Index(args []string, isPipe bool) []string {
 
 	var acc []string
 
+	max := len(args)
+	if max < 3 {
+		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -e2index\n")
+		os.Exit(1)
+	}
+
+	patrn := args[0]
+	ident := args[1]
+
+	args = args[2:]
+
 	if isPipe {
 		acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
 		acc = append(acc, "-hd", "  <IdxDocument>\\n", "-tl", "  </IdxDocument>")
-		acc = append(acc, "-pattern", "PubmedArticle")
+		acc = append(acc, "-pattern")
+		ql := fmt.Sprintf("\"%s\"", patrn)
+		acc = append(acc, ql)
 		acc = append(acc, "-pfx", "    <IdxUid>", "-sfx", "</IdxUid>\\n")
-		acc = append(acc, "-element", "MedlineCitation/PMID")
+		acc = append(acc, "-element")
+		ql = fmt.Sprintf("\"%s\"", ident)
+		acc = append(acc, ql)
 		acc = append(acc, "-clr", "-rst", "-tab", "")
 		acc = append(acc, "-lbl", "    <IdxSearchFields>\\n")
-		acc = append(acc, "-indices", "ArticleTitle,AbstractText,Keyword")
+		acc = append(acc, "-indices")
+		for _, str := range args {
+			ql = fmt.Sprintf("\"%s\"", str)
+			acc = append(acc, ql)
+		}
 		acc = append(acc, "-clr", "-lbl", "    </IdxSearchFields>\\n")
 	} else {
 		acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
 		acc = append(acc, "-hd", "\"  <IdxDocument>\\n\"", "-tl", "\"  </IdxDocument>\"")
-		acc = append(acc, "-pattern", "PubmedArticle")
+		acc = append(acc, "-pattern")
+		ql := fmt.Sprintf("\"%s\"", patrn)
+		acc = append(acc, ql)
 		acc = append(acc, "-pfx", "\"    <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
-		acc = append(acc, "-element", "MedlineCitation/PMID")
+		acc = append(acc, "-element")
+		ql = fmt.Sprintf("\"%s\"", ident)
+		acc = append(acc, ql)
 		acc = append(acc, "-clr", "-rst", "-tab", "\"\"")
 		acc = append(acc, "-lbl", "\"    <IdxSearchFields>\\n\"")
-		acc = append(acc, "-indices", "ArticleTitle,AbstractText,Keyword")
+		acc = append(acc, "-indices")
+		for _, str := range args {
+			ql = fmt.Sprintf("\"%s\"", str)
+			acc = append(acc, ql)
+		}
 		acc = append(acc, "-clr", "-lbl", "\"    </IdxSearchFields>\\n\"")
 	}
 
@@ -6811,7 +6795,7 @@ func PrintSubtree(node *Node, style IndentType, printAttrs bool, proc func(strin
 		"                  ",
 	}
 
-	// function to indent a specified number of spaces
+	// indent a specified number of spaces
 	doIndent := func(indt int) {
 		i := indt
 		for i > 9 {
@@ -7631,7 +7615,7 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *Node, mask string, in
 	isMatch := false
 	isAvoid := false
 
-	// function to test string or numeric constraints
+	// test string or numeric constraints
 	testConstraint := func(str string, constraint *Step) bool {
 
 		if str == "" || constraint == nil {
@@ -8077,7 +8061,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 	FarmMax := tbls.FarmSize
 	FarmItems := make([]Node, FarmMax)
 
-	// function to allocate multiple nodes in a large array for memory management efficiency
+	// allocate multiple nodes in a large array for memory management efficiency
 	nextNode := func(strt, attr, prnt string) *Node {
 
 		// if farm array slots used up, allocate new array
@@ -8108,7 +8092,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 
 	plainText := (!tbls.DeGloss && !tbls.DoMixed)
 
-	// function to get next XML token
+	// get next XML token
 	nextToken := func(idx int) (TagType, string, string, int) {
 
 		// lookup table array pointers
@@ -8575,7 +8559,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 	return ""
 }
 
-// FUNCTION TO CONVERT IDENTIFIER TO DIRECTORY PATH FOR LOCAL FILE ARCHIVE
+// CONVERT IDENTIFIER TO DIRECTORY PATH FOR LOCAL FILE ARCHIVE
 
 // MakeArchiveTrie allows a short prefix of letters with an optional underscore, and splits the remainder into character pairs
 func MakeArchiveTrie(str string, arry [132]rune) string {
@@ -8643,7 +8627,7 @@ func MakeArchiveTrie(str string, arry [132]rune) string {
 	return strings.ToUpper(string(arry[:i]))
 }
 
-// FUNCTION TO CONVERT TERM TO DIRECTORY PATH FOR POSTINGS FILE STORAGE
+// CONVERT TERM TO DIRECTORY PATH FOR POSTINGS FILE STORAGE
 
 // MakePostingsTrie splits a string into characters, separated by path delimiting slashes
 func MakePostingsTrie(str string, arry [516]rune) string {
@@ -9014,6 +8998,72 @@ func CreateUniquer(tbls *Tables, inp <-chan Extract) <-chan Extract {
 	return out
 }
 
+// CreateDeleter drops records whose identifier is listed in the -skip file
+func CreateDeleter(tbls *Tables, dltd string, inp <-chan Extract) <-chan Extract {
+	if tbls == nil || inp == nil {
+		return nil
+	}
+
+	// make never returns nil, so the new channel needs no check
+	out := make(chan Extract, tbls.ChanDepth)
+	// set of UIDs to skip, one identifier per line of the skip file
+	shouldSkip := make(map[string]bool)
+
+	// an empty name or "-" leaves checkMap false, so every record passes through
+	checkMap := false
+
+	if dltd != "" && dltd != "-" {
+		fmt.Fprintf(os.Stderr, "\nEnter CreateDeleter Scanner\n")
+		checkMap = true
+
+		skipFile, err := os.Open(dltd)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unable to read skip file\n")
+			os.Exit(1)
+		}
+
+		scanr := bufio.NewScanner(skipFile)
+
+		for scanr.Scan() {
+			// read lines of identifiers, ignoring surrounding white space
+			id := strings.TrimSpace(scanr.Text())
+			// a blank line must not cause records with an empty identifier to be dropped
+			if id != "" {
+				shouldSkip[id] = true
+			}
+		}
+
+		// a read failure would otherwise silently truncate the exclusion list
+		err = scanr.Err()
+		skipFile.Close()
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unable to read skip file\n")
+			os.Exit(1)
+		}
+		fmt.Fprintf(os.Stderr, "\nLeave CreateDeleter Scanner\n")
+	}
+
+	// xmlDeleter removes records listed as deleted
+	xmlDeleter := func(inp <-chan Extract, out chan<- Extract) {
+		// close channel when all records have been processed
+		defer close(out)
+
+		for curr := range inp {
+			// check if identifier was deleted
+			if checkMap && shouldSkip[curr.Ident] {
+				continue
+			}
+			// send to output channel
+			out <- curr
+		}
+	}
+
+	// launch single deleter goroutine
+	go xmlDeleter(inp, out)
+
+	return out
+}
+
 func CreateStashers(tbls *Tables, inp <-chan Extract) <-chan string {
 
 	if tbls == nil || inp == nil {
@@ -9611,6 +9661,9 @@ func main() {
 	// path for local data indexed as trie
 	stsh := ""
 
+	// file of UIDs to skip
+	dltd := ""
+
 	// path for postings files indexed as trie
 	pstg := ""
 
@@ -9634,7 +9687,7 @@ func main() {
 	// repeat the specified extraction 5 times for each -proc from 1 to nCPU
 	trial := false
 
-	// function to get numeric value
+	// get numeric value
 	getNumericArg := func(name string, zer, min, max int) int {
 
 		if len(args) < 2 {
@@ -9718,6 +9771,15 @@ func main() {
 			stsh = args[1]
 			// skip past first of two arguments
 			args = args[1:]
+		// UIDs to ignore
+		case "-skip":
+			if len(args) < 2 {
+				fmt.Fprintf(os.Stderr, "\nERROR: Skip file is missing\n")
+				os.Exit(1)
+			}
+			dltd = args[1]
+			// skip past first of two arguments
+			args = args[1:]
 		// local directory path for postings files (undocumented)
 		case "-posting", "-postings":
 			if len(args) < 2 {
@@ -10139,7 +10201,9 @@ func main() {
 	// -e2index shortcut for experimental indexing code (undocumented)
 	if args[0] == "-e2index" {
 
-		res := ProcessE2Index(isPipe || usingFile)
+		args = args[1:]
+
+		res := ProcessE2Index(args, isPipe || usingFile)
 
 		if !isPipe && !usingFile {
 			// no piped input, so write output instructions
@@ -10188,14 +10252,41 @@ func main() {
 		defer pprof.StopCPUProfile()
 	}
 
+	// SPECIAL FORMATTING COMMANDS
+
+	inSwitch = true
+	action := NOPROCESS
+
+	switch args[0] {
+	case "-format":
+		action = DOFORMAT
+	case "-outline":
+		action = DOOUTLINE
+	case "-synopsis":
+		action = DOSYNOPSIS
+	case "-verify", "-validate":
+		action = DOVERIFY
+	case "-filter":
+		action = DOFILTER
+	default:
+		// if not any of the formatting commands, keep going
+		inSwitch = false
+	}
+
+	if inSwitch {
+		ProcessXMLStream(rdr, tbls, args, action)
+		return
+	}
+
 	// INITIALIZE PROCESS TIMER AND RECORD COUNT
 
 	startTime := time.Now()
 	recordCount := 0
 	byteCount := 0
 
-	// function to print processing rate and program duration
+	// print processing rate and program duration
 	printDuration := func(name string) {
+
 		stopTime := time.Now()
 		duration := stopTime.Sub(startTime)
 		seconds := float64(duration.Nanoseconds()) / 1e9
@@ -10205,6 +10296,7 @@ func main() {
 		} else {
 			fmt.Fprintf(os.Stderr, "\nXtract processed %d %s in %.3f seconds", recordCount, name, seconds)
 		}
+
 		if seconds >= 0.001 && recordCount > 0 {
 			rate := int(float64(recordCount) / seconds)
 			if rate >= 1000000 {
@@ -10224,33 +10316,8 @@ func main() {
 			}
 			fmt.Fprintf(os.Stderr, ")")
 		}
-		fmt.Fprintf(os.Stderr, "\n\n")
-	}
-
-	// SPECIAL FORMATTING COMMANDS
 
-	inSwitch = true
-	action := NOPROCESS
-
-	switch args[0] {
-	case "-format":
-		action = DOFORMAT
-	case "-outline":
-		action = DOOUTLINE
-	case "-synopsis":
-		action = DOSYNOPSIS
-	case "-verify", "-validate":
-		action = DOVERIFY
-	case "-filter":
-		action = DOFILTER
-	default:
-		// if not any of the formatting commands, keep going
-		inSwitch = false
-	}
-
-	if inSwitch {
-		ProcessXMLStream(rdr, tbls, args, action)
-		return
+		fmt.Fprintf(os.Stderr, "\n\n")
 	}
 
 	// SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT OR EACH RECORD
@@ -10427,7 +10494,7 @@ func main() {
 	// -archive without -index retrieves XML files in trie-based directory structure
 	if stsh != "" && indx == "" {
 
-		uidq := CreateUIDReader(in, tbls)
+		uidq := CreateUIDReader(rdr.Reader, tbls)
 		strq := CreateFetchers(tbls, uidq)
 		unsq := CreateUnshuffler(tbls, strq)
 
@@ -10577,7 +10644,7 @@ func main() {
 					return
 				}
 
-				// function to print new or updated XML record
+				// print new or updated XML record
 				printRecord := func(stn string, isNew bool) {
 
 					if stn == "" {
@@ -10691,9 +10758,14 @@ func main() {
 		idnq := CreateExaminers(tbls, parent, xmlq)
 		unsq := CreateUnshuffler(tbls, idnq)
 		unqq := CreateUniquer(tbls, unsq)
-		stsq := CreateStashers(tbls, unqq)
+		delq := unqq
+		if dltd != "" {
+			// only create deleter if -skip argument is present
+			delq = CreateDeleter(tbls, dltd, unqq)
+		}
+		stsq := CreateStashers(tbls, delq)
 
-		if xmlq == nil || idnq == nil || unsq == nil || unqq == nil || stsq == nil {
+		if xmlq == nil || idnq == nil || unsq == nil || unqq == nil || delq == nil || stsq == nil {
 			fmt.Fprintf(os.Stderr, "\nERROR: Unable to create stash generator\n")
 			os.Exit(1)
 		}
author	Aaron M. Ucko <ucko@debian.org>	2017-10-06 17:43:37 -0400
committer	Aaron M. Ucko <ucko@debian.org>	2017-10-06 17:44:16 -0400
commit	0d510bdeee8747dd089294a7210944e8c236cdad (patch)
tree	fa4d2c46f995c8f7eaab9a3c6428710e8fc488fb
parent	91f37314681b612fec14016b237f144f67d81a49 (diff)
parent	a0990267f337037396f8665df411d0b8bc641a66 (diff)