New upstream version 7.00.20170714+ds

author: Aaron M. Ucko <ucko@debian.org> 2017-10-05 20:46:01 -0400
committer: Aaron M. Ucko <ucko@debian.org> 2017-10-05 20:46:01 -0400
commit: cbf48d042bde0f7a63d81f49f615d66661f7770b (patch)
tree: b4fc012174e4c8f1f33e075e735d68bbb0696027
parent: d253448077bb6306232bdaf404df863f5aa42ce8 (diff)
2 files changed, 210 insertions, 61 deletions
diff --git a/edirect.pl b/edirect.pl
index a8d1050..b58f33e 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -162,6 +162,7 @@ sub clearflags {
   $pipe = false;
   $pub = "";
   $query = "";
+  $raw = false;
   $related = false;
   $rldate = 0;
   $seq_start = 0;
@@ -1784,7 +1785,9 @@ sub esmry {
 
     Encode::_utf8_on($data);
 
-    $data = fix_bad_encoding($dbase, $data);
+    if (! $raw) {
+      $data = fix_bad_encoding($dbase, $data);
+    }
 
     # remove eSummaryResult wrapper
     $data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g;
@@ -1902,7 +1905,9 @@ sub esmry {
 
       Encode::_utf8_on($data);
 
-      $data = fix_bad_encoding($dbase, $data);
+      if (! $raw) {
+        $data = fix_bad_encoding($dbase, $data);
+      }
 
       # remove eSummaryResult wrapper
       $data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g;
@@ -2125,6 +2130,7 @@ sub eftch {
     "verbose" => \$verbose,
     "debug" => \$debug,
     "log" => \$log,
+    "raw" => \$raw,
     "http=s" => \$http,
     "https=s" => \$http,
     "alias=s" => \$alias,
@@ -2426,17 +2432,19 @@ sub eftch {
 
     Encode::_utf8_on($$data);
 
-    if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
-      $$data = fix_sra_xml_encoding($dbase, $$data);
-    }
+    if (! $raw) {
+      if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
+        $$data = fix_sra_xml_encoding($dbase, $$data);
+      }
 
-    if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
-      $$data = fix_pubmed_xml_encoding($dbase, $$data);
-    }
+      if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+        $$data = fix_pubmed_xml_encoding($dbase, $$data);
+      }
 
-    if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
-      # remove blank lines in FASTA format
-      $$data =~ s/\n+/\n/g;
+      if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
+        # remove blank lines in FASTA format
+        $$data =~ s/\n+/\n/g;
+      }
     }
 
     print $$data;
@@ -2581,17 +2589,19 @@ sub eftch {
 
       Encode::_utf8_on($$data);
 
-      if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
-        $$data = fix_sra_xml_encoding($dbase, $$data);
-      }
+      if (! $raw) {
+        if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
+          $$data = fix_sra_xml_encoding($dbase, $$data);
+        }
 
-      if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
-        $$data = fix_pubmed_xml_encoding($dbase, $$data);
-      }
+        if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+          $$data = fix_pubmed_xml_encoding($dbase, $$data);
+        }
 
-      if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
-        # remove blank lines in FASTA format
-        $$data =~ s/\n+/\n/g;
+        if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
+          # remove blank lines in FASTA format
+          $$data =~ s/\n+/\n/g;
+        }
       }
 
       print $$data;
diff --git a/xtract.go b/xtract.go
index 5b1e3df..1d465d0 100644
--- a/xtract.go
+++ b/xtract.go
@@ -460,7 +460,7 @@ Aspera PubMed Download
 
 Aspera PubMed Update
 
-  asp-ls pubmed/baseline |
+  asp-ls pubmed/updatefiles |
   grep -v ".md5" | grep "xml.gz" |
   asp-cp pubmed/updatefiles
 
@@ -1864,7 +1864,8 @@ const (
 	DOTOKEN
 	DOQUERY
 	DOINDEX
-	DOVALID
+	DOFINDBAD
+	DOSAVEBAD
 )
 
 type SeqEndType int
@@ -2886,6 +2887,7 @@ func DoHtmlReplace(str string) string {
 			"&amp;lt;/sup&amp;gt;", "",
 			"&amp;lt;sup/&amp;gt;", "",
 			"&amp;lt;sup /&amp;gt;", "",
+			"&amp;amp;", "&amp;",
 		)
 	}
 
@@ -2946,6 +2948,7 @@ func DoHtmlRepair(str string) string {
 			"&amp;lt;/sup&amp;gt;", "</sup>",
 			"&amp;lt;sup/&amp;gt;", "<sup/>",
 			"&amp;lt;sup /&amp;gt;", "<sup/>",
+			"&amp;amp;", "&amp;",
 		)
 	}
 
@@ -4908,6 +4911,59 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		blockLine := 0
 		startLine := 0
 
+		// warn if HTML tags are not well-formed
+		unbalancedHtml := func(text string) bool {
+
+			var arry []string
+
+			idx := 0
+			txtlen := len(text)
+
+			inTag := false
+			start := 0
+
+			for idx < txtlen {
+				ch := text[idx]
+				if ch == '<' {
+					if inTag {
+						return true
+					}
+					inTag = true
+					start = idx
+				} else if ch == '>' {
+					if !inTag {
+						return true
+					}
+					inTag = false
+					curr := text[start+1 : idx]
+					if strings.HasPrefix(curr, "/") {
+						curr = curr[1:]
+						if len(arry) < 1 {
+							return true
+						}
+						prev := arry[len(arry)-1]
+						if curr != prev {
+							return true
+						}
+						arry = arry[:len(arry)-1]
+					} else {
+						arry = append(arry, curr)
+					}
+				}
+				idx++
+			}
+
+			if inTag {
+				return true
+			}
+
+			if len(arry) > 0 {
+				return true
+			}
+
+			return false
+		}
+
 		// verifyLevel recursive definition
 		var verifyLevel func(string, int)
 
@@ -4958,6 +5014,11 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					if status != START {
 						fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line)
 					}
+					if tbls.DeGloss || tbls.DoMixed {
+						if unbalancedHtml(name) {
+							fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line)
+						}
+					}
 					status = CHAR
 				case CDATATAG, COMMENTTAG:
 					status = OTHER
@@ -8417,57 +8478,117 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 	}
 
 	// report records with bad mixed content HTML tags or Unicode superscripts/subscripts
-	doValid := func() string {
+	doValid := func(savexml bool) string {
 
 		id := doIndex()
 
 		ok := false
 
-		badTags := [10]string{
+		badSelf := [15]string{
 			"<i/>",
 			"<i />",
+			"<i></i>",
 			"<b/>",
 			"<b />",
+			"<b></b>",
 			"<u/>",
 			"<u />",
+			"<u></u>",
 			"<sup/>",
 			"<sup />",
+			"<sup></sup>",
 			"<sub/>",
 			"<sub />",
+			"<sub></sub>",
+		}
+
+		badSingle := [15]string{
+			"&lt;i&gt;",
+			"&lt;/i&gt;",
+			"&lt;i/&gt;",
+			"&lt;b&gt;",
+			"&lt;/b&gt;",
+			"&lt;b/&gt;",
+			"&lt;u&gt;",
+			"&lt;/u&gt;",
+			"&lt;u/&gt;",
+			"&lt;sub&gt;",
+			"&lt;/sub&gt;",
+			"&lt;sub/&gt;",
+			"&lt;sup&gt;",
+			"&lt;/sup&gt;",
+			"&lt;sup/&gt;",
+		}
+
+		badDouble := [15]string{
+			"&amp;lt;i&amp;gt;",
+			"&amp;lt;/i&amp;gt;",
+			"&amp;lt;i/&amp;gt;",
+			"&amp;lt;b&amp;gt;",
+			"&amp;lt;/b&amp;gt;",
+			"&amp;lt;b/&amp;gt;",
+			"&amp;lt;u&amp;gt;",
+			"&amp;lt;/u&amp;gt;",
+			"&amp;lt;u/&amp;gt;",
+			"&amp;lt;sub&amp;gt;",
+			"&amp;lt;/sub&amp;gt;",
+			"&amp;lt;sub/&amp;gt;",
+			"&amp;lt;sup&amp;gt;",
+			"&amp;lt;/sup&amp;gt;",
+			"&amp;lt;sup/&amp;gt;",
+		}
+
+		badMark := [5]string{
+			"  Self ",
+			"  &lt  ",
+			"  &amp ",
+			"  Supsc",
+			"  Subsc",
+		}
+
+		var badMatch [15]bool
+
+		if strings.Contains(Text[:], "<") {
+			for _, str := range badSelf {
+				if strings.Contains(Text[:], str) {
+					badMatch[0] = true
+					ok = true
+					break
+				}
+			}
 		}
 
-		badMark := [10]string{
-			"  i/   ",
-			"  i /  ",
-			"  b/   ",
-			"  b /  ",
-			"  u/   ",
-			"  u /  ",
-			"  sup/ ",
-			"  sup /",
-			"  sub/ ",
-			"  sub /",
+		if strings.Contains(Text[:], "&lt;") {
+			for _, str := range badSingle {
+				if strings.Contains(Text[:], str) {
+					badMatch[1] = true
+					ok = true
+					break
+				}
+			}
 		}
 
-		var badMatch [10]bool
-
-		uniSup := false
-		uniSub := false
-
-		for i, str := range badTags {
-			if strings.Contains(Text[:], str) {
-				badMatch[i] = true
-				ok = true
+		if strings.Contains(Text[:], "&amp;lt;") {
+			for _, str := range badDouble {
+				if strings.Contains(Text[:], str) {
+					badMatch[2] = true
+					ok = true
+					break
+				}
 			}
 		}
 
+		if ok && savexml {
+			return Text[:]
+		}
+
 		for _, ch := range Text {
 			if ch > 127 {
 				if (ch >= '\u00B2' && ch <= '\u00B9') || (ch >= '\u2070' && ch <= '\u207F') {
-					uniSup = true
+					badMatch[3] = true
 					ok = true
 				} else if ch >= '\u2080' && ch <= '\u208E' {
-					uniSub = true
+					badMatch[4] = true
 					ok = true
 				}
 			}
@@ -8478,17 +8599,6 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 		if ok {
 			txt += id
 
-			if uniSup {
-				txt += "  Supsc"
-			} else {
-				txt += "       "
-			}
-			if uniSub {
-				txt += "  Subsc"
-			} else {
-				txt += "       "
-			}
-
 			for i, found := range badMatch {
 				if found {
 					txt += badMark[i]
@@ -8511,8 +8621,10 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 		return doQuery()
 	case DOINDEX:
 		return doIndex()
-	case DOVALID:
-		return doValid()
+	case DOFINDBAD:
+		return doValid(false)
+	case DOSAVEBAD:
+		return doValid(true)
 	default:
 	}
 
@@ -9159,6 +9271,7 @@ func main() {
 
 	// check for empty html tags and unicode superscript/subscripts
 	dmgd := false
+	cptr := false
 
 	// phrase to find anywhere in XML
 	phrs := ""
@@ -9292,6 +9405,8 @@ func main() {
 		// check for damaged html tags or unicode superscript/subscript characters (undocumented)
 		case "-damaged", "-damage":
 			dmgd = true
+		case "-capture":
+			cptr = true
 		case "-missing":
 			msng = true
 		// data cleanup flags
@@ -10304,14 +10419,38 @@ func main() {
 	// LOOK FOR IMPROPER HTML TAGS AND UNICODE SUPERSCRIPT/SUBSCRIPT CHARACTERS
 
 	// -index plus -damaged plus -pattern with no other extraction arguments
-	// reports empty html tags or Unicode superscript/subscript characters
+	// reports records with problem html tags or Unicode superscript/subscript characters
 	if indx != "" && dmgd && len(args) == 2 {
 
 		PartitionPattern(topPattern, star, rdr,
 			func(rec int, ofs int64, str string) {
 				recordCount++
 
-				res := ProcessQuery(str[:], parent, rec, nil, tbls, DOVALID)
+				res := ProcessQuery(str[:], parent, rec, nil, tbls, DOFINDBAD)
+				if res == "" {
+					return
+				}
+
+				os.Stdout.WriteString(res[:])
+				os.Stdout.WriteString("\n")
+			})
+
+		if timr {
+			printDuration("records")
+		}
+
+		return
+	}
+
+	// -index plus -capture plus -pattern with no other extraction arguments
+	// saves records with problem html tags or Unicode superscript/subscript characters
+	if indx != "" && cptr && len(args) == 2 {
+
+		PartitionPattern(topPattern, star, rdr,
+			func(rec int, ofs int64, str string) {
+				recordCount++
+
+				res := ProcessQuery(str[:], parent, rec, nil, tbls, DOSAVEBAD)
 				if res == "" {
 					return
 				}
author	Aaron M. Ucko <ucko@debian.org>	2017-10-05 20:46:01 -0400
committer	Aaron M. Ucko <ucko@debian.org>	2017-10-05 20:46:01 -0400
commit	cbf48d042bde0f7a63d81f49f615d66661f7770b (patch)
tree	b4fc012174e4c8f1f33e075e735d68bbb0696027
parent	d253448077bb6306232bdaf404df863f5aa42ce8 (diff)