diff options
author | Aaron M. Ucko <ucko@debian.org> | 2017-10-05 20:46:06 -0400 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2017-10-05 20:46:51 -0400 |
commit | 127c4b9dbc46d4da613a7b121d431162238262b6 (patch) | |
tree | 8a35b0e32f3e3a9369ed65ef6eab7edb2a74b07a | |
parent | b623e1228d3d6186cb7e588b77852c991e449af4 (diff) | |
parent | cbf48d042bde0f7a63d81f49f615d66661f7770b (diff) |
Merge tag 'upstream/7.00.20170714+ds'
Upstream version 7.00.20170714(+ds).
-rw-r--r-- | debian/changelog | 4 | ||||
-rwxr-xr-x | edirect.pl | 50 | ||||
-rw-r--r-- | xtract.go | 221 |
3 files changed, 212 insertions, 63 deletions
diff --git a/debian/changelog b/debian/changelog index eac0925..55e69e6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,9 +1,9 @@ -ncbi-entrez-direct (7.00.20170710+ds-1) UNRELEASED; urgency=medium +ncbi-entrez-direct (7.00.20170714+ds-1) UNRELEASED; urgency=medium * New upstream release. (NOT YET RELEASED.) * debian/man/{efilter,esearch}.1: Update accordingly. - -- Aaron M. Ucko <ucko@debian.org> Thu, 05 Oct 2017 20:17:36 -0400 + -- Aaron M. Ucko <ucko@debian.org> Thu, 05 Oct 2017 20:46:06 -0400 ncbi-entrez-direct (6.90.20170705+ds-2) unstable; urgency=medium @@ -162,6 +162,7 @@ sub clearflags { $pipe = false; $pub = ""; $query = ""; + $raw = false; $related = false; $rldate = 0; $seq_start = 0; @@ -1784,7 +1785,9 @@ sub esmry { Encode::_utf8_on($data); - $data = fix_bad_encoding($dbase, $data); + if (! $raw) { + $data = fix_bad_encoding($dbase, $data); + } # remove eSummaryResult wrapper $data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g; @@ -1902,7 +1905,9 @@ sub esmry { Encode::_utf8_on($data); - $data = fix_bad_encoding($dbase, $data); + if (! $raw) { + $data = fix_bad_encoding($dbase, $data); + } # remove eSummaryResult wrapper $data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g; @@ -2125,6 +2130,7 @@ sub eftch { "verbose" => \$verbose, "debug" => \$debug, "log" => \$log, + "raw" => \$raw, "http=s" => \$http, "https=s" => \$http, "alias=s" => \$alias, @@ -2426,17 +2432,19 @@ sub eftch { Encode::_utf8_on($$data); - if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) { - $$data = fix_sra_xml_encoding($dbase, $$data); - } + if (! $raw) { + if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_sra_xml_encoding($dbase, $$data); + } - if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { - $$data = fix_pubmed_xml_encoding($dbase, $$data); - } + if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_pubmed_xml_encoding($dbase, $$data); + } - if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { - # remove blank lines in FASTA format - $$data =~ s/\n+/\n/g; + if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { + # remove blank lines in FASTA format + $$data =~ s/\n+/\n/g; + } } print $$data; @@ -2581,17 +2589,19 @@ sub eftch { Encode::_utf8_on($$data); - if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) { - $$data = fix_sra_xml_encoding($dbase, $$data); - } + if (! $raw) { + if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_sra_xml_encoding($dbase, $$data); + } - if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { - $$data = fix_pubmed_xml_encoding($dbase, $$data); - } + if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_pubmed_xml_encoding($dbase, $$data); + } - if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { - # remove blank lines in FASTA format - $$data =~ s/\n+/\n/g; + if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { + # remove blank lines in FASTA format + $$data =~ s/\n+/\n/g; + } } print $$data; @@ -460,7 +460,7 @@ Aspera PubMed Download Aspera PubMed Update - asp-ls pubmed/baseline | + asp-ls pubmed/updatefiles | grep -v ".md5" | grep "xml.gz" | asp-cp pubmed/updatefiles @@ -1864,7 +1864,8 @@ const ( DOTOKEN DOQUERY DOINDEX - DOVALID + DOFINDBAD + DOSAVEBAD ) type SeqEndType int @@ -2886,6 +2887,7 @@ func DoHtmlReplace(str string) string { "&lt;/sup&gt;", "", "&lt;sup/&gt;", "", "&lt;sup /&gt;", "", + "&amp;", "&", ) } @@ -2946,6 +2948,7 @@ func DoHtmlRepair(str string) string { "&lt;/sup&gt;", "</sup>", "&lt;sup/&gt;", "<sup/>", "&lt;sup /&gt;", "<sup/>", + "&amp;", "&", ) } @@ -4908,6 +4911,59 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special blockLine := 0 startLine := 0 + // warn if HTML tags are not well-formed + unbalancedHtml := func(text string) bool { + + var arry []string + + idx := 0 + txtlen := len(text) + + inTag := false + start := 0 + + for idx < txtlen { + ch := text[idx] + if ch == '<' { + if inTag { + return true + } + inTag = true + start = idx + } else if ch == '>' { + if !inTag { + return true + } + inTag = false + curr := text[start+1 : idx] + if strings.HasPrefix(curr, "/") { + curr = curr[1:] + if len(arry) < 1 { + return true + } + prev := arry[len(arry)-1] + if curr != prev { + return true + } + arry = arry[:len(arry)-1] + } else { + arry = append(arry, curr) + } + } + idx++ + } + + if inTag { + return true + } + + if len(arry) > 0 { + return true + } + + return false + } + // verifyLevel recursive definition var verifyLevel func(string, int) @@ -4958,6 +5014,11 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if status != START { fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line) } + if tbls.DeGloss || tbls.DoMixed { + if unbalancedHtml(name) { + fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line) + } + } status = CHAR case CDATATAG, COMMENTTAG: status = OTHER @@ -8417,57 +8478,117 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act } // report records with bad mixed content HTML tags or Unicode superscripts/subscripts - doValid := func() string { + doValid := func(savexml bool) string { id := doIndex() ok := false - badTags := [10]string{ + badSelf := [15]string{ "<i/>", "<i />", + "<i></i>", "<b/>", "<b />", + "<b></b>", "<u/>", "<u />", + "<u></u>", "<sup/>", "<sup />", + "<sup></sup>", "<sub/>", "<sub />", + "<sub></sub>", + } + + badSingle := [15]string{ + "<i>", + "</i>", + "<i/>", + "<b>", + "</b>", + "<b/>", + "<u>", + "</u>", + "<u/>", + "<sub>", + "</sub>", + "<sub/>", + "<sup>", + "</sup>", + "<sup/>", + } + + badDouble := [15]string{ + "&lt;i&gt;", + "&lt;/i&gt;", + "&lt;i/&gt;", + "&lt;b&gt;", + "&lt;/b&gt;", + "&lt;b/&gt;", + "&lt;u&gt;", + "&lt;/u&gt;", + "&lt;u/&gt;", + "&lt;sub&gt;", + "&lt;/sub&gt;", + "&lt;sub/&gt;", + "&lt;sup&gt;", + "&lt;/sup&gt;", + "&lt;sup/&gt;", + } + + badMark := [5]string{ + " Self ", + " < ", + " & ", + " Supsc", + " Subsc", + } + + var badMatch [15]bool + + if strings.Contains(Text[:], "<") { + for _, str := range badSelf { + if strings.Contains(Text[:], str) { + badMatch[0] = true + ok = true + break + } + } } - badMark := [10]string{ - " i/ ", - " i / ", - " b/ ", - " b / ", - " u/ ", - " u / ", - " sup/ ", - " sup /", - " sub/ ", - " sub /", + if strings.Contains(Text[:], "<") { + for _, str := range badSingle { + if strings.Contains(Text[:], str) { + badMatch[1] = true + ok = true + break + } + } } - var badMatch [10]bool - - uniSup := false - uniSub := false - - for i, str := range badTags { - if strings.Contains(Text[:], str) { - badMatch[i] = true - ok = true + if strings.Contains(Text[:], "&lt;") { + for _, str := range badDouble { + if strings.Contains(Text[:], str) { + badMatch[2] = true + ok = true + break + } } } + if ok && savexml { + return Text[:] + } + for _, ch := range Text { if ch > 127 { if (ch >= '\u00B2' && ch <= '\u00B9') || (ch >= '\u2070' && ch <= '\u207F') { - uniSup = true + badMatch[3] = true ok = true } else if ch >= '\u2080' && ch <= '\u208E' { - uniSub = true + badMatch[4] = true ok = true } } @@ -8478,17 +8599,6 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act if ok { txt += id - if uniSup { - txt += " Supsc" - } else { - txt += " " - } - if uniSub { - txt += " Subsc" - } else { - txt += " " - } - for i, found := range badMatch { if found { txt += badMark[i] @@ -8511,8 +8621,10 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act return doQuery() case DOINDEX: return doIndex() - case DOVALID: - return doValid() + case DOFINDBAD: + return doValid(false) + case DOSAVEBAD: + return doValid(true) default: } @@ -9159,6 +9271,7 @@ func main() { // check for empty html tags and unicode superscript/subscripts dmgd := false + cptr := false // phrase to find anywhere in XML phrs := "" @@ -9292,6 +9405,8 @@ func main() { // check for damaged html tags or unicode superscript/subscript characters (undocumented) case "-damaged", "-damage": dmgd = true + case "-capture": + cptr = true case "-missing": msng = true // data cleanup flags @@ -10304,14 +10419,38 @@ func main() { // LOOK FOR IMPROPER HTML TAGS AND UNICODE SUPERSCRIPT/SUBSCRIPT CHARACTERS // -index plus -damaged plus -pattern with no other extraction arguments - // reports empty html tags or Unicode superscript/subscript characters + // reports records with problem html tags or Unicode superscript/subscript characters if indx != "" && dmgd && len(args) == 2 { PartitionPattern(topPattern, star, rdr, func(rec int, ofs int64, str string) { recordCount++ - res := ProcessQuery(str[:], parent, rec, nil, tbls, DOVALID) + res := ProcessQuery(str[:], parent, rec, nil, tbls, DOFINDBAD) + if res == "" { + return + } + + os.Stdout.WriteString(res[:]) + os.Stdout.WriteString("\n") + }) + + if timr { + printDuration("records") + } + + return + } + + // -index plus -capture plus -pattern with no other extraction arguments + // saves records with problem html tags or Unicode superscript/subscript characters + if indx != "" && cptr && len(args) == 2 { + + PartitionPattern(topPattern, star, rdr, + func(rec int, ofs int64, str string) { + recordCount++ + + res := ProcessQuery(str[:], parent, rec, nil, tbls, DOSAVEBAD) if res == "" { return } |