summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Aaron M. Ucko <ucko@debian.org> 2017-10-05 20:46:01 -0400
committer Aaron M. Ucko <ucko@debian.org> 2017-10-05 20:46:01 -0400
commit cbf48d042bde0f7a63d81f49f615d66661f7770b (patch)
tree b4fc012174e4c8f1f33e075e735d68bbb0696027
parent d253448077bb6306232bdaf404df863f5aa42ce8 (diff)
New upstream version 7.00.20170714+ds
-rwxr-xr-x edirect.pl 50
-rw-r--r-- xtract.go 221
2 files changed, 210 insertions, 61 deletions
diff --git a/edirect.pl b/edirect.pl
index a8d1050..b58f33e 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -162,6 +162,7 @@ sub clearflags {
$pipe = false;
$pub = "";
$query = "";
+ $raw = false;
$related = false;
$rldate = 0;
$seq_start = 0;
@@ -1784,7 +1785,9 @@ sub esmry {
Encode::_utf8_on($data);
- $data = fix_bad_encoding($dbase, $data);
+ if (! $raw) {
+ $data = fix_bad_encoding($dbase, $data);
+ }
# remove eSummaryResult wrapper
$data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g;
@@ -1902,7 +1905,9 @@ sub esmry {
Encode::_utf8_on($data);
- $data = fix_bad_encoding($dbase, $data);
+ if (! $raw) {
+ $data = fix_bad_encoding($dbase, $data);
+ }
# remove eSummaryResult wrapper
$data =~ s/<!DOCTYPE eSummaryResult PUBLIC/<!DOCTYPE DocumentSummarySet PUBLIC/g;
@@ -2125,6 +2130,7 @@ sub eftch {
"verbose" => \$verbose,
"debug" => \$debug,
"log" => \$log,
+ "raw" => \$raw,
"http=s" => \$http,
"https=s" => \$http,
"alias=s" => \$alias,
@@ -2426,17 +2432,19 @@ sub eftch {
Encode::_utf8_on($$data);
- if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
- $$data = fix_sra_xml_encoding($dbase, $$data);
- }
+ if (! $raw) {
+ if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_sra_xml_encoding($dbase, $$data);
+ }
- if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
- $$data = fix_pubmed_xml_encoding($dbase, $$data);
- }
+ if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_pubmed_xml_encoding($dbase, $$data);
+ }
- if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
- # remove blank lines in FASTA format
- $$data =~ s/\n+/\n/g;
+ if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
+ # remove blank lines in FASTA format
+ $$data =~ s/\n+/\n/g;
+ }
}
print $$data;
@@ -2581,17 +2589,19 @@ sub eftch {
Encode::_utf8_on($$data);
- if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
- $$data = fix_sra_xml_encoding($dbase, $$data);
- }
+ if (! $raw) {
+ if ( $dbase eq "sra" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_sra_xml_encoding($dbase, $$data);
+ }
- if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
- $$data = fix_pubmed_xml_encoding($dbase, $$data);
- }
+ if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_pubmed_xml_encoding($dbase, $$data);
+ }
- if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
- # remove blank lines in FASTA format
- $$data =~ s/\n+/\n/g;
+ if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
+ # remove blank lines in FASTA format
+ $$data =~ s/\n+/\n/g;
+ }
}
print $$data;
diff --git a/xtract.go b/xtract.go
index 5b1e3df..1d465d0 100644
--- a/xtract.go
+++ b/xtract.go
@@ -460,7 +460,7 @@ Aspera PubMed Download
Aspera PubMed Update
- asp-ls pubmed/baseline |
+ asp-ls pubmed/updatefiles |
grep -v ".md5" | grep "xml.gz" |
asp-cp pubmed/updatefiles
@@ -1864,7 +1864,8 @@ const (
DOTOKEN
DOQUERY
DOINDEX
- DOVALID
+ DOFINDBAD
+ DOSAVEBAD
)
type SeqEndType int
@@ -2886,6 +2887,7 @@ func DoHtmlReplace(str string) string {
"&amp;lt;/sup&amp;gt;", "",
"&amp;lt;sup/&amp;gt;", "",
"&amp;lt;sup /&amp;gt;", "",
+ "&amp;amp;", "&amp;",
)
}
@@ -2946,6 +2948,7 @@ func DoHtmlRepair(str string) string {
"&amp;lt;/sup&amp;gt;", "</sup>",
"&amp;lt;sup/&amp;gt;", "<sup/>",
"&amp;lt;sup /&amp;gt;", "<sup/>",
+ "&amp;amp;", "&amp;",
)
}
@@ -4908,6 +4911,59 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
blockLine := 0
startLine := 0
+ // warn if HTML tags are not well-formed
+ unbalancedHtml := func(text string) bool {
+
+ var arry []string
+
+ idx := 0
+ txtlen := len(text)
+
+ inTag := false
+ start := 0
+
+ for idx < txtlen {
+ ch := text[idx]
+ if ch == '<' {
+ if inTag {
+ return true
+ }
+ inTag = true
+ start = idx
+ } else if ch == '>' {
+ if !inTag {
+ return true
+ }
+ inTag = false
+ curr := text[start+1 : idx]
+ if strings.HasPrefix(curr, "/") {
+ curr = curr[1:]
+ if len(arry) < 1 {
+ return true
+ }
+ prev := arry[len(arry)-1]
+ if curr != prev {
+ return true
+ }
+ arry = arry[:len(arry)-1]
+ } else {
+ arry = append(arry, curr)
+ }
+ }
+ idx++
+ }
+
+ if inTag {
+ return true
+ }
+
+ if len(arry) > 0 {
+ return true
+ }
+
+ return false
+ }
+
// verifyLevel recursive definition
var verifyLevel func(string, int)
@@ -4958,6 +5014,11 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if status != START {
fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line)
}
+ if tbls.DeGloss || tbls.DoMixed {
+ if unbalancedHtml(name) {
+ fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line)
+ }
+ }
status = CHAR
case CDATATAG, COMMENTTAG:
status = OTHER
@@ -8417,57 +8478,117 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
}
// report records with bad mixed content HTML tags or Unicode superscripts/subscripts
- doValid := func() string {
+ doValid := func(savexml bool) string {
id := doIndex()
ok := false
- badTags := [10]string{
+ badSelf := [15]string{
"<i/>",
"<i />",
+ "<i></i>",
"<b/>",
"<b />",
+ "<b></b>",
"<u/>",
"<u />",
+ "<u></u>",
"<sup/>",
"<sup />",
+ "<sup></sup>",
"<sub/>",
"<sub />",
+ "<sub></sub>",
+ }
+
+ badSingle := [15]string{
+ "&lt;i&gt;",
+ "&lt;/i&gt;",
+ "&lt;i/&gt;",
+ "&lt;b&gt;",
+ "&lt;/b&gt;",
+ "&lt;b/&gt;",
+ "&lt;u&gt;",
+ "&lt;/u&gt;",
+ "&lt;u/&gt;",
+ "&lt;sub&gt;",
+ "&lt;/sub&gt;",
+ "&lt;sub/&gt;",
+ "&lt;sup&gt;",
+ "&lt;/sup&gt;",
+ "&lt;sup/&gt;",
+ }
+
+ badDouble := [15]string{
+ "&amp;lt;i&amp;gt;",
+ "&amp;lt;/i&amp;gt;",
+ "&amp;lt;i/&amp;gt;",
+ "&amp;lt;b&amp;gt;",
+ "&amp;lt;/b&amp;gt;",
+ "&amp;lt;b/&amp;gt;",
+ "&amp;lt;u&amp;gt;",
+ "&amp;lt;/u&amp;gt;",
+ "&amp;lt;u/&amp;gt;",
+ "&amp;lt;sub&amp;gt;",
+ "&amp;lt;/sub&amp;gt;",
+ "&amp;lt;sub/&amp;gt;",
+ "&amp;lt;sup&amp;gt;",
+ "&amp;lt;/sup&amp;gt;",
+ "&amp;lt;sup/&amp;gt;",
+ }
+
+ badMark := [5]string{
+ " Self ",
+ " &lt ",
+ " &amp ",
+ " Supsc",
+ " Subsc",
+ }
+
+ var badMatch [15]bool
+
+ if strings.Contains(Text[:], "<") {
+ for _, str := range badSelf {
+ if strings.Contains(Text[:], str) {
+ badMatch[0] = true
+ ok = true
+ break
+ }
+ }
}
- badMark := [10]string{
- " i/ ",
- " i / ",
- " b/ ",
- " b / ",
- " u/ ",
- " u / ",
- " sup/ ",
- " sup /",
- " sub/ ",
- " sub /",
+ if strings.Contains(Text[:], "&lt;") {
+ for _, str := range badSingle {
+ if strings.Contains(Text[:], str) {
+ badMatch[1] = true
+ ok = true
+ break
+ }
+ }
}
- var badMatch [10]bool
-
- uniSup := false
- uniSub := false
-
- for i, str := range badTags {
- if strings.Contains(Text[:], str) {
- badMatch[i] = true
- ok = true
+ if strings.Contains(Text[:], "&amp;lt;") {
+ for _, str := range badDouble {
+ if strings.Contains(Text[:], str) {
+ badMatch[2] = true
+ ok = true
+ break
+ }
}
}
+ if ok && savexml {
+ return Text[:]
+ }
+
for _, ch := range Text {
if ch > 127 {
if (ch >= '\u00B2' && ch <= '\u00B9') || (ch >= '\u2070' && ch <= '\u207F') {
- uniSup = true
+ badMatch[3] = true
ok = true
} else if ch >= '\u2080' && ch <= '\u208E' {
- uniSub = true
+ badMatch[4] = true
ok = true
}
}
@@ -8478,17 +8599,6 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
if ok {
txt += id
- if uniSup {
- txt += " Supsc"
- } else {
- txt += " "
- }
- if uniSub {
- txt += " Subsc"
- } else {
- txt += " "
- }
-
for i, found := range badMatch {
if found {
txt += badMark[i]
@@ -8511,8 +8621,10 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
return doQuery()
case DOINDEX:
return doIndex()
- case DOVALID:
- return doValid()
+ case DOFINDBAD:
+ return doValid(false)
+ case DOSAVEBAD:
+ return doValid(true)
default:
}
@@ -9159,6 +9271,7 @@ func main() {
// check for empty html tags and unicode superscript/subscripts
dmgd := false
+ cptr := false
// phrase to find anywhere in XML
phrs := ""
@@ -9292,6 +9405,8 @@ func main() {
// check for damaged html tags or unicode superscript/subscript characters (undocumented)
case "-damaged", "-damage":
dmgd = true
+ case "-capture":
+ cptr = true
case "-missing":
msng = true
// data cleanup flags
@@ -10304,14 +10419,38 @@ func main() {
// LOOK FOR IMPROPER HTML TAGS AND UNICODE SUPERSCRIPT/SUBSCRIPT CHARACTERS
// -index plus -damaged plus -pattern with no other extraction arguments
- // reports empty html tags or Unicode superscript/subscript characters
+ // reports records with problem html tags or Unicode superscript/subscript characters
if indx != "" && dmgd && len(args) == 2 {
PartitionPattern(topPattern, star, rdr,
func(rec int, ofs int64, str string) {
recordCount++
- res := ProcessQuery(str[:], parent, rec, nil, tbls, DOVALID)
+ res := ProcessQuery(str[:], parent, rec, nil, tbls, DOFINDBAD)
+ if res == "" {
+ return
+ }
+
+ os.Stdout.WriteString(res[:])
+ os.Stdout.WriteString("\n")
+ })
+
+ if timr {
+ printDuration("records")
+ }
+
+ return
+ }
+
+ // -index plus -capture plus -pattern with no other extraction arguments
+ // saves records with problem html tags or Unicode superscript/subscript characters
+ if indx != "" && cptr && len(args) == 2 {
+
+ PartitionPattern(topPattern, star, rdr,
+ func(rec int, ofs int64, str string) {
+ recordCount++
+
+ res := ProcessQuery(str[:], parent, rec, nil, tbls, DOSAVEBAD)
if res == "" {
return
}