diff options
author | Aaron M. Ucko <ucko@debian.org> | 2017-10-05 17:51:06 -0400 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2017-10-05 17:53:49 -0400 |
commit | 5b42e03d8658c26a715e088e82f3379b145b3ba2 (patch) | |
tree | 79475ee9c43ce86a8f730bd3f648e27e38ca9c0f | |
parent | 386713d0283ba164bfe832eca13e0409cb1349c1 (diff) | |
parent | d253448077bb6306232bdaf404df863f5aa42ce8 (diff) |
Merge tag 'upstream/7.00.20170710+ds'
Upstream version 7.00.20170710(+ds).
-rw-r--r-- | debian/changelog | 6 | ||||
-rwxr-xr-x | edirect.pl | 162 | ||||
-rw-r--r-- | xtract.go | 192 |
3 files changed, 319 insertions, 41 deletions
diff --git a/debian/changelog b/debian/changelog index 956a4bf..21d4a42 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ncbi-entrez-direct (7.00.20170710+ds-1) UNRELEASED; urgency=medium + + * New upstream release. (NOT YET RELEASED.) + + -- Aaron M. Ucko <ucko@debian.org> Thu, 05 Oct 2017 17:51:06 -0400 + ncbi-entrez-direct (6.90.20170705+ds-2) unstable; urgency=medium * debian/rules: Rework Go-related logic to accommodate builds on @@ -87,7 +87,7 @@ use constant true => 1; # EDirect version number -$version = "6.90"; +$version = "7.00"; # URL address components @@ -171,6 +171,7 @@ sub clearflags { $sort = ""; $source = ""; $spell = false; + $split = ""; $status = ""; $stp = ""; $strand = ""; @@ -184,6 +185,17 @@ sub clearflags { $web = ""; $word = false; $year = ""; + + $stop_words="#a#about#again#all#almost#also#although#always#among#an#and#" . + "another#any#are#as#at#be#because#been#before#being#between#both#but#by#can#" . + "could#did#do#does#done#due#during#each#either#enough#especially#etc#for#" . + "found#from#further#had#has#have#having#here#how#however#i#if#in#into#is#it#" . + "its#itself#just#kg#km#made#mainly#make#may#mg#might#ml#mm#most#mostly#" . + "must#nearly#neither#no#nor#obtained#of#often#on#our#overall#perhaps#pmid#" . + "quite#rather#really#regarding#seem#seen#several#should#show#showed#shown#" . + "shows#significantly#since#so#some#such#than#that#the#their#theirs#them#" . + "then#there#therefore#these#they#this#those#through#thus#to#upon#use#used#" . + "using#various#very#was#we#were#what#when#which#while#with#within#without#would#"; } # gets a live UID for any database @@ -1018,6 +1030,11 @@ Date Constraint -mindate Start of date range -maxdate End of date range +Limit by Field + + -field Query words individually in field + -pairs Query overlapping word pairs + Spell Check -spell Correct misspellings in query @@ -1353,7 +1370,9 @@ sub efilt { "maxdate=s" => \$mxdate, "datetype=s" => \$dttype, "label=s" => \$lbl, + "field=s" => \$field, "spell" => \$spell, + "pairs=s" => \$pair, "pub=s" => \$pub, "feature=s" => \$feature, "location=s" => \$location, @@ -1448,6 +1467,25 @@ sub efilt { test_edirect ( $dbase, $web, $key, $num, "filter" ); + # -field combines -drop and -split (-field TITL produces same behavior as Web PubMed) + if ( $field ne "" ) { + $query = remove_stop_words ($query); + $query = field_each_word ($field, $query); + } + + # -pairs separately fields query word pairs, breaking chain at stop words + if ( $pair ne "" ) { + $query = remove_punctuation ($query); + if ( $query =~ /^ +(.+)$/ ) { + $query = $1; + } + if ( $query =~ /^(.+) +$/ ) { + $query = $1; + } + $query =~ s/ +/ /g; + $query = field_each_pair ($pair, $query); + } + # spell check each query word if ( $spell ) { $query = spell_check_query ($dbase, $query); @@ -1488,6 +1526,7 @@ sub efilt { $key = ""; $num = ""; $err = ""; + my $trn = ""; $output = ""; @@ -1499,6 +1538,7 @@ sub efilt { $web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/); $key = $1 if ($output =~ /<QueryKey>(\S+)<\/QueryKey>/); $num = $1 if ($output =~ /<Count>(\S+)<\/Count>/); + $trn = $1 if ($output =~ /<QueryTranslation>(.+?)<\/QueryTranslation>/i); } else { if ( ! $silent ) { print STDERR "Retrying efilter, step $stp: $err\n"; @@ -1539,10 +1579,46 @@ sub efilt { } write_edirect ( $dbase, $web, $key, $num, $stp, $err, $tool, $email ); + + if ( $log ) { + if ( $trn ne "" ) { + print STDERR "$trn\n"; + } + } } # efetch -format docsum calls esmry to retrieve document summaries +sub fix_mixed_content { + + my $x = shift (@_); + + while ( $x =~ /\&\;/ || $x =~ /\<\;/ || $x =~ /\>\;/ ) { + HTML::Entities::decode_entities($x); + } + # removed mixed content tags + $x =~ s|<b>||g; + $x =~ s|<i>||g; + $x =~ s|<u>||g; + $x =~ s|<sup>||g; + $x =~ s|<sub>||g; + $x =~ s|</b>||g; + $x =~ s|</i>||g; + $x =~ s|</u>||g; + $x =~ s|</sup>||g; + $x =~ s|</sub>||g; + $x =~ s|<b/>||g; + $x =~ s|<i/>||g; + $x =~ s|<u/>||g; + $x =~ s|<sup/>||g; + $x =~ s|<sub/>||g; + # Reencode any resulting less-than or greater-than entities to avoid breaking the XML. + $x =~ s/</</g; + $x =~ s/>/>/g; + + return $x; +} + my %fields_to_fix = ( 'biosample' => ['SampleData'], 'medgen' => ['ConceptMeta'], @@ -1554,7 +1630,15 @@ sub fix_one_encoding { my $dbase = shift (@_); my $data = shift (@_); - if ( $dbase eq "gene" ) { + if ( $dbase eq "pubmed" ) { + if ( $data =~ /<Title>(.+?)<\/Title>/ ) { + my $x = $1; + if ( $x =~ /\&\;/ || $x =~ /\<\;/ || $x =~ /\>\;/ ) { + $x = fix_mixed_content($x); + $data =~ s/<Title>(.+?)<\/Title>/<Title>$x<\/Title>/; + } + } + } elsif ( $dbase eq "gene" ) { if ( $data =~ /<Summary>(.+?)<\/Summary>/ ) { my $x = $1; if ( $x =~ /\&\;/ ) { @@ -1598,7 +1682,7 @@ sub fix_bad_encoding { my $dbase = shift (@_); my $data = shift (@_); - if ( $dbase eq "gene" || $dbase eq "assembly" || defined $fields_to_fix{$dbase} ) { + if ( $dbase eq "pubmed" || $dbase eq "gene" || $dbase eq "assembly" || defined $fields_to_fix{$dbase} ) { my @accum = (); my @working = (); @@ -1985,6 +2069,31 @@ sub fix_sra_xml_encoding { return $data; } +sub fix_pubmed_xml_encoding { + + my $dbase = shift (@_); + my $data = shift (@_); + + if ( $dbase eq "pubmed" ) { + if ( $data =~ /<ArticleTitle>(.+?)<\/ArticleTitle>/ ) { + my $x = $1; + if ( $x =~ /\&\;/ || $x =~ /\<\;/ || $x =~ /\>\;/ ) { + $x = fix_mixed_content($x); + $data =~ s/<ArticleTitle>(.+?)<\/ArticleTitle>/<ArticleTitle>$x<\/ArticleTitle>/; + } + } + if ( $data =~ /<AbstractText>(.+?)<\/AbstractText>/ ) { + my $x = $1; + if ( $x =~ /\&\;/ || $x =~ /\<\;/ || $x =~ /\>\;/ ) { + $x = fix_mixed_content($x); + $data =~ s/<AbstractText>(.+?)<\/AbstractText>/<AbstractText>$x<\/AbstractText>/; + } + } + } + + return $data; +} + sub eftch { # ... | edirect.pl -fetch -format gp | ... @@ -2321,6 +2430,10 @@ sub eftch { $$data = fix_sra_xml_encoding($dbase, $$data); } + if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_pubmed_xml_encoding($dbase, $$data); + } + if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { # remove blank lines in FASTA format $$data =~ s/\n+/\n/g; @@ -2472,6 +2585,10 @@ sub eftch { $$data = fix_sra_xml_encoding($dbase, $$data); } + if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) { + $$data = fix_pubmed_xml_encoding($dbase, $$data); + } + if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) { # remove blank lines in FASTA format $$data =~ s/\n+/\n/g; @@ -4104,6 +4221,11 @@ Date Constraint -mindate Start of date range -maxdate End of date range +Limit by Field + + -field Query words individually in field + -pairs Query overlapping word pairs + Spell Check -spell Correct misspellings in query @@ -4171,17 +4293,6 @@ sub remove_stop_words { my $qury = shift (@_); - my $stop_words="#a#about#again#all#almost#also#although#always#among#an#and#" . - "another#any#are#as#at#be#because#been#before#being#between#both#but#by#can#" . - "could#did#do#does#done#due#during#each#either#enough#especially#etc#for#" . - "found#from#further#had#has#have#having#here#how#however#i#if#in#into#is#it#" . - "its#itself#just#kg#km#made#mainly#make#may#mg#might#ml#mm#most#mostly#" . - "must#nearly#neither#no#nor#obtained#of#often#on#our#overall#perhaps#pmid#" . - "quite#rather#really#regarding#seem#seen#several#should#show#showed#shown#" . - "shows#significantly#since#so#some#such#than#that#the#their#theirs#them#" . - "then#there#therefore#these#they#this#those#through#thus#to#upon#use#used#" . - "using#various#very#was#we#were#what#when#which#while#with#within#without#would#"; - # split to protect against regular expression artifacts $qury =~ s/[^a-zA-Z0-9]/ /g; $qury =~ s/ +/ /g; @@ -4259,6 +4370,12 @@ sub field_each_pair { my $prev = ""; foreach $term (@words) { + my $trm = lc($term); + $trm = "#$trm#"; + if ($stop_words =~ /$trm/) { + $prev = ""; + $term = ""; + } if ( $prev ne "" ) { $qury .= "$pfx\"$prev $term\" [$fld]"; $pfx = " AND "; @@ -4294,14 +4411,15 @@ sub esrch { "status=s" => \$status, "type=s" => \$gtype, "clean" => \$clean, + "field=s" => \$field, "word" => \$word, "drop" => \$drop, "trim" => \$trim, "trunc" => \$trunc, "spell" => \$spell, - "split=s" => \$field, + "split=s" => \$split, "merge=s" => \$meadow, - "pair=s" => \$pair, + "pairs=s" => \$pair, "email=s" => \$emaddr, "tool=s" => \$tuul, "help" => \$help, @@ -4419,8 +4537,8 @@ sub esrch { } # force each query word to be separately fielded, combined with AND (undocumented) - if ( $field ne "" ) { - $query = field_each_word ($field, $query); + if ( $split ne "" ) { + $query = field_each_word ($split, $query); } # force each query word to be separately fielded, combined with OR (undocumented) @@ -4428,7 +4546,13 @@ sub esrch { $query = merge_each_word ($meadow, $query); } - # separately field query word pairs in future experimental bigram index (undocumented) + # -field combines -drop and -split (-field TITL produces same behavior as Web PubMed) + if ( $field ne "" ) { + $query = remove_stop_words ($query); + $query = field_each_word ($field, $query); + } + + # -pairs separately fields query word pairs, breaking chain at stop words if ( $pair ne "" ) { $query = remove_punctuation ($query); if ( $query =~ /^ +(.+)$/ ) { @@ -78,7 +78,7 @@ import ( // VERSION AND HELP MESSAGE TEXT -const xtractVersion = "6.90" +const xtractVersion = "7.00" const xtractHelp = ` Overview @@ -2402,9 +2402,22 @@ func IsAllCapsOrDigits(str string) bool { func HasAngleBracket(str string) bool { + hasAmp := false + hasSemi := false + for _, ch := range str { if ch == '<' || ch == '>' { return true + } else if ch == '&' { + hasAmp = true + } else if ch == ';' { + hasSemi = true + } + } + + if hasAmp && hasSemi { + if strings.Contains(str, "<") || strings.Contains(str, ">") || strings.Contains(str, "&") { + return true } } @@ -2802,19 +2815,78 @@ func ParseFlag(str string) OpType { var ( rlock sync.Mutex replr *strings.Replacer + rpair *strings.Replacer ) func DoHtmlReplace(str string) string { - // replacer not reentrant, protected by mutex + // replacer/repairer not reentrant, protected by mutex rlock.Lock() if replr == nil { - replr = strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "", - "<b>", "", "</b>", "", "<b/>", "", "<b />", "", - "<u>", "", "</u>", "", "<u/>", "", "<u />", "", - "<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "", - "<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "") + // handles mixed-content tags, with zero, one, or two levels of encoding + replr = strings.NewReplacer( + "<i>", "", + "</i>", "", + "<i/>", "", + "<i />", "", + "<b>", "", + "</b>", "", + "<b/>", "", + "<b />", "", + "<u>", "", + "</u>", "", + "<u/>", "", + "<u />", "", + "<sub>", "", + "</sub>", "", + "<sub/>", "", + "<sub />", "", + "<sup>", "", + "</sup>", "", + "<sup/>", "", + "<sup />", "", + "<i>", "", + "</i>", "", + "<i/>", "", + "<i />", "", + "<b>", "", + "</b>", "", + "<b/>", "", + "<b />", "", + "<u>", "", + "</u>", "", + "<u/>", "", + "<u />", "", + "<sub>", "", + "</sub>", "", + "<sub/>", "", + "<sub />", "", + "<sup>", "", + "</sup>", "", + "<sup/>", "", + "<sup />", "", + "&lt;i&gt;", "", + "&lt;/i&gt;", "", + "&lt;i/&gt;", "", + "&lt;i /&gt;", "", + "&lt;b&gt;", "", + "&lt;/b&gt;", "", + "&lt;b/&gt;", "", + "&lt;b /&gt;", "", + "&lt;u&gt;", "", + "&lt;/u&gt;", "", + "&lt;u/&gt;", "", + "&lt;u /&gt;", "", + "&lt;sub&gt;", "", + "&lt;/sub&gt;", "", + "&lt;sub/&gt;", "", + "&lt;sub /&gt;", "", + "&lt;sup&gt;", "", + "&lt;/sup&gt;", "", + "&lt;sup/&gt;", "", + "&lt;sup /&gt;", "", + ) } if replr != nil { @@ -2826,6 +2898,66 @@ func DoHtmlReplace(str string) string { return str } +func DoHtmlRepair(str string) string { + + // replacer/repairer not reentrant, protected by mutex + rlock.Lock() + + if rpair == nil { + // handles mixed-content tags, with zero, one, or two levels of encoding + rpair = strings.NewReplacer( + "<i>", "<i>", + "</i>", "</i>", + "<i/>", "<i/>", + "<i />", "<i/>", + "<b>", "<b>", + "</b>", "</b>", + "<b/>", "<b/>", + "<b />", "<b/>", + "<u>", "<u>", + "</u>", "</u>", + "<u/>", "<u/>", + "<u />", "<u/>", + "<sub>", "<sub>", + "</sub>", "</sub>", + "<sub/>", "<sub/>", + "<sub />", "<sub/>", + "<sup>", "<sup>", + "</sup>", "</sup>", + "<sup/>", "<sup/>", + "<sup />", "<sup/>", + "&lt;i&gt;", "<i>", + "&lt;/i&gt;", "</i>", + "&lt;i/&gt;", "<i/>", + "&lt;i /&gt;", "<i/>", + "&lt;b&gt;", "<b>", + "&lt;/b&gt;", "</b>", + "&lt;b/&gt;", "<b/>", + "&lt;b /&gt;", "<b/>", + "&lt;u&gt;", "<u>", + "&lt;/u&gt;", "</u>", + "&lt;u/&gt;", "<u/>", + "&lt;u /&gt;", "<u/>", + "&lt;sub&gt;", "<sub>", + "&lt;/sub&gt;", "</sub>", + "&lt;sub/&gt;", "<sub/>", + "&lt;sub /&gt;", "<sub/>", + "&lt;sup&gt;", "<sup>", + "&lt;/sup&gt;", "</sup>", + "&lt;sup/&gt;", "<sup/>", + "&lt;sup /&gt;", "<sup/>", + ) + } + + if rpair != nil { + str = rpair.Replace(str) + } + + rlock.Unlock() + + return str +} + func HasBadAccent(str string) bool { for _, ch := range str { @@ -5185,6 +5317,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasMarkup(str) { str = SimulateUnicodeMarkup(str) } + if HasAngleBracket(str) { + str = DoHtmlRepair(str) + } } if tbls.DeAccent { if IsNotASCII(str) { @@ -5546,6 +5681,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasMarkup(name) { name = SimulateUnicodeMarkup(name) } + if HasAngleBracket(name) { + name = DoHtmlRepair(name) + } } if tbls.DeAccent { if IsNotASCII(name) { @@ -6991,17 +7129,6 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st addToIndex := func(item, past string) string { - if IsNotASCII(item) { - item = DoAccentTransform(item) - } - item = strings.ToLower(item) - if HasBadSpace(item) { - item = CleanupBadSpaces(item) - } - if HasMarkup(item) { - item = RemoveUnicodeMarkup(item) - } - item = TrimPunctuation(item) if item == "" { return "" } @@ -7026,17 +7153,35 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st processElement(func(str string) { if str != "" { - // break terms at spaces + if IsNotASCII(str) { + str = DoAccentTransform(str) + } + str = strings.ToLower(str) + if HasBadSpace(str) { + str = CleanupBadSpaces(str) + } + if HasMarkup(str) { + str = RemoveUnicodeMarkup(str) + } + if HasAngleBracket(str) { + str = DoHtmlReplace(str) + } + + // break terms at spaces, allowing hyphenated words terms := strings.Fields(str) + past := "" for _, item := range terms { - // index single term, allowing hyphenated words - addToIndex(item, "") + // allow parentheses in chemical formula + item = TrimPunctuation(item) + // index term and adjacent term pairs + past = addToIndex(item, past) } + // break words at non-alphanumeric punctuation words := strings.FieldsFunc(str, func(c rune) bool { return !unicode.IsLetter(c) && !unicode.IsNumber(c) }) - past := "" + past = "" for _, item := range words { // index word and adjacent word pairs past = addToIndex(item, past) @@ -8076,6 +8221,9 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act if HasMarkup(name) { name = SimulateUnicodeMarkup(name) } + if HasAngleBracket(name) { + name = DoHtmlReplace(name) + } } if tbls.DeAccent { if IsNotASCII(name) { |