summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Aaron M. Ucko <ucko@debian.org> 2017-10-05 17:51:06 -0400
committer Aaron M. Ucko <ucko@debian.org> 2017-10-05 17:53:49 -0400
commit 5b42e03d8658c26a715e088e82f3379b145b3ba2 (patch)
tree 79475ee9c43ce86a8f730bd3f648e27e38ca9c0f
parent 386713d0283ba164bfe832eca13e0409cb1349c1 (diff)
parent d253448077bb6306232bdaf404df863f5aa42ce8 (diff)
Merge tag 'upstream/7.00.20170710+ds'
Upstream version 7.00.20170710(+ds).
-rw-r--r-- debian/changelog 6
-rwxr-xr-x edirect.pl 162
-rw-r--r-- xtract.go 192
3 files changed, 319 insertions, 41 deletions
diff --git a/debian/changelog b/debian/changelog
index 956a4bf..21d4a42 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ncbi-entrez-direct (7.00.20170710+ds-1) UNRELEASED; urgency=medium
+
+ * New upstream release. (NOT YET RELEASED.)
+
+ -- Aaron M. Ucko <ucko@debian.org> Thu, 05 Oct 2017 17:51:06 -0400
+
ncbi-entrez-direct (6.90.20170705+ds-2) unstable; urgency=medium
* debian/rules: Rework Go-related logic to accommodate builds on
diff --git a/edirect.pl b/edirect.pl
index d888659..a8d1050 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -87,7 +87,7 @@ use constant true => 1;
# EDirect version number
-$version = "6.90";
+$version = "7.00";
# URL address components
@@ -171,6 +171,7 @@ sub clearflags {
$sort = "";
$source = "";
$spell = false;
+ $split = "";
$status = "";
$stp = "";
$strand = "";
@@ -184,6 +185,17 @@ sub clearflags {
$web = "";
$word = false;
$year = "";
+
+ $stop_words="#a#about#again#all#almost#also#although#always#among#an#and#" .
+ "another#any#are#as#at#be#because#been#before#being#between#both#but#by#can#" .
+ "could#did#do#does#done#due#during#each#either#enough#especially#etc#for#" .
+ "found#from#further#had#has#have#having#here#how#however#i#if#in#into#is#it#" .
+ "its#itself#just#kg#km#made#mainly#make#may#mg#might#ml#mm#most#mostly#" .
+ "must#nearly#neither#no#nor#obtained#of#often#on#our#overall#perhaps#pmid#" .
+ "quite#rather#really#regarding#seem#seen#several#should#show#showed#shown#" .
+ "shows#significantly#since#so#some#such#than#that#the#their#theirs#them#" .
+ "then#there#therefore#these#they#this#those#through#thus#to#upon#use#used#" .
+ "using#various#very#was#we#were#what#when#which#while#with#within#without#would#";
}
# gets a live UID for any database
@@ -1018,6 +1030,11 @@ Date Constraint
-mindate Start of date range
-maxdate End of date range
+Limit by Field
+
+ -field Query words individually in field
+ -pairs Query overlapping word pairs
+
Spell Check
-spell Correct misspellings in query
@@ -1353,7 +1370,9 @@ sub efilt {
"maxdate=s" => \$mxdate,
"datetype=s" => \$dttype,
"label=s" => \$lbl,
+ "field=s" => \$field,
"spell" => \$spell,
+ "pairs=s" => \$pair,
"pub=s" => \$pub,
"feature=s" => \$feature,
"location=s" => \$location,
@@ -1448,6 +1467,25 @@ sub efilt {
test_edirect ( $dbase, $web, $key, $num, "filter" );
+ # -field combines -drop and -split (-field TITL produces same behavior as Web PubMed)
+ if ( $field ne "" ) {
+ $query = remove_stop_words ($query);
+ $query = field_each_word ($field, $query);
+ }
+
+ # -pairs separately fields query word pairs, breaking chain at stop words
+ if ( $pair ne "" ) {
+ $query = remove_punctuation ($query);
+ if ( $query =~ /^ +(.+)$/ ) {
+ $query = $1;
+ }
+ if ( $query =~ /^(.+) +$/ ) {
+ $query = $1;
+ }
+ $query =~ s/ +/ /g;
+ $query = field_each_pair ($pair, $query);
+ }
+
# spell check each query word
if ( $spell ) {
$query = spell_check_query ($dbase, $query);
@@ -1488,6 +1526,7 @@ sub efilt {
$key = "";
$num = "";
$err = "";
+ my $trn = "";
$output = "";
@@ -1499,6 +1538,7 @@ sub efilt {
$web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/);
$key = $1 if ($output =~ /<QueryKey>(\S+)<\/QueryKey>/);
$num = $1 if ($output =~ /<Count>(\S+)<\/Count>/);
+ $trn = $1 if ($output =~ /<QueryTranslation>(.+?)<\/QueryTranslation>/i);
} else {
if ( ! $silent ) {
print STDERR "Retrying efilter, step $stp: $err\n";
@@ -1539,10 +1579,46 @@ sub efilt {
}
write_edirect ( $dbase, $web, $key, $num, $stp, $err, $tool, $email );
+
+ if ( $log ) {
+ if ( $trn ne "" ) {
+ print STDERR "$trn\n";
+ }
+ }
}
# efetch -format docsum calls esmry to retrieve document summaries
+sub fix_mixed_content {
+
+ my $x = shift (@_);
+
+ while ( $x =~ /\&amp\;/ || $x =~ /\&lt\;/ || $x =~ /\&gt\;/ ) {
+ HTML::Entities::decode_entities($x);
+ }
+ # removed mixed content tags
+ $x =~ s|<b>||g;
+ $x =~ s|<i>||g;
+ $x =~ s|<u>||g;
+ $x =~ s|<sup>||g;
+ $x =~ s|<sub>||g;
+ $x =~ s|</b>||g;
+ $x =~ s|</i>||g;
+ $x =~ s|</u>||g;
+ $x =~ s|</sup>||g;
+ $x =~ s|</sub>||g;
+ $x =~ s|<b/>||g;
+ $x =~ s|<i/>||g;
+ $x =~ s|<u/>||g;
+ $x =~ s|<sup/>||g;
+ $x =~ s|<sub/>||g;
+ # Reencode any resulting less-than or greater-than entities to avoid breaking the XML.
+ $x =~ s/</&lt;/g;
+ $x =~ s/>/&gt;/g;
+
+ return $x;
+}
+
my %fields_to_fix = (
'biosample' => ['SampleData'],
'medgen' => ['ConceptMeta'],
@@ -1554,7 +1630,15 @@ sub fix_one_encoding {
my $dbase = shift (@_);
my $data = shift (@_);
- if ( $dbase eq "gene" ) {
+ if ( $dbase eq "pubmed" ) {
+ if ( $data =~ /<Title>(.+?)<\/Title>/ ) {
+ my $x = $1;
+ if ( $x =~ /\&amp\;/ || $x =~ /\&lt\;/ || $x =~ /\&gt\;/ ) {
+ $x = fix_mixed_content($x);
+ $data =~ s/<Title>(.+?)<\/Title>/<Title>$x<\/Title>/;
+ }
+ }
+ } elsif ( $dbase eq "gene" ) {
if ( $data =~ /<Summary>(.+?)<\/Summary>/ ) {
my $x = $1;
if ( $x =~ /\&amp\;/ ) {
@@ -1598,7 +1682,7 @@ sub fix_bad_encoding {
my $dbase = shift (@_);
my $data = shift (@_);
- if ( $dbase eq "gene" || $dbase eq "assembly" || defined $fields_to_fix{$dbase} ) {
+ if ( $dbase eq "pubmed" || $dbase eq "gene" || $dbase eq "assembly" || defined $fields_to_fix{$dbase} ) {
my @accum = ();
my @working = ();
@@ -1985,6 +2069,31 @@ sub fix_sra_xml_encoding {
return $data;
}
+sub fix_pubmed_xml_encoding {
+
+ my $dbase = shift (@_);
+ my $data = shift (@_);
+
+ if ( $dbase eq "pubmed" ) {
+ if ( $data =~ /<ArticleTitle>(.+?)<\/ArticleTitle>/ ) {
+ my $x = $1;
+ if ( $x =~ /\&amp\;/ || $x =~ /\&lt\;/ || $x =~ /\&gt\;/ ) {
+ $x = fix_mixed_content($x);
+ $data =~ s/<ArticleTitle>(.+?)<\/ArticleTitle>/<ArticleTitle>$x<\/ArticleTitle>/;
+ }
+ }
+ if ( $data =~ /<AbstractText>(.+?)<\/AbstractText>/ ) {
+ my $x = $1;
+ if ( $x =~ /\&amp\;/ || $x =~ /\&lt\;/ || $x =~ /\&gt\;/ ) {
+ $x = fix_mixed_content($x);
+ $data =~ s/<AbstractText>(.+?)<\/AbstractText>/<AbstractText>$x<\/AbstractText>/;
+ }
+ }
+ }
+
+ return $data;
+}
+
sub eftch {
# ... | edirect.pl -fetch -format gp | ...
@@ -2321,6 +2430,10 @@ sub eftch {
$$data = fix_sra_xml_encoding($dbase, $$data);
}
+ if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_pubmed_xml_encoding($dbase, $$data);
+ }
+
if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
# remove blank lines in FASTA format
$$data =~ s/\n+/\n/g;
@@ -2472,6 +2585,10 @@ sub eftch {
$$data = fix_sra_xml_encoding($dbase, $$data);
}
+ if ( $dbase eq "pubmed" and $type eq "full" and $mode eq "xml" ) {
+ $$data = fix_pubmed_xml_encoding($dbase, $$data);
+ }
+
if ( $type eq "fasta" or $type eq "fasta_cds_aa" or $type eq "fasta_cds_na" or $type eq "gene_fasta" ) {
# remove blank lines in FASTA format
$$data =~ s/\n+/\n/g;
@@ -4104,6 +4221,11 @@ Date Constraint
-mindate Start of date range
-maxdate End of date range
+Limit by Field
+
+ -field Query words individually in field
+ -pairs Query overlapping word pairs
+
Spell Check
-spell Correct misspellings in query
@@ -4171,17 +4293,6 @@ sub remove_stop_words {
my $qury = shift (@_);
- my $stop_words="#a#about#again#all#almost#also#although#always#among#an#and#" .
- "another#any#are#as#at#be#because#been#before#being#between#both#but#by#can#" .
- "could#did#do#does#done#due#during#each#either#enough#especially#etc#for#" .
- "found#from#further#had#has#have#having#here#how#however#i#if#in#into#is#it#" .
- "its#itself#just#kg#km#made#mainly#make#may#mg#might#ml#mm#most#mostly#" .
- "must#nearly#neither#no#nor#obtained#of#often#on#our#overall#perhaps#pmid#" .
- "quite#rather#really#regarding#seem#seen#several#should#show#showed#shown#" .
- "shows#significantly#since#so#some#such#than#that#the#their#theirs#them#" .
- "then#there#therefore#these#they#this#those#through#thus#to#upon#use#used#" .
- "using#various#very#was#we#were#what#when#which#while#with#within#without#would#";
-
# split to protect against regular expression artifacts
$qury =~ s/[^a-zA-Z0-9]/ /g;
$qury =~ s/ +/ /g;
@@ -4259,6 +4370,12 @@ sub field_each_pair {
my $prev = "";
foreach $term (@words) {
+ my $trm = lc($term);
+ $trm = "#$trm#";
+ if ($stop_words =~ /$trm/) {
+ $prev = "";
+ $term = "";
+ }
if ( $prev ne "" ) {
$qury .= "$pfx\"$prev $term\" [$fld]";
$pfx = " AND ";
@@ -4294,14 +4411,15 @@ sub esrch {
"status=s" => \$status,
"type=s" => \$gtype,
"clean" => \$clean,
+ "field=s" => \$field,
"word" => \$word,
"drop" => \$drop,
"trim" => \$trim,
"trunc" => \$trunc,
"spell" => \$spell,
- "split=s" => \$field,
+ "split=s" => \$split,
"merge=s" => \$meadow,
- "pair=s" => \$pair,
+ "pairs=s" => \$pair,
"email=s" => \$emaddr,
"tool=s" => \$tuul,
"help" => \$help,
@@ -4419,8 +4537,8 @@ sub esrch {
}
# force each query word to be separately fielded, combined with AND (undocumented)
- if ( $field ne "" ) {
- $query = field_each_word ($field, $query);
+ if ( $split ne "" ) {
+ $query = field_each_word ($split, $query);
}
# force each query word to be separately fielded, combined with OR (undocumented)
@@ -4428,7 +4546,13 @@ sub esrch {
$query = merge_each_word ($meadow, $query);
}
- # separately field query word pairs in future experimental bigram index (undocumented)
+ # -field combines -drop and -split (-field TITL produces same behavior as Web PubMed)
+ if ( $field ne "" ) {
+ $query = remove_stop_words ($query);
+ $query = field_each_word ($field, $query);
+ }
+
+ # -pairs separately fields query word pairs, breaking chain at stop words
if ( $pair ne "" ) {
$query = remove_punctuation ($query);
if ( $query =~ /^ +(.+)$/ ) {
diff --git a/xtract.go b/xtract.go
index 48c0aae..5b1e3df 100644
--- a/xtract.go
+++ b/xtract.go
@@ -78,7 +78,7 @@ import (
// VERSION AND HELP MESSAGE TEXT
-const xtractVersion = "6.90"
+const xtractVersion = "7.00"
const xtractHelp = `
Overview
@@ -2402,9 +2402,22 @@ func IsAllCapsOrDigits(str string) bool {
func HasAngleBracket(str string) bool {
+ hasAmp := false
+ hasSemi := false
+
for _, ch := range str {
if ch == '<' || ch == '>' {
return true
+ } else if ch == '&' {
+ hasAmp = true
+ } else if ch == ';' {
+ hasSemi = true
+ }
+ }
+
+ if hasAmp && hasSemi {
+ if strings.Contains(str, "&lt;") || strings.Contains(str, "&gt;") || strings.Contains(str, "&amp;") {
+ return true
}
}
@@ -2802,19 +2815,78 @@ func ParseFlag(str string) OpType {
var (
rlock sync.Mutex
replr *strings.Replacer
+ rpair *strings.Replacer
)
func DoHtmlReplace(str string) string {
- // replacer not reentrant, protected by mutex
+ // replacer/repairer not reentrant, protected by mutex
rlock.Lock()
if replr == nil {
- replr = strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "",
- "<b>", "", "</b>", "", "<b/>", "", "<b />", "",
- "<u>", "", "</u>", "", "<u/>", "", "<u />", "",
- "<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "",
- "<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "")
+ // handles mixed-content tags, with zero, one, or two levels of encoding
+ replr = strings.NewReplacer(
+ "<i>", "",
+ "</i>", "",
+ "<i/>", "",
+ "<i />", "",
+ "<b>", "",
+ "</b>", "",
+ "<b/>", "",
+ "<b />", "",
+ "<u>", "",
+ "</u>", "",
+ "<u/>", "",
+ "<u />", "",
+ "<sub>", "",
+ "</sub>", "",
+ "<sub/>", "",
+ "<sub />", "",
+ "<sup>", "",
+ "</sup>", "",
+ "<sup/>", "",
+ "<sup />", "",
+ "&lt;i&gt;", "",
+ "&lt;/i&gt;", "",
+ "&lt;i/&gt;", "",
+ "&lt;i /&gt;", "",
+ "&lt;b&gt;", "",
+ "&lt;/b&gt;", "",
+ "&lt;b/&gt;", "",
+ "&lt;b /&gt;", "",
+ "&lt;u&gt;", "",
+ "&lt;/u&gt;", "",
+ "&lt;u/&gt;", "",
+ "&lt;u /&gt;", "",
+ "&lt;sub&gt;", "",
+ "&lt;/sub&gt;", "",
+ "&lt;sub/&gt;", "",
+ "&lt;sub /&gt;", "",
+ "&lt;sup&gt;", "",
+ "&lt;/sup&gt;", "",
+ "&lt;sup/&gt;", "",
+ "&lt;sup /&gt;", "",
+ "&amp;lt;i&amp;gt;", "",
+ "&amp;lt;/i&amp;gt;", "",
+ "&amp;lt;i/&amp;gt;", "",
+ "&amp;lt;i /&amp;gt;", "",
+ "&amp;lt;b&amp;gt;", "",
+ "&amp;lt;/b&amp;gt;", "",
+ "&amp;lt;b/&amp;gt;", "",
+ "&amp;lt;b /&amp;gt;", "",
+ "&amp;lt;u&amp;gt;", "",
+ "&amp;lt;/u&amp;gt;", "",
+ "&amp;lt;u/&amp;gt;", "",
+ "&amp;lt;u /&amp;gt;", "",
+ "&amp;lt;sub&amp;gt;", "",
+ "&amp;lt;/sub&amp;gt;", "",
+ "&amp;lt;sub/&amp;gt;", "",
+ "&amp;lt;sub /&amp;gt;", "",
+ "&amp;lt;sup&amp;gt;", "",
+ "&amp;lt;/sup&amp;gt;", "",
+ "&amp;lt;sup/&amp;gt;", "",
+ "&amp;lt;sup /&amp;gt;", "",
+ )
}
if replr != nil {
@@ -2826,6 +2898,66 @@ func DoHtmlReplace(str string) string {
return str
}
+func DoHtmlRepair(str string) string {
+
+ // replacer/repairer not reentrant, protected by mutex
+ rlock.Lock()
+
+ if rpair == nil {
+ // handles mixed-content tags, with zero, one, or two levels of encoding
+ rpair = strings.NewReplacer(
+ "&lt;i&gt;", "<i>",
+ "&lt;/i&gt;", "</i>",
+ "&lt;i/&gt;", "<i/>",
+ "&lt;i /&gt;", "<i/>",
+ "&lt;b&gt;", "<b>",
+ "&lt;/b&gt;", "</b>",
+ "&lt;b/&gt;", "<b/>",
+ "&lt;b /&gt;", "<b/>",
+ "&lt;u&gt;", "<u>",
+ "&lt;/u&gt;", "</u>",
+ "&lt;u/&gt;", "<u/>",
+ "&lt;u /&gt;", "<u/>",
+ "&lt;sub&gt;", "<sub>",
+ "&lt;/sub&gt;", "</sub>",
+ "&lt;sub/&gt;", "<sub/>",
+ "&lt;sub /&gt;", "<sub/>",
+ "&lt;sup&gt;", "<sup>",
+ "&lt;/sup&gt;", "</sup>",
+ "&lt;sup/&gt;", "<sup/>",
+ "&lt;sup /&gt;", "<sup/>",
+ "&amp;lt;i&amp;gt;", "<i>",
+ "&amp;lt;/i&amp;gt;", "</i>",
+ "&amp;lt;i/&amp;gt;", "<i/>",
+ "&amp;lt;i /&amp;gt;", "<i/>",
+ "&amp;lt;b&amp;gt;", "<b>",
+ "&amp;lt;/b&amp;gt;", "</b>",
+ "&amp;lt;b/&amp;gt;", "<b/>",
+ "&amp;lt;b /&amp;gt;", "<b/>",
+ "&amp;lt;u&amp;gt;", "<u>",
+ "&amp;lt;/u&amp;gt;", "</u>",
+ "&amp;lt;u/&amp;gt;", "<u/>",
+ "&amp;lt;u /&amp;gt;", "<u/>",
+ "&amp;lt;sub&amp;gt;", "<sub>",
+ "&amp;lt;/sub&amp;gt;", "</sub>",
+ "&amp;lt;sub/&amp;gt;", "<sub/>",
+ "&amp;lt;sub /&amp;gt;", "<sub/>",
+ "&amp;lt;sup&amp;gt;", "<sup>",
+ "&amp;lt;/sup&amp;gt;", "</sup>",
+ "&amp;lt;sup/&amp;gt;", "<sup/>",
+ "&amp;lt;sup /&amp;gt;", "<sup/>",
+ )
+ }
+
+ if rpair != nil {
+ str = rpair.Replace(str)
+ }
+
+ rlock.Unlock()
+
+ return str
+}
+
func HasBadAccent(str string) bool {
for _, ch := range str {
@@ -5185,6 +5317,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasMarkup(str) {
str = SimulateUnicodeMarkup(str)
}
+ if HasAngleBracket(str) {
+ str = DoHtmlRepair(str)
+ }
}
if tbls.DeAccent {
if IsNotASCII(str) {
@@ -5546,6 +5681,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasMarkup(name) {
name = SimulateUnicodeMarkup(name)
}
+ if HasAngleBracket(name) {
+ name = DoHtmlRepair(name)
+ }
}
if tbls.DeAccent {
if IsNotASCII(name) {
@@ -6991,17 +7129,6 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
addToIndex := func(item, past string) string {
- if IsNotASCII(item) {
- item = DoAccentTransform(item)
- }
- item = strings.ToLower(item)
- if HasBadSpace(item) {
- item = CleanupBadSpaces(item)
- }
- if HasMarkup(item) {
- item = RemoveUnicodeMarkup(item)
- }
- item = TrimPunctuation(item)
if item == "" {
return ""
}
@@ -7026,17 +7153,35 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
processElement(func(str string) {
if str != "" {
- // break terms at spaces
+ if IsNotASCII(str) {
+ str = DoAccentTransform(str)
+ }
+ str = strings.ToLower(str)
+ if HasBadSpace(str) {
+ str = CleanupBadSpaces(str)
+ }
+ if HasMarkup(str) {
+ str = RemoveUnicodeMarkup(str)
+ }
+ if HasAngleBracket(str) {
+ str = DoHtmlReplace(str)
+ }
+
+ // break terms at spaces, allowing hyphenated words
terms := strings.Fields(str)
+ past := ""
for _, item := range terms {
- // index single term, allowing hyphenated words
- addToIndex(item, "")
+ // allow parentheses in chemical formula
+ item = TrimPunctuation(item)
+ // index term and adjacent term pairs
+ past = addToIndex(item, past)
}
+
// break words at non-alphanumeric punctuation
words := strings.FieldsFunc(str, func(c rune) bool {
return !unicode.IsLetter(c) && !unicode.IsNumber(c)
})
- past := ""
+ past = ""
for _, item := range words {
// index word and adjacent word pairs
past = addToIndex(item, past)
@@ -8076,6 +8221,9 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
if HasMarkup(name) {
name = SimulateUnicodeMarkup(name)
}
+ if HasAngleBracket(name) {
+ name = DoHtmlReplace(name)
+ }
}
if tbls.DeAccent {
if IsNotASCII(name) {