#!/usr/bin/env perl # =========================================================================== # # PUBLIC DOMAIN NOTICE # National Center for Biotechnology Information (NCBI) # # This software/database is a "United States Government Work" under the # terms of the United States Copyright Act. It was written as part of # the author's official duties as a United States Government employee and # thus cannot be copyrighted. This software/database is freely available # to the public for use. The National Library of Medicine and the U.S. # Government do not place any restriction on its use or reproduction. # We would, however, appreciate having the NCBI and the author cited in # any work or product based on this material. # # Although all reasonable efforts have been taken to ensure the accuracy # and reliability of the software and data, the NLM and the U.S. # Government do not and cannot warrant the performance or results that # may be obtained by using this software or data. The NLM and the U.S. # Government disclaim all warranties, express or implied, including # warranties of performance, merchantability or fitness for any particular # purpose. # # =========================================================================== # # File Name: nquire # # Author: Jonathan Kans # # Version Creation Date: 8/20/12 # # ========================================================================== # Entrez Direct - EDirect # use strict; use warnings; my ($LibDir, $ScriptName); use File::Spec; # nquire version number $version = "10.9"; BEGIN { my $Volume; ($Volume, $LibDir, $ScriptName) = File::Spec->splitpath($0); $LibDir = File::Spec->catpath($Volume, $LibDir, ''); if (my $RealPathname = eval {readlink $0}) { do { $RealPathname = File::Spec->rel2abs($RealPathname, $LibDir); ($Volume, $LibDir, undef) = File::Spec->splitpath($RealPathname); $LibDir = File::Spec->catpath($Volume, $LibDir, '') } while ($RealPathname = eval {readlink $RealPathname}); } else { $LibDir = File::Spec->rel2abs($LibDir) } $LibDir .= '/aux/lib/perl5'; } use lib $LibDir; use JSON::PP; use LWP::UserAgent; use POSIX; use URI::Escape; use Net::FTP; use XML::Simple; # definitions use constant false => 0; use constant true => 1; # utility subroutines sub clearflags { %macros = (); $agent = "Nquire/1.0"; $alias = ""; $debug = false; $http = ""; $j2x = false; $output = ""; } sub map_macros { $qury = shift (@_); if ( $qury !~ /\(#/ ) { return $qury; } if ( scalar (keys %macros) > 0 ) { for ( keys %macros ) { $ky = $_; $vl = $macros{$_}; $qury =~ s/\((\#$ky)\)/$vl/g; } } return $qury; } sub read_aliases { if ( $alias ne "" ) { if (open (my $PROXY_IN, $alias)) { while ( $thisline = <$PROXY_IN> ) { $thisline =~ s/\r//; $thisline =~ s/\n//; $thisline =~ s/ +/ /g; $thisline =~ s/> 300); $usragnt->agent( "$agent" ); $res = $usragnt->get ( $urlx ); if ( $res->is_success) { $rslt = $res->content; } elsif ( $debug ) { print STDERR "STATUS: " . $res->status_line . "\n"; } if ( $rslt eq "" and $debug ) { print STDERR "No do_get output returned from '$urlx'\n"; } if ( $debug ) { print STDERR "$rslt\n"; } return $rslt; } $usragnt = new LWP::UserAgent (timeout => 300); $usragnt->agent( "$agent" ); $req = new HTTP::Request POST => "$urlx"; $req->content_type('application/x-www-form-urlencoded'); $req->content("$argx"); $res = $usragnt->request ( $req ); if ( $res->is_success) { $rslt = $res->content; } elsif ( $debug ) { print STDERR "STATUS: " . $res->status_line . "\n"; } if ( $rslt eq "" && $debug ) { if ( $argx ne "" ) { $urlx .= "?"; $urlx .= "$argx"; } print STDERR "No do_post output returned from '$urlx'\n"; } if ( $debug ) { print STDERR "$rslt\n"; } return $rslt; } # uri_escape with backslash exceptions sub do_uri_escape { $patx = shift (@_); $rslt = ""; while ( $patx ne "" ) { if ( $patx =~ /^\\\\(.+)/ ) { $rslt .= "\\"; $patx = $1; } elsif ( $patx =~ /^\\(.)(.+)/ ) { $rslt .= $1; $patx = $2; } elsif ( $patx =~ /^(.)(.+)/ ) { $rslt .= uri_escape ($1); $patx = $2; } elsif ( $patx =~ /^(.)/ ) { $rslt .= uri_escape ($1); $patx = ""; } } return $rslt; } sub convert_bools { my %unrecognized; local *_convert_bools = sub { my $ref_type = ref($_[0]); if (!$ref_type) { # Nothing. } elsif ($ref_type eq 'HASH') { _convert_bools($_) for values(%{ $_[0] }); } elsif ($ref_type eq 'ARRAY') { _convert_bools($_) for @{ $_[0] }; } elsif ( $ref_type eq 'JSON::PP::Boolean' || $ref_type eq 'Types::Serialiser::Boolean' ) { $_[0] = $_[0] ? 1 : 0; } else { ++$unrecognized{$ref_type}; } }; &_convert_bools; } # nquire executes an external URL query from command line arguments my $nquire_help = qq{ Query Commands -ftp Uses FTP instead of HTTP -get Uses HTTP GET instead of POST -url Base URL for external search Documentation -help Print this document -examples Examples of advanced queries -version Print version number Examples nquire -get -url "http://collections.mnh.si.edu/services/resolver/resolver.php" \\ -voucher "Birds:625456" | xtract -pattern Result -element ScientificName Country nquire -get -url http://w1.weather.gov/xml/current_obs/KSFO.xml | xtract -pattern current_observation -tab "\\n" \\ -element weather temp_f wind_dir wind_mph nquire -url "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" elink.fcgi \\ -dbfrom protein -db protein -cmd neighbor -linkname protein_protein -id NP_476532.1 nquire -eutils efetch.fcgi -db pubmed -id 2539356 -rettype medline -retmode text nquire -eutils esummary.fcgi -db pubmed -id 2539356 -version 2.0 nquire -eutils esearch.fcgi -db pubmed -term "transposition immunity Tn3" | xtract -pattern eSearchResult -element QueryTranslation nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 | grep acen | cut -f 1,2,6,7 | grep "^X\\t" }; my $nquire_examples = qq{ Medical Subject Headings nquire -get -url "http://id.nlm.nih.gov/mesh/sparql" \\ -query "PREFIX rdf: \\ SELECT DISTINCT ?class FROM \\ WHERE { ?s rdf:type ?class } ORDER BY ?class" | xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]" meshv:AllowedDescriptorQualifierPair meshv:CheckTag meshv:Concept meshv:DisallowedDescriptorQualifierPair meshv:GeographicalDescriptor meshv:PublicationType meshv:Qualifier meshv:SCR_Chemical meshv:SCR_Disease meshv:SCR_Organism meshv:SCR_Protocol meshv:Term meshv:TopicalDescriptor meshv:TreeNumber MeSH Predicates nquire -get -url "http://id.nlm.nih.gov/mesh/sparql" \\ -query "SELECT DISTINCT ?p FROM WHERE { ?s ?p ?o } ORDER BY ?p" | xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]" meshv:abbreviation meshv:active meshv:allowableQualifier meshv:altLabel meshv:annotation meshv:broaderConcept meshv:broaderDescriptor meshv:broaderQualifier meshv:casn1_label meshv:concept meshv:considerAlso meshv:dateCreated meshv:dateEstablished meshv:dateRevised meshv:entryVersion meshv:frequency meshv:hasDescriptor meshv:hasQualifier meshv:historyNote meshv:identifier meshv:indexerConsiderAlso meshv:lastActiveYear meshv:lexicalTag meshv:mappedTo meshv:narrowerConcept meshv:nlmClassificationNumber meshv:note meshv:onlineNote meshv:parentTreeNumber meshv:pharmacologicalAction meshv:prefLabel meshv:preferredConcept meshv:preferredMappedTo meshv:preferredTerm meshv:previousIndexing meshv:publicMeSHNote meshv:registryNumber meshv:relatedConcept meshv:relatedRegistryNumber meshv:scopeNote meshv:seeAlso meshv:sortVersion meshv:source meshv:term meshv:thesaurusID meshv:treeNumber meshv:useInstead WikiData Predicate List nquire -url "https://query.wikidata.org/sparql" \\ -query "SELECT ?property ?propertyType ?propertyLabel \\ ?propertyDescription ?propertyAltLabel WHERE { \\ ?property wikibase:propertyType ?propertyType . SERVICE wikibase:label \\ { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } } \\ ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), 'P')))" | xtract -pattern result -first "uri[http://www.wikidata.org/entity/|]" -first literal Selected WikiData Predicates P6 head of government P16 highway system P17 country P19 place of birth P21 sex or gender P22 father P25 mother P26 spouse P30 continent P31 instance of P35 head of state P36 capital P40 child P105 taxon rank P660 EC enzyme classification P672 MeSH Code P680 molecular function P681 cell component P682 biological process P685 NCBI Taxonomy ID P698 PubMed ID P699 Disease Ontology ID P932 PMCID P1340 eye color P2067 mass P2410 WikiPathways ID P2888 exact match Vitamin Binding Site nquire -get -url "http://www.wikidata.org/entity/Q22679758" | transmute -j2x | xtract -pattern entities -group claims -block P527 -element "value\@id" Children of JS Bach nquire -url "https://query.wikidata.org/sparql" \\ -query "SELECT ?child ?childLabel WHERE \\ { ?child wdt:P22 wd:Q1339. SERVICE wikibase:label \\ { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" | xtract -pattern result -block binding -if "\@name" -equals childLabel -element literal Eye Color Frequency nquire -url "https://query.wikidata.org/sparql" \\ -query "SELECT ?eyeColorLabel WHERE \\ { ?human wdt:P31 wd:Q5. ?human wdt:P1340 ?eyeColor. SERVICE wikibase:label \\ { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" | xtract -pattern result -element literal | sort-uniq-count-rank Federated Query nquire -url "https://query.wikidata.org/sparql" \\ -query " \\ PREFIX wp: \\ PREFIX dcterms: \\ PREFIX dc: \\ SELECT DISTINCT ?metabolite1Label ?metabolite2Label ?mass1 ?mass2 WITH { \\ SELECT ?metabolite1 ?metabolite2 WHERE { \\ ?pathwayItem wdt:P2410 'WP706'; \\ wdt:P2888 ?pwIri. \\ SERVICE { \\ ?pathway dc:identifier ?pwIri. \\ ?interaction rdf:type wp:Interaction; \\ wp:participants ?wpmb1, ?wpmb2; \\ dcterms:isPartOf ?pathway. \\ FILTER (?wpmb1 != ?wpmb2) \\ ?wpmb1 wp:bdbWikidata ?metabolite1. \\ ?wpmb2 wp:bdbWikidata ?metabolite2. \\ } \\ } \\ } AS %metabolites WHERE { \\ INCLUDE %metabolites. \\ ?metabolite1 wdt:P2067 ?mass1. \\ ?metabolite2 wdt:P2067 ?mass2. \\ SERVICE wikibase:label { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } \\ }" | xtract -pattern result -block binding -element "binding\@name" literal BioThings Queries nquire -variant variant "chr6:g.26093141G>A" -fields dbsnp.gene | xtract -pattern gene -element \@geneid nquire -gene query -q "symbol:OPN1MW" -species 9606 | xtract -pattern hits -element "\@_id" nquire -gene query -q "symbol:OPN1MW AND taxid:9606" | xtract -pattern hits -element "\@_id" nquire -gene gene 2652 -fields pathway.wikipathways | xtract -pattern pathway -element "\@id" nquire -gene query -q "pathway.wikipathways.id:WP455" -size 300 | xtract -pattern hits -element "\@_id" nquire -chem query -q "drugbank.targets.uniprot:P05231 AND drugbank.targets.actions:inhibitor" -fields hgvs | xtract -pattern hits -element "\@_id" EDirect Expansion ExtractIDs() { xtract -pattern BIO_THINGS -block Id -tab "\\n" -element "Id" } WrapIDs() { xtract -wrp BIO_THINGS -pattern opt -wrp "Type" -lbl "\$1" \\ -wrp "Count" -num "\$2" -block "\$2" -wrp "Id" -element "\$3" | xtract -format } nquire -gene query -q "symbol:OPN1MW AND taxid:9606" | WrapIDs entrezgene hits "\@entrezgene" | ExtractIDs | while read geneid do nquire -gene gene "\$geneid" -fields pathway.wikipathways done | WrapIDs pathway.wikipathways.id pathway "\@id" | ExtractIDs | while read pathid do nquire -gene query -q "pathway.wikipathways.id:\$pathid" -size 300 done | WrapIDs entrezgene hits "\@entrezgene" | ExtractIDs | sort -n }; my @pubchem_properties = qw( MolecularFormula MolecularWeight CanonicalSMILES IsomericSMILES InChI InChIKey IUPACName XLogP ExactMass MonoisotopicMass TPSA Complexity Charge HBondDonorCount HBondAcceptorCount RotatableBondCount HeavyAtomCount IsotopeAtomCount AtomStereoCount DefinedAtomStereoCount UndefinedAtomStereoCount BondStereoCount DefinedBondStereoCount UndefinedBondStereoCount CovalentUnitCount Volume3D XStericQuadrupole3D YStericQuadrupole3D ZStericQuadrupole3D FeatureCount3D FeatureAcceptorCount3D FeatureDonorCount3D FeatureAnionCount3D FeatureCationCount3D FeatureRingCount3D FeatureHydrophobeCount3D ConformerModelRMSD3D EffectiveRotorCount3D ConformerCount3D Fingerprint2D ); sub nquire { # nquire -url http://... -tag value -tag value | ... $url = ""; $arg = ""; $pfx = ""; $amp = ""; $pat = ""; $sfx = ""; @args = @ARGV; $max = scalar @args; %biothingsHash = ( '-gene' => 'http://mygene.info/v3', '-variant' => 'http://myvariant.info/v1', '-chem' => 'http://mychem.info/v1', '-drug' => 'http://c.biothings.io/v1', '-taxon' => 'http://t.biothings.io/v1', ); if ( $max < 1 ) { return; } if ( $ARGV[0] eq "-version" ) { print "nquire $version\n"; return; } if ( $ARGV[0] eq "-help" ) { print "nquire $version\n"; print $nquire_help; return; } # -examples prints advanced sparql queries (undocumented) if ( $ARGV[0] eq "-examples" or $ARGV[0] eq "-example" or $ARGV[0] eq "-extras" or $ARGV[0] eq "-extra" ) { print "nquire $version\n"; print $nquire_examples; return; } if ( $max < 2 ) { return; } $i = 0; # if present, -debug must be first argument, only prints generated URL (undocumented) if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-debug" ) { $i++; $debug = true; } } # if present, -ftp must be next if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-ftp" ) { $i++; if ( $i < $max ) { my $server = $args[$i]; $i++; if ( $i < $max ) { my $dir = $args[$i]; $i++; if ( $i < $max ) { my $fl = $args[$i]; my $ftp = new Net::FTP($server, Passive => 1) or die "Unable to connect to FTP server: $!"; $ftp->login or die "Unable to log in to FTP server: ", $ftp->message; $ftp->cwd($dir) or die "Unable to change to $dir: ", $ftp->message; $ftp->binary or warn "Unable to set binary mode: ", $ftp->message; if (! $ftp->get($fl, "/dev/stdout") ) { my $msg = $ftp->message; chomp $msg; print STDERR "\nFAILED TO DOWNLOAD:\n\n$fl ($msg\n"; } } } } return; } } # if present, -http get or -get must be next (now also allow -http post or -post) # nquire -get -url "http://collections.mnh.si.edu/services/resolver/resolver.php" -voucher "Birds:625456" if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-http" ) { $i++; if ( $i < $max ) { $http = $args[$i]; $i++; } } elsif ( $pat eq "-get" ) { $i++; $http = "get"; } elsif ( $pat eq "-post" ) { $i++; $http = "post"; } } # if present, -agent must be next argument (undocumented) if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-agent" ) { $i++; if ( $i < $max ) { $agent = $args[$i]; $i++; } } } # read file of keyword shortcuts for URL expansion if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-alias" ) { $i++; if ( $i < $max ) { $alias = $args[$i]; if ( $alias ne "" ) { read_aliases (); } $i++; } } } # read URL if ( $i < $max ) { $pat = $args[$i]; if ( $pat eq "-url" ) { $i++; if ( $i < $max ) { $url = $args[$i]; $url = map_macros ($url); $i++; } } elsif ( $pat eq "-ncbi" ) { # shortcut for ncbi base (undocumented) $i++; if ( $i < $max ) { $url = "https://www.ncbi.nlm.nih.gov"; } } elsif ( $pat eq "-eutils" ) { # shortcut for eutils base (undocumented) $i++; if ( $i < $max ) { $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"; } } elsif ( $pat eq "-test" ) { # shortcut for eutilstest base (undocumented) $i++; if ( $i < $max ) { $url = "https://eutilstest.ncbi.nlm.nih.gov/entrez/eutils"; } } elsif ( $pat eq "-qa" ) { # shortcut for eutils QA base (undocumented) $i++; if ( $i < $max ) { $url = "http://qa.ncbi.nlm.nih.gov/entrez/eutils"; } } elsif ( $pat eq "-hydra" ) { # internal citation match request (undocumented) $i++; if ( $i < $max ) { $url = "https://www.ncbi.nlm.nih.gov/projects/hydra/hydra_search.cgi"; $pat = $args[$i]; $pat = map_macros ($pat); $enc = do_uri_escape ($pat); $arg="search=pubmed_search_citation_top_20.1&query=$enc"; $amp = "&"; $i++; } } elsif ( $pat eq "-revhist" ) { # internal sequence revision history request (undocumented) $i++; if ( $i < $max ) { $url = "https://www.ncbi.nlm.nih.gov/sviewer/girevhist.cgi"; $pat = $args[$i]; $arg="cmd=seqid&txt=on&seqid=asntext&os=PUBSEQ_OS&val=$pat"; $amp = "&"; $i++; } } elsif ( $pat eq "-pubchem" ) { # shortcut for PubChem Power User Gateway REST service base (undocumented) # nquire -pubchem "compound/name/creatine/property" "IUPACName,MolecularWeight,MolecularFormula" "XML" $i++; if ( $i < $max ) { $url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"; if ( $i + 2 == $max && $args[$i] eq "compound" ) { # even shorter shortcut # nquire -pubchem compound creatine $pat = $args[$i + 1]; if ( $pat =~ /^-(.+)/ ) { } elsif ( $pat !~ /\// ) { $i = $i + 2; $url .= "/compound/name/"; $pat = map_macros ($pat); $url .= $pat; $url .= "/property/"; $sfx = join(",", @pubchem_properties); $url .= $sfx; $url .= "/XML"; } } } } elsif ( defined $biothingsHash{$pat} ) { # shortcuts for biothings services (undocumented) $i++; $url = $biothingsHash{$pat}; if ( $http eq "" ) { $http = "get"; } $j2x = true; } elsif ( $pat eq "-wikipathways" ) { # shortcut for webservice.wikipathways.org (undocumented) $i++; if ( $i < $max ) { $url = "http://webservice.wikipathways.org"; } } elsif ( $pat eq "-biosample" ) { # internal biosample_chk request on live database (undocumented) $i++; if ( $i < $max ) { $http = "get"; $url = "https://api-int.ncbi.nlm.nih.gov/biosample/fetch"; $bid = $args[$i]; $arg="format=source&id=$bid"; $amp = "&"; $i++; } } elsif ( $pat eq "-biosample-dev" ) { # internal biosample_chk request on development database (undocumented) $i++; if ( $i < $max ) { $http = "get"; $url = "https://dev-api-int.ncbi.nlm.nih.gov/biosample/fetch"; $bid = $args[$i]; $arg="format=source&id=$bid"; $amp = "&"; $i++; } } } if ( $url eq "" ) { return; } # hard-coded URL aliases for common NCBI web sites if ( $url =~ /\(#/ ) { $ky = "ncbi_url"; if ( $url =~ /\(#$ky\)/ ) { $vl = "https://www.ncbi.nlm.nih.gov"; $url =~ s/\((\#$ky)\)/$vl/g; } $ky = "eutils_url"; if ( $url =~ /\(#$ky\)/ ) { $vl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"; $url =~ s/\((\#$ky)\)/$vl/g; } } # arguments before next minus are added to base URL as /value $go_on = true; while ( $i < $max and $go_on ) { $pat = $args[$i]; if ( $pat =~ /^-(.+)/ ) { $go_on = false; } else { $pat = map_macros ($pat); $url .= "/" . $pat; $i++; } } # now expect tag with minus and value[s] without, add as &tag=value[,value] while ( $i < $max ) { $pat = $args[$i]; if ( $pat =~ /^-(.+)/ ) { $pat = $1; $pfx = $amp . "$pat="; $amp = ""; } else { $pat =~ s/^\\-/-/g; $pat = map_macros ($pat); $enc = do_uri_escape ($pat); $arg .= $pfx . $enc; $pfx = ","; $amp = "&"; } $i++; } # perform query $output = do_post ($url, $arg); if ( $j2x ) { my $jc = JSON::PP->new->ascii->pretty->allow_nonref; my $conv = $jc->decode($output); convert_bools($conv); my $result = XMLout($conv, SuppressEmpty => undef); # remove newlines, tabs, space between tokens, compress runs of spaces $result =~ s/\r/ /g; $result =~ s/\n/ /g; $result =~ s/\t//g; $result =~ s/ +/ /g; $result =~ s/> + flanking object if ( $result =~ /\s*?\s*?<\/opt>/ ) { $result =~ s/\s*?\s*?<\/opt>/>/g; } $output = "$result"; # restore newlines between objects $output =~ s/> *?\n