1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
|
PUBMED
First Authors
efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \
-block Author -position first -sep " " -element Initials,LastName \
-block Article -element ArticleTitle
6271474 5 MJ Casadaban Tn3: transposition and control.
5685784 2 RK Mortimer Suppressors and suppressible mutations in yeast.
4882854 2 ED Garber Proteins and enzymes as taxonomic tools.
6243420 1 NR Cozzarelli DNA gyrase and the supercoiling of DNA.
Formatted Authors
efetch -db pubmed -id 1413997,6301692,781293 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block PubDate -sep "-" -element Year,Month,MedlineDate \
-block Author -sep " " -tab "" \
-element "&COM" Initials,LastName -COM "(|)" |
perl -pe 's/(\t[^\t|]*)\|([^\t|]*)$/$1 and $2/; s/\|([^|]*)$/, and $1/; s/\|/, /g'
1413997 1992-Oct RK Mortimer, CR Contopoulou, and JS King
6301692 1983-Apr MA Krasnow and NR Cozzarelli
781293 1976-Jul MJ Casadaban
Medical Subject Headings
efetch -db pubmed -id 6092233,2539356,1937004 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block MeshHeading \
-subset DescriptorName -pfc "\n" -sep "|" -element @MajorTopicYN,DescriptorName \
-subset QualifierName -pfc " / " -sep "|" -element @MajorTopicYN,QualifierName |
sed -e 's/N|//g' -e 's/Y|/*/g'
6092233
Base Sequence
DNA Restriction Enzymes
DNA, Fungal / genetics / *isolation & purification
*Genes, Fungal
...
Book Authors and Editors
efetch -db pubmed -id 21433338 -format xml |
xtract -pattern PubmedBookArticle \
-path BookDocument.AuthorList.Author -element LastName \
-path BookDocument.Book.AuthorList.Author -element LastName
Fauci Desrosiers Coffin Hughes Varmus
Heterogeneous Data
efetch -db pubmed -id 21433338,17247418 -format xml |
xtract -pattern "PubmedArticleSet/*" \
-group "Book/AuthorList" -element LastName \
-group "Article/AuthorList" -element LastName
Coffin Hughes Varmus
Lederberg Cavalli Lederberg
Multiple Links
esearch -db pubmed -query "conotoxin AND dopamine [MAJR]" |
elink -target protein -cmd neighbor |
xtract -pattern LinkSet -if Link/Id -element IdList/Id Link/Id
28666811 17105332 9506485
23624852 17105332
14657161 27532980 27532978 19424304
12944511 31542395 17105332
Link Counts
elink -db protein -id NP_000509 -target pubmed |
elink -target protein -cmd neighbor |
xtract -wrp "Set,Rec" -pattern LinkSet \
-wrp "Uid" -element IdList/Id -wrp "Count" -num Link/Id |
xtract -pattern Rec -if Count -ge 50 -element Uid Count
32296183 17997
19372376 57
Markup Correction
for id in 8475897 8988608 9698410 10194376 15949988 16271163 17282049 \
19793852 20968289 21892341 22106757 22785267 22360335 23095895 23095897 \
25435818 26433210 27672066 28635620 29547395 29869631 29869640 29944225
do
efetch -db pubmed -format xml -id "$id" |
xtract -pattern PubmedArticle -plg "\n\n" -sep "\n\n" -tab "\n\n" \
-element MedlineCitation/PMID ArticleTitle Abstract/AbstractText
done
XML Normalization
echo "assembly 443538 Stat biosample 3737421 SampleData gene 5053 Summary medgen 162753 Name" |
xargs -n 3 sh -c 'efetch -db "$0" -id "$1" -format docsum |
xtract -pattern DocumentSummary -sep " | " -tab " - " -ret "\n\n" -lbl "$0" -element Id "$2"'
Record Counts
echo "diphtheria measles pertussis polio tuberculosis" |
xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" |
efilter -days 365 -datetype PDAT |
xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count'
diphtheria 20
measles 213
pertussis 69
polio 76
tuberculosis 1787
Citation Lookup
esearch -db pubmed -query "Beadle GW [AUTH] AND Tatum EL [AUTH]" |
elink -cited |
efilter -days 365 |
efetch -format abstract
Stopwords and Stemming
pm=$( efetch -db pubmed -id 2005826 -format xml )
echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle
echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle
echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle
echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle
DOI Extraction
esearch -db pubmed -query "Rowley JD [AUTH]" |
efetch -format xml |
xtract -pattern PubmedArticle \
-block ArticleId -if @IdType -equals doi \
-doi ArticleId
Combining Independent Queries
esearch -db protein -query "amyloid* [PROT]" |
elink -target pubmed -label prot_cit |
esearch -db gene -query "apo* [GENE]" |
elink -target pubmed -label gene_cit |
esearch -query "(#prot_cit) AND (#gene_cit)" |
efetch -format docsum |
xtract -pattern DocumentSummary -element Id Title |
cat -v
PMC
Formatting Tag Removal
efetch -db pmc -id 4729119 -format xml |
xtract -mixed -pattern article -group p \
-position first -tab "\n\n" -element p -plain p |
fold -w 70 -s | awk '{$1=$1};1'
The intestinal cells of <italic>Caenorhabditis elegans</italic> are
filled with heterogeneous granular organelles that are associated
with specific organ functions. The best studied of these organelles
...
The intestinal cells of Caenorhabditis elegans are filled with
heterogeneous granular organelles that are associated with specific
organ functions. The best studied of these organelles are lipid
...
SEQUENCE
Peptide Sequences
esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" |
efetch -format gpc |
xtract -insd complete mat_peptide "%peptide" product mol_wt peptide |
grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8
ADB43131.1 15 conotoxin Cal 1b 1708 LCCKRHHGCHPCGRT
ADB43128.1 16 conotoxin Cal 5.1 1829 DPAPCCQHPIETCCRR
AIC77105.1 17 conotoxin Lt1.4 1705 GCCSHPACDVNNPDICG
ADB43129.1 18 conotoxin Cal 5.2 2008 MIQRSQCCAVKKNCCHVG
ADD97803.1 20 conotoxin Cal 1.2 2206 AGCCPTIMYKTGACRTNRCR
AIC77085.1 21 conotoxin Bt14.8 2574 NECDNCMRSFCSMIYEKCRLK
ADB43125.1 22 conotoxin Cal 14.2 2157 GCPADCPNTCDSSNKCSPGFPG
AIC77154.1 23 conotoxin Bt14.19 2578 VREKDCPPHPVPGMHKCVCLKTC
Vitamin Biosynthesis
esearch -db pubmed -query "lycopene cyclase" |
elink -related |
elink -target protein |
efilter -organism rodents -source refseq |
efetch -format docsum |
xtract -pattern DocumentSummary -element AccessionVersion Title |
grep -i carotene | sort -V
NP_001346539.1 beta,beta-carotene 9',10'-oxygenase isoform 2 [Mus musculus]
NP_573480.1 beta,beta-carotene 9',10'-oxygenase isoform 1 [Mus musculus]
NP_446100.2 beta,beta-carotene 15,15'-dioxygenase [Rattus norvegicus]
NP_001121184.1 beta,beta-carotene 9',10'-oxygenase [Rattus norvegicus]
NP_001156500.1 beta,beta-carotene 15,15'-dioxygenase isoform 2 [Mus musculus]
NP_067461.2 beta,beta-carotene 15,15'-dioxygenase isoform 1 [Mus musculus]
Coding Sequences
efetch -db nuccore -id J01636.1 -format gbc |
xtract -insd CDS gene sub_sequence
J01636.1 lacI GTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCG...
J01636.1 lacZ ATGACCATGATTACGGATTCACTGGCCGTCGTTTTACAAC...
J01636.1 lacY ATGTACTATTTAAAAAACACAAACTTTTGGATGTTCGGTT...
J01636.1 lacA TTGAACATGCCAATGACCGAAAGAATAAGAGCAGGCAAGC...
Sequence Subregion
# Fetch record U54469 and print the requested subrange of its sequence,
# folded at 60 columns. The bracketed range is quoted so the shell hands it to
# xtract verbatim instead of treating [2881:1] as a filename glob pattern.
efetch -db nuccore -id U54469 -format gbc |
xtract -pattern INSDSeq -nucleic "INSDSeq_sequence[2881:1]" |
fold -w 60
CCGGTTTTAATGTAGGTTTTTATTAATATACTTTTCCGTCTAATCCATTATTGACAGTGA
CTACAAAAAGCGGATAGATTTTATATTATGCCGATTTTTGATAACAAAGGGGGTTCCGTT
TCGGTTTCGTTACGCGGGTCTTAGACAATAGTCACGATTAATCGCTACTGTTGCTTATAA
...
3'UTR Sequences
#!/bin/bash -norc
# Prints the sequence that follows each CDS in INSDSeq XML read from stdin.
# Input:   INSDSeq XML on stdin (passed straight through to xtract).
# Outputs: for every CDS row, either a ">accession 3'UTR: start..end" header
#          followed by the remaining sequence folded at 50 columns, or
#          ">accession NO 3'UTR" when the CDS ends at or past the sequence end.
ThreePrimeUTRs() {
  local acc pos seq
  xtract -pattern INSDSeq -ACC INSDSeq_accession-version -SEQ INSDSeq_sequence \
    -block INSDFeature -if INSDFeature_key -equals CDS \
    -pfc "\n" -element "&ACC" -rst -last INSDInterval_to -element "&SEQ" |
  while read -r acc pos seq
  do
    # Blank or incomplete rows carry no usable position; skip them rather
    # than handing an empty operand to the numeric comparison.
    [[ "$pos" =~ ^[0-9]+$ ]] || continue
    # Force base 10 so a position with leading zeros is not read as octal.
    pos=$((10#$pos))
    if (( pos < ${#seq} ))
    then
      printf ">%s 3'UTR: %d..%d\n" "$acc" "$((pos + 1))" "${#seq}"
      printf '%s\n' "${seq:pos}" | fold -w 50
    else
      printf ">%s NO 3'UTR\n" "$acc"
    fi
  done
}
esearch -db nuccore -query "5.5.1.19 [ECNO]" |
efilter -molecule mrna -source refseq |
efetch -format gbc | ThreePrimeUTRs
>NM_001328461.1 3'UTR: 1737..1871
gatgaatatagagttactgtgttgtaagctaatcatcatactgatgcaag
tgcattatcacatttacttctgctgatgattgttcataagattatgagtt
agccatttatcaaaaaaaaaaaaaaaaaaaaaaaa
>NM_001316759.1 3'UTR: 1628..1690
atccgagtaattcggaatcttgtccaattttatatagcctatattaatac
...
Amino Acid Composition
#!/bin/bash -norc
abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \
Xle Lys Leu Met Asn Pyl Pro Gln Arg \
Ser Thr Sec Val Trp Xxx Tyr Glx )
# Turns per-letter residue counts into a table keyed by three-letter code.
# Globals:   abbrev (read) - three-letter codes indexed by letter, a = 0
# Arguments: $1 - lines of "count letter" pairs using lowercase one-letter
#                 codes
# Outputs:   26 lines of "Abbrev<TAB>count" on stdout, passed through sort;
#            letters that never occur are reported with a count of 0
AminoAcidComp() {
  local -a count=()
  local num lttr idx ofs i
  while read -r num lttr
  do
    # An empty row (for example from an empty sequence) has no letter to map.
    [[ -n "$lttr" ]] || continue
    # A leading quote makes printf yield the character code of the letter.
    printf -v idx '%d' "'$lttr"
    ofs=$((idx - 97))
    # Anything outside a-z would index outside the 26 slots; ignore it.
    (( ofs >= 0 && ofs < 26 )) || continue
    count[ofs]="$num"
  done <<< "$1"
  for i in {0..25}
  do
    printf '%s\t%s\n' "${abbrev[i]}" "${count[i]-0}"
  done |
  sort
}
# Builds a residue-composition table with one column per input protein.
# Input:   stdin lines of "accession sequence gene", whitespace separated
# Outputs: a "GENE" header row followed by one tab-separated row per
#          three-letter code; rows containing no non-zero digit are dropped
# Depends on AminoAcidComp and on the external sort-uniq-count command.
AminoAcidJoin() {
  local result="" acc seq gene comp current
  while read -r acc seq gene
  do
    # Lowercase the sequence, keep letters only, and count each residue.
    comp="$(printf '%s\n' "$seq" | tr 'A-Z' 'a-z' | sed 's/[^a-z]//g' | fold -w 1 | sort-uniq-count)"
    current=$(AminoAcidComp "$comp")
    # Prefix this protein's column with a header row naming its gene.
    current=$(printf 'GENE\t%s\n%s' "$gene" "$current")
    if [[ -n "$result" ]]
    then
      # Append the new column to the table built so far, keyed on column 1.
      result=$(join -t $'\t' <(printf '%s\n' "$result") <(printf '%s\n' "$current"))
    else
      result=$current
    fi
  done
  printf '%s\n' "$result" |
  grep -e "GENE" -e "[1-9]"
}
ids="NP_001172026,NP_000509,NP_004001,NP_001243779"
efetch -db protein -id "$ids" -format gpc |
xtract -insd INSDSeq_sequence CDS gene |
AminoAcidJoin
GENE INS HBB DMD TTN
Ala 10 15 210 2084
Arg 5 3 193 1640
Asn 3 6 153 1111
Asp 2 7 185 1720
Cys 6 2 35 513
Gln 7 3 301 942
Glu 8 8 379 3193
Gly 12 13 104 2066
His 2 9 84 478
Ile 2 0 165 2062
Leu 20 18 438 2117
Lys 2 11 282 2943
Met 2 2 79 398
Phe 3 8 77 908
Pro 6 7 130 2517
Ser 5 5 239 2463
Thr 3 7 194 2546
Trp 2 2 67 466
Tyr 4 3 61 999
Val 6 18 186 3184
GENE
Chromosome Assignments
esearch -db gene -query "calmodulin * [PFN] AND mammalia [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary \
-def "-" -element Id Name MapLocation ScientificName |
head -n 30
801 CALM1 14q32.11 Homo sapiens
808 CALM3 19q13.32 Homo sapiens
805 CALM2 2p21 Homo sapiens
24242 Calm1 6q32 Rattus norvegicus
12313 Calm1 12 E Mus musculus
326597 CALM - Bos taurus
50663 Calm2 6q12 Rattus norvegicus
24244 Calm3 1q21 Rattus norvegicus
12315 Calm3 7 9.15 cM Mus musculus
12314 Calm2 17 E4 Mus musculus
617095 CALM1 - Bos taurus
396838 CALM3 6 Sus scrofa
...
Genome Range
esearch -db gene -query "Homo sapiens [ORGN] AND Y [CHR]" |
efilter -status alive | efetch -format docsum |
xtract -pattern DocumentSummary -NAME Name -DESC Description \
-block GenomicInfoType -if ChrLoc -equals Y \
-min ChrStart,ChrStop -element "&NAME" "&DESC" |
sort -k 1,1n | cut -f 2- |
grep -v pseudogene | grep -v uncharacterized |
between-two-genes ASMT IL3RA |
align-columns -g 4
IL3RA interleukin 3 receptor subunit alpha
SLC25A6 solute carrier family 25 member 6
LINC00106 long intergenic non-protein coding RNA 106
ASMTL-AS1 ASMTL antisense RNA 1
ASMTL acetylserotonin O-methyltransferase-like
P2RY8 purinergic receptor P2Y8
AKAP17A A-kinase anchoring protein 17A
ASMT acetylserotonin O-methyltransferase
Centromere Position
nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
grep acen | cut -f 1,2,6,7 | grep "^X"
X p 58100001 61000000
X q 61000001 63800000
Gene Regions
esearch -db gene -query "DDT [GENE] AND mouse [ORGN]" |
efetch -format docsum |
xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop |
xargs -n 3 sh -c 'efetch -db nuccore -format gb \
-id "$0" -chr_start "$1" -chr_stop "$2"'
LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015
DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783
VERSION NC_000076.6
...
FEATURES Location/Qualifiers
source 1..2142
/organism="Mus musculus"
/mol_type="genomic DNA"
/strain="C57BL/6J"
/db_xref="taxon:10090"
/chromosome="10"
gene 1..2142
/gene="Ddt"
mRNA join(1..159,462..637,1869..2142)
/gene="Ddt"
/product="D-dopachrome tautomerase"
/transcript_id="NM_010027.1"
CDS join(52..159,462..637,1869..1941)
/gene="Ddt"
/codon_start=1
/product="D-dopachrome decarboxylase"
/protein_id="NP_034157.1"
/translation="MPFVELETNLPASRIPAGLENRLCAATATILDKPEDRVSVTIRP
GMTLLMNKSTEPCAHLLVSSIGVVGTAEQNRTHSASFFKFLTEELSLDQDRIVIRFFP
...
Recursive Data
esearch -db gene -query "rbcL [GENE] AND maize [ORGN]" |
efetch -format xml |
xtract -pattern Entrezgene -block "**/Gene-commentary" \
-if Gene-commentary_type@value -equals genomic \
-tab "\n" -element Gene-commentary_accession |
sort | uniq
NC_001666
X86563
Z11973
Genes in Pathways
esearch -db gene -query "PAH [GENE]" -organism human |
elink -target biosystems |
efilter -pathway wikipathways |
elink -target gene |
efetch -format docsum |
xtract -pattern DocumentSummary -element Name Id Description |
grep -v pseudogene | grep -v uncharacterized |
sort -f
AANAT 15 aralkylamine N-acetyltransferase
ACADM 34 acyl-CoA dehydrogenase medium chain
ACHE 43 acetylcholinesterase (Cartwright blood group)
ADCYAP1 116 adenylate cyclase activating polypeptide 1
...
Gene Products
for sym in HBB BRCA2 CFTR RAG1
do
esearch -db gene -query "$sym [GENE] AND human [ORGN]" |
efilter -query "alive [PROP]" | efetch -format docsum |
xtract -pattern GenomicInfoType \
-element ChrAccVer ChrStart ChrStop |
while read acc str stp
do
efetch -db nuccore -format gbc \
-id "$acc" -chr_start "$str" -chr_stop "$stp" |
xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" \
gene "%transcription" "%translation" \
product transcription translation |
grep -i $'\t'"$sym"$'\t'
done
done
NC_000011.10 mRNA 3 HBB 626 hemoglobin, beta ACATTTGCTT...
NC_000011.10 CDS 3 HBB 147 hemoglobin subunit beta MVHLTPEEKS...
NC_000023.11 mRNA 78 DMD 13805 dystrophin, transcript variant X2 AGGAAGATGA...
NC_000023.11 mRNA 77 DMD 13794 dystrophin, transcript variant X6 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13800 dystrophin, transcript variant X5 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13785 dystrophin, transcript variant X7 ACTTTCCCCC...
NC_000023.11 mRNA 74 DMD 13593 dystrophin, transcript variant X8 ACTTTCCCCC...
NC_000023.11 mRNA 75 DMD 13625 dystrophin, transcript variant X9 ACTTTCCCCC...
...
Unfiltered Gene Lookup
for sym in ATP6 ATP7B CBD DMD HFE PAH PRNP TTN
do
esearch -db gene -query "$sym [GENE]" -organism human |
efetch -format docsum |
xtract -pattern DocumentSummary -def "-" -lbl "${sym}" \
-element NomenclatureSymbol Id Description CommonName
done
ATP6 MT-ATP6 4508 ATP synthase F0 subunit 6 human
ATP6 - 6775074 ATP synthase F0 subunit 6 Neandertal
ATP6 - 8923188 ATP synthase F0 subunit 6 Denisova hominin
CBD OPN1MW 2652 opsin 1, medium wave sensitive human
HBB HBB 3043 hemoglobin subunit beta human
HBB KRT89P 85344 keratin 89 pseudogene human
OPN1MW OPN1MW 2652 opsin 1, medium wave sensitive human
OPN1MW OPN1MW3 101060233 opsin 1, medium wave sensitive 3 human
Protein Coding Genes
for sym in MT-ATP6 BRCA2 CFTR HBB HFE IL9R OPN1MW PAH
do
esearch -db gene -query "$sym [PREF]" -organism human |
efetch -format docsum |
xtract -pattern DocumentSummary -def "-" \
-lbl "${sym}" -element Id Chromosome Description
done |
...
MT-ATP6 4508 MT ATP synthase F0 subunit 6
BRCA2 675 13 BRCA2 DNA repair associated
CFTR 1080 7 CF transmembrane conductance regulator
HBB 3043 11 hemoglobin subunit beta
HFE 3077 6 homeostatic iron regulator
IL9R 3581 X, Y interleukin 9 receptor
OPN1MW 2652 X opsin 1, medium wave sensitive
PAH 5053 12 phenylalanine hydroxylase
Common Pathways
...
while IFS=$'\t' read sym uid chr desc
do
elink -db gene -id "$uid" -target biosystems |
efilter -kind pathway |
efetch -format docsum |
xtract -pattern DocumentSummary -lbl "${sym}" \
-lower source -element externalid biosystemname
done |
sort -t $'\t' -k 2,2 -k 3,3 -k 1,1 |
awk 'a[$3]++{ if(a[$3]==2){ print b }; print $0}; {b=$0}'
MT-ATP6 kegg hsa01100 Metabolic pathways
PAH kegg hsa01100 Metabolic pathways
HBB reactome R-HSA-1430728 Metabolism
MT-ATP6 reactome R-HSA-1430728 Metabolism
PAH reactome R-HSA-1430728 Metabolism
CFTR reactome R-HSA-162582 Signal Transduction
OPN1MW reactome R-HSA-162582 Signal Transduction
...
TAXONOMY
Taxonomic Names
esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" |
efetch -format docsum |
xtract -pattern DocumentSummary -if CommonName \
-element Id ScientificName CommonName
57486 Mus musculus molossinus Japanese wild mouse
39442 Mus musculus musculus eastern European house mouse
35531 Mus musculus bactrianus southwestern Asian house mouse
10092 Mus musculus domesticus western European house mouse
10091 Mus musculus castaneus southeastern Asian house mouse
10090 Mus musculus house mouse
9838 Camelus dromedarius Arabian camel
9837 Camelus bactrianus Bactrian camel
STRUCTURE
Structural Similarity
esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" |
elink -related |
efilter -query "archaea [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary \
-if PdbClass -equals Hydrolase \
-element PdbAcc PdbDescr
3WIV Crystal Structure Of Pro-s324a/d356a
3WIU Crystal Structure Of Pro-s324a/l349a
3VV2 Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro
3VHQ Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin
2ZWP Crystal Structure Of Ca3 Site Mutant Of Pro-S324a
...
SNP
Amino Acid Substitutions
esearch -db gene -query "OPN1MW [PREF] AND human [ORGN]" |
elink -target snp | efilter -class missense |
efetch -format docsum |
xtract -set Set -rec Rec -pattern DocumentSummary \
-wrp Id -element Id -rst -hgvs DOCSUM |
xtract -pattern Rec -pfx "rs" -RSID Id \
-group Protein/Missense -block Variant -plg "\n" \
-element "&RSID" Accession Inserted Position |
sort -t $'\t' -k 2,2 -k 4,4n -k 3,3f -k 1.3n | uniq |
while read rsid accn res pos
do
if [ "$accn" != "$last" ]
then
seq=$( efetch -db protein -id "$accn" -format gpc < /dev/null |
xtract -pattern INSDSeq -lower INSDSeq_sequence )
last="$accn"
fi
echo ">$rsid [$accn $res@$pos]"
echo "${seq:0:$pos-1}$res${seq:$pos}" | fold -w 50
done
>rs1238141906 [NP_000504.1 K@41]
maqqwslqrlagrhpqdsyedstqssiftytnsnstrgpfKgpnyhiapr
wvyhltsvwmifvviasvftnglvlaatmkfkklrhplnwilvnlavadl
aetviastisvvnqvygyfvlghpmcvlegytvslcgitglwslaiiswe
...
Sequences Flanking SNPs
#!/bin/bash -norc
# For refSNP 268, print the 50 bases on either side of the variant for each
# allele placed on an "NC_" accession whose inserted sequence differs from the
# deleted sequence. One row per allele is produced by the xtract stage, then
# sorted and de-duplicated before the loop reads it.
efetch -db snp -id 268 -format json |
transmute -j2x -set - -rec RS |
xtract -pattern RS -pfx "rs" -RSID RS/refsnp_id \
-group placements_with_allele \
-block allele -if seq_id -starts-with "NC_" \
-and inserted_sequence -differs-from deleted_sequence \
-element "&RSID" seq_id deleted_sequence \
inserted_sequence -tab "\n" -inc position |
sort -t $'\t' -k 2,2 -k 5,5n -k 4,4f -k 1.3n | uniq |
while read rsid accn del ins pos
do
# Left flank: positions pos-50 through pos-1, FASTA header removed and lines
# joined. stdin is taken from /dev/null so efetch cannot consume the rows the
# enclosing loop is still reading.
lft=$(efetch -db nuccore -format fasta -id "$accn" \
-seq_start "$((pos-50))" -seq_stop "$((pos-1))" < /dev/null |
grep -v '>' | tr -d '\n')
# ad and sb are the lengths of the inserted and deleted sequences; their
# difference shifts where the right flank starts.
ad=${#ins}
sb=${#del}
# Right flank: the 50 positions starting at pos+ad-sb+1.
rgt=$(efetch -db nuccore -format fasta -id "$accn" \
-seq_start "$((pos+ad-sb+1))" -seq_stop "$((pos+ad-sb+50))" < /dev/null |
grep -v '>' | tr -d '\n')
# Report the variant, then both flanks, then a blank separator line.
echo "$rsid $accn $pos $del->$ins"
echo "5': $lft"
echo "3': $rgt"
echo ""
done
rs268 NC_000008.10 19813529 A->G
5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC
rs268 NC_000008.11 19956018 A->G
5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC
EXTERNAL
JSON Nested Array Expansion
for ns in flat recurse plural depth
do
echo " $ns"
echo
nquire -get "http://mygene.info/v3" gene 2652 |
transmute -j2x -set - -rec GeneRec -nest "$ns" |
grep position | head -n 4
echo
done
"position": [
[
154182595,
154182789
],
[
154187769,
154188066
],
flat
<position>154182595</position>
<position>154182789</position>
<position>154187769</position>
<position>154188066</position>
recurse
<position>
<position>154182595</position>
<position>154182789</position>
</position>
plural
<positions>
<position>154182595</position>
<position>154182789</position>
</positions>
depth
<position>
<position_1>154182595</position_1>
<position_1>154182789</position_1>
</position>
Exon Interval Sets
nquire -get "http://mygene.info/v3/gene/2652" |
transmute -j2x -set - -rec GeneRec -nest plural |
xtract -pattern GeneRec -group exons -lbl "" -clr \
-block positions -pfc "\n" -sep ".." -tab "\n" -element position
154182595..154182789
154187769..154188066
154190053..154190222
154191687..154191853
154193407..154193647
154195929..154196861
154219733..154219927
154224907..154225204
...
Heterogeneous Object Names
nquire -get "http://mygene.info/v3/gene/2652" |
transmute -j2x -set - -rec GeneRec |
xtract -pattern GeneRec -group "pathway/*" -pfx "\n" -element "?,name,id"
<pathway>
<reactome>
<id>R-HSA-162582</id>
<name>Signal Transduction</name>
</reactome>
...
<wikipathways>
<id>WP455</id>
<name>GPCRs, Class A Rhodopsin-like</name>
</wikipathways>
</pathway>
reactome Signal Transduction R-HSA-162582
reactome Disease R-HSA-1643685
reactome The retinoid cycle in cones (daylight vision) R-HSA-2187335
reactome Visual phototransduction R-HSA-2187338
reactome Retinoid cycle disease events R-HSA-2453864
reactome Diseases associated with visual transduction R-HSA-2474795
reactome Signaling by GPCR R-HSA-372790
reactome Class A/1 (Rhodopsin-like receptors) R-HSA-373076
reactome GPCR downstream signalling R-HSA-388396
reactome G alpha (i) signalling events R-HSA-418594
reactome Opsins R-HSA-419771
reactome GPCR ligand binding R-HSA-500792
reactome Diseases of signal transduction R-HSA-5663202
wikipathways GPCRs, Class A Rhodopsin-like WP455
XML Namespace Prefixes
nquire -url "http://webservice.wikipathways.org" getPathway -pwId WP455 |
xtract -pattern "ns1:getPathwayResponse" -decode ":gpml" |
xtract -pattern Pathway -block Xref \
-if @Database -equals "Entrez Gene" \
-tab "\n" -element @ID |
sort -n
134
135
136
140
146
...
LOCAL ARCHIVE
Entrez Indexing
efetch -db pubmed -id 12857958,2981625 -format xml |
xtract -e2index |
xtract -pattern IdxDocument -UID IdxUid \
-block NORM -pfc "\n" -element "&UID",NORM,"@pos"
12857958 allow 205
12857958 assays 147
12857958 binding 146
12857958 braid 187,215
12857958 braiding 153
...
Author Frequency
esearch -db pubmed -query "rattlesnake phospholipase" |
efetch -format uid | fetch-pubmed |
xtract -pattern PubmedArticle -block Author \
-sep " " -tab "\n" -element LastName,Initials |
sort-uniq-count-rank
40 Marangoni S
33 Toyama MH
28 Soares AM
25 Bon C
...
Author Counts
esearch -db pubmed -query "conotoxin" |
efetch -format uid | fetch-pubmed |
xtract -pattern PubmedArticle -num Author |
sort-uniq-count -n |
reorder-columns 2 1 |
head -n 15 |
tee /dev/tty |
xy-plot auth.png
0 11
1 193
2 854
3 844
4 699
5 588
6 439
7 291
8 187
9 124
10 122
11 58
12 33
13 18
900 +
| ********
800 + * **
| * *
700 + * ***
| * **
600 + * *
| * ***
500 + * **
| * ***
400 + * **
| * *
300 + * ***
| * *
200 + * ******
| * *********
100 + ** *
| * **********
0 + * ******
+---------+---------+---------+---------+---------+---------+---------+
0 2 4 6 8 10 12 14
Title and Abstract Word Counts
esearch -db pubmed -query "conotoxin" -pub structured |
efetch -format uid | fetch-pubmed |
xtract -stops -wrp "Set,Rec" \
-pattern PubmedArticle -wrp "PMID" -element MedlineCitation/PMID \
-wrp "Titl" -words ArticleTitle \
-block Abstract/AbstractText -wrp "Grp,Abst" -words AbstractText |
xtract -pattern Rec -element PMID -num Titl -block Grp -tab ", " -num Abst
29194563 21 63, 84, 89, 26
28882644 23 87, 34, 115, 25
28877214 10 12, 42, 315, 94
28825343 15 169
28482835 9 75, 123, 42, 37
28479398 15 170, 130
...
Verbosity Per Year
esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
efetch -format uid | stream-pubmed | gunzip -c |
xtract -stops -wrp Set,Rec -pattern PubmedArticle \
-wrp "Year" -year "PubDate/*" \
-wrp "Abst" -words Abstract/AbstractText |
xtract -wrp Set,Pub -pattern Rec \
-wrp "Year" -element Year \
-wrp "Num" -num Abst > countsByYear.xml
for yr in {1960..2020}
do
cat countsByYear.xml |
xtract -wrp Raw -pattern Pub -select Year -eq "$yr" |
xtract -pattern Raw -lbl "$yr" -avg Num
done |
tee /dev/tty |
xy-plot verbosity.png
rm countsByYear.xml
Appending Metadata
esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
efetch -format uid | fetch-pubmed > pnas.xml
cat pnas.xml |
xtract -stops -wrp Set,Rec -pattern PubmedArticle \
-wrp ID -element MedlineCitation/PMID \
-wrp Abst -words Abstract/AbstractText |
<Set>
<Rec><ID>31822623</ID> <Abst>foxp3</Abst><Abst>cd4</Abst><Abst>regulatory...</Abst></Rec>
...
xtract -pattern Rec -element ID -wrp Num -num Abst > counts.txt
31822623 <Num>243</Num>
31822622 <Num>132</Num>
31822621 <Num>252</Num>
31822620 <Num>238</Num>
...
xtract -input pnas.xml -wrp PubmedArticleSet -pattern PubmedArticle \
-select MedlineCitation/PMID -appending counts.txt > merged.xml
LOCAL INDEX
Histogram Shortcut
cat $EDIRECT_PUBMED_MASTER/Current/*.xml |
xtract -timer -pattern PubmedArticle -histogram PubDate/Month
26 8
37 9
121475 01
114579 02
111137 03
109794 04
120169 05
130062 06
125107 07
126246 08
123191 09
120957 10
109657 11
110854 12
1958892 Apr
1809730 Aug
2086169 Dec
1844717 Feb
1851803 Jan
1784258 Jul
2015942 Jun
1943325 Mar
1815691 May
1889194 Nov
2035632 Oct
1 October
1956569 Sep
Month Format Per Year
cat $EDIRECT_PUBMED_MASTER/Current/*.xml |
xtract -wrp Set,Rec -pattern PubmedArticle \
-if PubDate/Month -wrp YR -year "PubDate/*" -wrp MN -len PubDate/Month |
xtract -wrp Set,Rec -pattern Rec \
-pfx "<DT>" -sep "+-" -sfx "-</DT>" -element YR,MN |
xtract -pattern Rec -histogram DT |
reorder-columns 2 1 | tr '+' '\t' |
sed -e 's/-3-/1/g' -e 's/-2-/2/g' -e 's/-1-/3/g' -e 's/-[0-9]-/4/g' |
sort -k 1,1n -k 2,2n > rawMonthCounts.txt
# Pivot rawMonthCounts.txt into one row per distinct first-column value, with
# the third column of the rows for each code 1-4 joined on as a further column.
result=$( cut -f 1 rawMonthCounts.txt | uniq )
for i in {1..4}
do
  # grep does not read "\t" as a tab, so real tab characters are supplied
  # with $'\t' to match the code only when it is the whole second column.
  current=$( grep $'\t'"$i"$'\t' rawMonthCounts.txt | cut -f 1,3 )
  # -a 1 keeps rows of the running table that have no entry for this code.
  result=$(join -a 1 -t $'\t' <(printf '%s\n' "$result") <(printf '%s\n' "$current"))
done
printf '%s\n' "$result" > plotme.txt
cat plotme.txt | xy-plot
Phrase Query Automation
# Runs a counting phrase search for a MeSH tree number and each of its
# ancestors, from the given code up to its top-level node.
# Arguments: $1 - dotted tree number such as "C01.925.782.417.415"; one
#                 trailing "*" is ignored
# Outputs:   whatever "phrase-search -count" prints for each "<code>* [TREE]"
ascend_mesh_tree() {
  local var="${1%\*}"
  while :
  do
    phrase-search -count "$var* [TREE]"
    case "$var" in
      # Drop the last dotted component whatever its length, so every pass
      # shortens the code and the loop always reaches the dot-free root.
      *.* ) var="${var%.*}" ;;
      * ) break ;;
    esac
  done
}
ascend_mesh_tree "C01.925.782.417.415"
5598 c01 925 782 417 415*
28400 c01 925 782 417*
658188 c01 925 782*
928201 c01 925*
2639368 c01*
Medical Subject Heading Code Viewers
https://meshb.nlm.nih.gov/treeView
https://meshb-prev.nlm.nih.gov/treeView
MISCELLANEOUS
Indexed Fields
einfo -db pubmed |
xtract -pattern Field \
-if IsDate -equals Y -and IsHidden -equals N \
-pfx "[" -sep "]\t" -element Name,FullName |
sort -t $'\t' -k 2f
[CDAT] Date - Completion
[CRDT] Date - Create
[EDAT] Date - Entrez
[MHDA] Date - MeSH
[MDAT] Date - Modification
[PDAT] Date - Publication
Pseudocode Prototype
for each PubmedArticle {
for each Author {
print Initials LastName
}
for each MeshHeading {
print DescriptorName
for each QualifierName {
print QualifierName
}
}
}
xtract -pattern PubmedArticle \
-block Author -element Initials LastName \
-block MeshHeading -element DescriptorName \
-subset QualifierName -element QualifierName
Processing in Groups
...
efetch -format acc |
join-into-groups-of 200 |
xargs -n 1 sh -c 'epost -db nuccore -format acc -id "$0" |
efetch -format gb'
|