summaryrefslogtreecommitdiff
path: root/hlp-xtract.txt
blob: df9944d7f26e6e08355d2c32d947879750651ba1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
PUBMED

First Authors

  efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml |
  xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \
    -block Author -position first -sep " " -element Initials,LastName \
    -block Article -element ArticleTitle

  6271474    5    MJ Casadaban     Tn3: transposition and control.
  5685784    2    RK Mortimer      Suppressors and suppressible mutations in yeast.
  4882854    2    ED Garber        Proteins and enzymes as taxonomic tools.
  6243420    1    NR Cozzarelli    DNA gyrase and the supercoiling of DNA.

Formatted Authors

  efetch -db pubmed -id 1413997,6301692,781293 -format xml |
  xtract -pattern PubmedArticle -element MedlineCitation/PMID \
    -block PubDate -sep "-" -element Year,Month,MedlineDate \
    -block Author -sep " " -tab "" \
      -element "&COM" Initials,LastName -COM "(|)" |
  perl -pe 's/(\t[^\t|]*)\|([^\t|]*)$/$1 and $2/; s/\|([^|]*)$/, and $1/; s/\|/, /g'

  1413997    1992-Oct    RK Mortimer, CR Contopoulou, and JS King
  6301692    1983-Apr    MA Krasnow and NR Cozzarelli
  781293     1976-Jul    MJ Casadaban

Medical Subject Headings

  efetch -db pubmed -id 6092233,2539356,1937004 -format xml |
  xtract -pattern PubmedArticle -element MedlineCitation/PMID \
    -block MeshHeading \
      -subset DescriptorName -pfc "\n" -sep "|" -element @MajorTopicYN,DescriptorName \
      -subset QualifierName -pfc " / " -sep "|" -element @MajorTopicYN,QualifierName |
  sed -e 's/N|//g' -e 's/Y|/*/g'

  6092233
  Base Sequence
  DNA Restriction Enzymes
  DNA, Fungal / genetics / *isolation & purification
  *Genes, Fungal
  ...

Book Authors and Editors

  efetch -db pubmed -id 21433338 -format xml |
  xtract -pattern PubmedBookArticle \
    -path BookDocument.AuthorList.Author -element LastName \
    -path BookDocument.Book.AuthorList.Author -element LastName

  Fauci    Desrosiers    Coffin    Hughes    Varmus

Heterogeneous Data

  efetch -db pubmed -id 21433338,17247418 -format xml |
  xtract -pattern "PubmedArticleSet/*" \
    -group "Book/AuthorList" -element LastName \
    -group "Article/AuthorList" -element LastName

  Coffin       Hughes     Varmus
  Lederberg    Cavalli    Lederberg

Multiple Links

  esearch -db pubmed -query "conotoxin AND dopamine [MAJR]" |
  elink -target protein -cmd neighbor |
  xtract -pattern LinkSet -if Link/Id -element IdList/Id Link/Id

  28666811    17105332    9506485
  23624852    17105332
  14657161    27532980    27532978    19424304
  12944511    31542395    17105332

Link Counts

  elink -db protein -id NP_000509 -target pubmed |
  elink -target protein -cmd neighbor |
  xtract -wrp "Set,Rec" -pattern LinkSet \
    -wrp "Uid" -element IdList/Id -wrp "Count" -num Link/Id |
  xtract -pattern Rec -if Count -ge 50 -element Uid Count

  32296183    17997
  19372376    57

Markup Correction

  for id in 8475897 8988608 9698410 10194376 15949988 16271163 17282049 \
    19793852 20968289 21892341 22106757 22785267 22360335 23095895 23095897 \
    25435818 26433210 27672066 28635620 29547395 29869631 29869640 29944225
  do
    efetch -db pubmed -format xml -id "$id" |
    xtract -pattern PubmedArticle -plg "\n\n" -sep "\n\n" -tab "\n\n" \
      -element MedlineCitation/PMID ArticleTitle Abstract/AbstractText
  done

XML Normalization

  echo "assembly 443538 Stat biosample 3737421 SampleData gene 5053 Summary medgen 162753 Name" |
  xargs -n 3 sh -c 'efetch -db "$0" -id "$1" -format docsum |
  xtract -pattern DocumentSummary -sep " | " -tab " - " -ret "\n\n" -lbl "$0" -element Id "$2"'

Record Counts

  echo "diphtheria measles pertussis polio tuberculosis" |
  xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" |
  efilter -days 365 -datetype PDAT |
  xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count'

  diphtheria      20
  measles         213
  pertussis       69
  polio           76
  tuberculosis    1787

Citation Lookup

  esearch -db pubmed -query "Beadle GW [AUTH] AND Tatum EL [AUTH]" |
  elink -cited |
  efilter -days 365 |
  efetch -format abstract

Stopwords and Stemming

  pm=$( efetch -db pubmed -id 2005826 -format xml )
  echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle
  echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle
  echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle
  echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle

DOI Extraction

  esearch -db pubmed -query "Rowley JD [AUTH]" |
  efetch -format xml |
  xtract -pattern PubmedArticle \
    -block ArticleId -if @IdType -equals doi \
      -doi ArticleId

Combining Independent Queries

  esearch -db protein -query "amyloid* [PROT]" |
  elink -target pubmed -label prot_cit |
  esearch -db gene -query "apo* [GENE]" |
  elink -target pubmed -label gene_cit |
  esearch -query "(#prot_cit) AND (#gene_cit)" |
  efetch -format docsum |
  xtract -pattern DocumentSummary -element Id Title |
  cat -v

PMC

Formatting Tag Removal

  efetch -db pmc -id 4729119 -format xml |
  xtract -mixed -pattern article -group p \
    -position first -tab "\n\n" -element p -plain p |
  fold -w 70 -s | awk '{$1=$1};1'

  The intestinal cells of <italic>Caenorhabditis elegans</italic> are
  filled with heterogeneous granular organelles that are associated
  with specific organ functions. The best studied of these organelles
  ...

  The intestinal cells of Caenorhabditis elegans are filled with
  heterogeneous granular organelles that are associated with specific
  organ functions. The best studied of these organelles are lipid
  ...

SEQUENCE

Peptide Sequences

  esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" |
  efetch -format gpc |
  xtract -insd complete mat_peptide "%peptide" product mol_wt peptide |
  grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8

  ADB43131.1    15    conotoxin Cal 1b      1708    LCCKRHHGCHPCGRT
  ADB43128.1    16    conotoxin Cal 5.1     1829    DPAPCCQHPIETCCRR
  AIC77105.1    17    conotoxin Lt1.4       1705    GCCSHPACDVNNPDICG
  ADB43129.1    18    conotoxin Cal 5.2     2008    MIQRSQCCAVKKNCCHVG
  ADD97803.1    20    conotoxin Cal 1.2     2206    AGCCPTIMYKTGACRTNRCR
  AIC77085.1    21    conotoxin Bt14.8      2574    NECDNCMRSFCSMIYEKCRLK
  ADB43125.1    22    conotoxin Cal 14.2    2157    GCPADCPNTCDSSNKCSPGFPG
  AIC77154.1    23    conotoxin Bt14.19     2578    VREKDCPPHPVPGMHKCVCLKTC

Vitamin Biosynthesis

  esearch -db pubmed -query "lycopene cyclase" |
  elink -related |
  elink -target protein |
  efilter -organism rodents -source refseq |
  efetch -format docsum |
  xtract -pattern DocumentSummary -element AccessionVersion Title |
  grep -i carotene | sort -V

  NP_001346539.1    beta,beta-carotene 9',10'-oxygenase isoform 2 [Mus musculus]
  NP_573480.1       beta,beta-carotene 9',10'-oxygenase isoform 1 [Mus musculus]
  NP_446100.2       beta,beta-carotene 15,15'-dioxygenase [Rattus norvegicus]
  NP_001121184.1    beta,beta-carotene 9',10'-oxygenase [Rattus norvegicus]
  NP_001156500.1    beta,beta-carotene 15,15'-dioxygenase isoform 2 [Mus musculus]
  NP_067461.2       beta,beta-carotene 15,15'-dioxygenase isoform 1 [Mus musculus]

Coding Sequences

  efetch -db nuccore -id J01636.1 -format gbc |
  xtract -insd CDS gene sub_sequence

  J01636.1    lacI    GTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCG...
  J01636.1    lacZ    ATGACCATGATTACGGATTCACTGGCCGTCGTTTTACAAC...
  J01636.1    lacY    ATGTACTATTTAAAAAACACAAACTTTTGGATGTTCGGTT...
  J01636.1    lacA    TTGAACATGCCAATGACCGAAAGAATAAGAGCAGGCAAGC...

Sequence Subregion

  efetch -db nuccore -id U54469 -format gbc |
  xtract -pattern INSDSeq -nucleic INSDSeq_sequence[2881:1] |
  fold -w 60

  CCGGTTTTAATGTAGGTTTTTATTAATATACTTTTCCGTCTAATCCATTATTGACAGTGA
  CTACAAAAAGCGGATAGATTTTATATTATGCCGATTTTTGATAACAAAGGGGGTTCCGTT
  TCGGTTTCGTTACGCGGGTCTTAGACAATAGTCACGATTAATCGCTACTGTTGCTTATAA
  ...

3'UTR Sequences

  #!/bin/bash -norc

  # ThreePrimeUTRs - print the sequence downstream of each CDS in FASTA form.
  #   Input:   INSDSeq XML on stdin (e.g. from "efetch -format gbc")
  #   Output:  for every CDS feature, either
  #              >ACCESSION 3'UTR: FROM..TO
  #              followed by the sequence after the last CDS interval,
  #              folded at 50 columns, or
  #              >ACCESSION NO 3'UTR
  #            when the CDS extends to the end of the sequence
  ThreePrimeUTRs() {
    local acc pos seq
    xtract -pattern INSDSeq -ACC INSDSeq_accession-version -SEQ INSDSeq_sequence \
      -block INSDFeature -if INSDFeature_key -equals CDS \
        -pfc "\n" -element "&ACC" -rst -last INSDInterval_to -element "&SEQ" |
    while read -r acc pos seq
    do
      # A blank or malformed line has no numeric position; skip it instead
      # of letting the integer comparison below print an error.
      case "$pos" in
        ''|*[!0-9]*) continue ;;
      esac
      if [ "$pos" -lt "${#seq}" ]
      then
        # pos is the 1-based end of the CDS, so the UTR starts at pos+1,
        # which is offset pos in the 0-based substring expansion
        printf '%s\n' ">$acc 3'UTR: $((pos+1))..${#seq}"
        fold -w 50 <<< "${seq:$pos}"
      else
        printf '%s\n' ">$acc NO 3'UTR"
      fi
    done
  }

  esearch -db nuccore -query "5.5.1.19 [ECNO]" |
  efilter -molecule mrna -source refseq |
  efetch -format gbc | ThreePrimeUTRs

  >NM_001328461.1 3'UTR: 1737..1871
  gatgaatatagagttactgtgttgtaagctaatcatcatactgatgcaag
  tgcattatcacatttacttctgctgatgattgttcataagattatgagtt
  agccatttatcaaaaaaaaaaaaaaaaaaaaaaaa
  >NM_001316759.1 3'UTR: 1628..1690
  atccgagtaattcggaatcttgtccaattttatatagcctatattaatac
  ...

Amino Acid Composition

  #!/bin/bash -norc

  # Three-letter residue abbreviations, indexed by the position of the
  # one-letter code in the alphabet (a=0, b=1, ... z=25).
  abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \
           Xle Lys Leu Met Asn Pyl Pro Gln Arg \
           Ser Thr Sec Val Trp Xxx Tyr Glx )

  # AminoAcidComp - expand per-letter counts into a full 26-row table.
  #   Globals: abbrev (read)
  #   $1:      newline-separated "COUNT LETTER" pairs, LETTER in a..z
  #   Output:  26 lines of "Abbrev<TAB>Count", sorted by abbreviation;
  #            letters absent from $1 are reported with a count of 0
  AminoAcidComp() {
    local -a count=()
    local num lttr idx i
    while read -r num lttr
    do
      # Empty input still delivers one blank line through the here-string;
      # ignore it, and anything else that is not a single a..z letter,
      # rather than indexing the array with a negative offset.
      [ -n "$lttr" ] || continue
      printf -v idx '%d' "'$lttr"
      (( idx >= 97 && idx <= 122 )) || continue
      count[idx-97]="$num"
    done <<< "$1"
    for i in {0..25}
    do
      printf '%s\t%s\n' "${abbrev[i]}" "${count[i]-0}"
    done |
    sort
  }

  # AminoAcidJoin - build a side-by-side amino acid composition table.
  #   Input:   lines of "ACCESSION SEQUENCE GENE" on stdin
  #   Output:  a "GENE" header row naming each input gene, followed by one
  #            row per residue abbreviation with one count column per gene;
  #            rows containing no non-zero digit are dropped
  #   Uses:    AminoAcidComp and the sort-uniq-count helper script
  AminoAcidJoin() {
    local acc seq gene comp current result=""
    while read -r acc seq gene
    do
      # one "count letter" pair per distinct residue in this sequence
      comp=$(tr A-Z a-z <<< "$seq" | sed 's/[^a-z]//g' | fold -w 1 | sort-uniq-count)
      current=$(AminoAcidComp "$comp")
      # printf keeps any backslashes in the data literal (echo -e would not)
      current=$(printf 'GENE\t%s\n%s' "$gene" "$current")
      if [ -n "$result" ]
      then
        # every table has the same keys in the same order, so join pairs
        # the rows and appends the new gene as another column
        result=$(join -t $'\t' <(printf '%s\n' "$result") <(printf '%s\n' "$current"))
      else
        result=$current
      fi
    done
    grep -e "GENE" -e "[1-9]" <<< "$result"
  }

  ids="NP_001172026,NP_000509,NP_004001,NP_001243779"
  efetch -db protein -id "$ids" -format gpc |
  xtract -insd INSDSeq_sequence CDS gene |
  AminoAcidJoin

  GENE    INS    HBB    DMD    TTN
  Ala     10     15     210    2084
  Arg     5      3      193    1640
  Asn     3      6      153    1111
  Asp     2      7      185    1720
  Cys     6      2      35     513
  Gln     7      3      301    942
  Glu     8      8      379    3193
  Gly     12     13     104    2066
  His     2      9      84     478
  Ile     2      0      165    2062
  Leu     20     18     438    2117
  Lys     2      11     282    2943
  Met     2      2      79     398
  Phe     3      8      77     908
  Pro     6      7      130    2517
  Ser     5      5      239    2463
  Thr     3      7      194    2546
  Trp     2      2      67     466
  Tyr     4      3      61     999
  Val     6      18     186    3184

GENE

Chromosome Assignments

  esearch -db gene -query "calmodulin * [PFN] AND mammalia [ORGN]" |
  efetch -format docsum |
  xtract -pattern DocumentSummary \
    -def "-" -element Id Name MapLocation ScientificName |
  head -n 30

  801       CALM1    14q32.11     Homo sapiens
  808       CALM3    19q13.32     Homo sapiens
  805       CALM2    2p21         Homo sapiens
  24242     Calm1    6q32         Rattus norvegicus
  12313     Calm1    12 E         Mus musculus
  326597    CALM     -            Bos taurus
  50663     Calm2    6q12         Rattus norvegicus
  24244     Calm3    1q21         Rattus norvegicus
  12315     Calm3    7 9.15 cM    Mus musculus
  12314     Calm2    17 E4        Mus musculus
  617095    CALM1    -            Bos taurus
  396838    CALM3    6            Sus scrofa
  ...

Genome Range

  esearch -db gene -query "Homo sapiens [ORGN] AND Y [CHR]" |
  efilter -status alive | efetch -format docsum |
  xtract -pattern DocumentSummary -NAME Name -DESC Description \
    -block GenomicInfoType -if ChrLoc -equals Y \
      -min ChrStart,ChrStop -element "&NAME" "&DESC" |
  sort -k 1,1n | cut -f 2- |
  grep -v pseudogene | grep -v uncharacterized |
  between-two-genes ASMT IL3RA |
  align-columns -g 4

  IL3RA        interleukin 3 receptor subunit alpha
  SLC25A6      solute carrier family 25 member 6
  LINC00106    long intergenic non-protein coding RNA 106
  ASMTL-AS1    ASMTL antisense RNA 1
  ASMTL        acetylserotonin O-methyltransferase-like
  P2RY8        purinergic receptor P2Y8
  AKAP17A      A-kinase anchoring protein 17A
  ASMT         acetylserotonin O-methyltransferase

Centromere Position

  nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
  grep acen | cut -f 1,2,6,7 | grep "^X"

  X    p    58100001    61000000
  X    q    61000001    63800000

Gene Regions

  esearch -db gene -query "DDT [GENE] AND mouse [ORGN]" |
  efetch -format docsum |
  xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop |
  xargs -n 3 sh -c 'efetch -db nuccore -format gb \
    -id "$0" -chr_start "$1" -chr_stop "$2"'

  LOCUS       NC_000076               2142 bp    DNA     linear   CON 09-FEB-2015
  DEFINITION  Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
  ACCESSION   NC_000076 REGION: complement(75771233..75773374) GPC_000000783
  VERSION     NC_000076.6
  ...
  FEATURES             Location/Qualifiers
       source          1..2142
                       /organism="Mus musculus"
                       /mol_type="genomic DNA"
                       /strain="C57BL/6J"
                       /db_xref="taxon:10090"
                       /chromosome="10"
       gene            1..2142
                       /gene="Ddt"
       mRNA            join(1..159,462..637,1869..2142)
                       /gene="Ddt"
                       /product="D-dopachrome tautomerase"
                       /transcript_id="NM_010027.1"
       CDS             join(52..159,462..637,1869..1941)
                       /gene="Ddt"
                       /codon_start=1
                       /product="D-dopachrome decarboxylase"
                       /protein_id="NP_034157.1"
                       /translation="MPFVELETNLPASRIPAGLENRLCAATATILDKPEDRVSVTIRP
                       GMTLLMNKSTEPCAHLLVSSIGVVGTAEQNRTHSASFFKFLTEELSLDQDRIVIRFFP
                       ...

Recursive Data

  esearch -db gene -query "rbcL [GENE] AND maize [ORGN]" |
  efetch -format xml |
  xtract -pattern Entrezgene -block "**/Gene-commentary" \
    -if Gene-commentary_type@value -equals genomic \
      -tab "\n" -element Gene-commentary_accession |
  sort | uniq

  NC_001666
  X86563
  Z11973

Genes in Pathways

  esearch -db gene -query "PAH [GENE]" -organism human |
  elink -target biosystems |
  efilter -pathway wikipathways |
  elink -target gene |
  efetch -format docsum |
  xtract -pattern DocumentSummary -element Name Id Description |
  grep -v pseudogene | grep -v uncharacterized |
  sort -f

  AANAT      15     aralkylamine N-acetyltransferase
  ACADM      34     acyl-CoA dehydrogenase medium chain
  ACHE       43     acetylcholinesterase (Cartwright blood group)
  ADCYAP1    116    adenylate cyclase activating polypeptide 1
  ...

Gene Products

  for sym in HBB BRCA2 CFTR RAG1
  do
    esearch -db gene -query "$sym [GENE] AND human [ORGN]" |
    efilter -query "alive [PROP]" | efetch -format docsum |
    xtract -pattern GenomicInfoType \
      -element ChrAccVer ChrStart ChrStop |
    while read acc str stp
    do
      efetch -db nuccore -format gbc \
        -id "$acc" -chr_start "$str" -chr_stop "$stp" |
      xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" \
        gene "%transcription" "%translation" \
        product transcription translation |
      grep -i $'\t'"$sym"$'\t'
    done
  done

  NC_000011.10    mRNA    3     HBB    626      hemoglobin, beta                     ACATTTGCTT...
  NC_000011.10    CDS     3     HBB    147      hemoglobin subunit beta              MVHLTPEEKS...
  NC_000023.11    mRNA    78    DMD    13805    dystrophin, transcript variant X2    AGGAAGATGA...
  NC_000023.11    mRNA    77    DMD    13794    dystrophin, transcript variant X6    ACTTTCCCCC...
  NC_000023.11    mRNA    77    DMD    13800    dystrophin, transcript variant X5    ACTTTCCCCC...
  NC_000023.11    mRNA    77    DMD    13785    dystrophin, transcript variant X7    ACTTTCCCCC...
  NC_000023.11    mRNA    74    DMD    13593    dystrophin, transcript variant X8    ACTTTCCCCC...
  NC_000023.11    mRNA    75    DMD    13625    dystrophin, transcript variant X9    ACTTTCCCCC...
  ...

Unfiltered Gene Lookup

  for sym in ATP6 ATP7B CBD DMD HFE PAH PRNP TTN
  do
    esearch -db gene -query "$sym [GENE]" -organism human |
    efetch -format docsum |
    xtract -pattern DocumentSummary -def "-" -lbl "${sym}" \
      -element NomenclatureSymbol Id Description CommonName
  done

  ATP6      MT-ATP6    4508         ATP synthase F0 subunit 6           human
  ATP6      -          6775074      ATP synthase F0 subunit 6           Neandertal
  ATP6      -          8923188      ATP synthase F0 subunit 6           Denisova hominin
  CBD       OPN1MW     2652         opsin 1, medium wave sensitive      human
  HBB       HBB        3043         hemoglobin subunit beta             human
  HBB       KRT89P     85344        keratin 89 pseudogene               human
  OPN1MW    OPN1MW     2652         opsin 1, medium wave sensitive      human
  OPN1MW    OPN1MW3    101060233    opsin 1, medium wave sensitive 3    human

Protein Coding Genes

  for sym in MT-ATP6 BRCA2 CFTR HBB HFE IL9R OPN1MW PAH
  do
    esearch -db gene -query "$sym [PREF]" -organism human |
    efetch -format docsum |
    xtract -pattern DocumentSummary -def "-" \
      -lbl "${sym}" -element Id Chromosome Description
  done |
  ...

  MT-ATP6    4508    MT      ATP synthase F0 subunit 6
  BRCA2      675     13      BRCA2 DNA repair associated
  CFTR       1080    7       CF transmembrane conductance regulator
  HBB        3043    11      hemoglobin subunit beta
  HFE        3077    6       homeostatic iron regulator
  IL9R       3581    X, Y    interleukin 9 receptor
  OPN1MW     2652    X       opsin 1, medium wave sensitive
  PAH        5053    12      phenylalanine hydroxylase

Common Pathways

  ...
  while IFS=$'\t' read sym uid chr desc
  do
    elink -db gene -id "$uid" -target biosystems |
    efilter -kind pathway |
    efetch -format docsum |
    xtract -pattern DocumentSummary -lbl "${sym}" \
      -lower source -element externalid biosystemname
  done |
  sort -t $'\t' -k 2,2 -k 3,3 -k 1,1 |
  awk 'a[$3]++{ if(a[$3]==2){ print b }; print $0}; {b=$0}'

  MT-ATP6    kegg        hsa01100         Metabolic pathways
  PAH        kegg        hsa01100         Metabolic pathways
  HBB        reactome    R-HSA-1430728    Metabolism
  MT-ATP6    reactome    R-HSA-1430728    Metabolism
  PAH        reactome    R-HSA-1430728    Metabolism
  CFTR       reactome    R-HSA-162582     Signal Transduction
  OPN1MW     reactome    R-HSA-162582     Signal Transduction
  ...

TAXONOMY

Taxonomic Names

  esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" |
  efetch -format docsum |
  xtract -pattern DocumentSummary -if CommonName \
    -element Id ScientificName CommonName

  57486    Mus musculus molossinus    Japanese wild mouse
  39442    Mus musculus musculus      eastern European house mouse
  35531    Mus musculus bactrianus    southwestern Asian house mouse
  10092    Mus musculus domesticus    western European house mouse
  10091    Mus musculus castaneus     southeastern Asian house mouse
  10090    Mus musculus               house mouse
  9838     Camelus dromedarius        Arabian camel
  9837     Camelus bactrianus         Bactrian camel

STRUCTURE

Structural Similarity

  esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" |
  elink -related |
  efilter -query "archaea [ORGN]" |
  efetch -format docsum |
  xtract -pattern DocumentSummary \
    -if PdbClass -equals Hydrolase \
      -element PdbAcc PdbDescr

  3WIV    Crystal Structure Of Pro-s324a/d356a
  3WIU    Crystal Structure Of Pro-s324a/l349a
  3VV2    Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro
  3VHQ    Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin
  2ZWP    Crystal Structure Of Ca3 Site Mutant Of Pro-S324a
  ...

SNP

Amino Acid Substitutions

  esearch -db gene -query "OPN1MW [PREF] AND human [ORGN]" |
  elink -target snp | efilter -class missense |
  efetch -format docsum |
  xtract -set Set -rec Rec -pattern DocumentSummary \
    -wrp Id -element Id -rst -hgvs DOCSUM |
  xtract -pattern Rec -pfx "rs" -RSID Id \
    -group Protein/Missense -block Variant -plg "\n" \
      -element "&RSID" Accession Inserted Position |
  sort -t $'\t' -k 2,2 -k 4,4n -k 3,3f -k 1.3n | uniq |
  while read rsid accn res pos
  do
    if [ "$accn" != "$last" ]
    then
      seq=$( efetch -db protein -id "$accn" -format gpc < /dev/null |
             xtract -pattern INSDSeq -lower INSDSeq_sequence )
      last="$accn"
    fi
    echo ">$rsid [$accn $res@$pos]"
    echo "${seq:0:$pos-1}$res${seq:$pos}" | fold -w 50
  done

  >rs1238141906 [NP_000504.1 K@41]
  maqqwslqrlagrhpqdsyedstqssiftytnsnstrgpfKgpnyhiapr
  wvyhltsvwmifvviasvftnglvlaatmkfkklrhplnwilvnlavadl
  aetviastisvvnqvygyfvlghpmcvlegytvslcgitglwslaiiswe
  ...

Sequences Flanking SNPs

  #!/bin/bash -norc

  # Fetch SNP record 268 as JSON, convert it to XML, and emit one line per
  # allele placed on an "NC_" sequence whose inserted sequence differs from
  # the deleted sequence: rsID, accession, deleted, inserted, position.
  # NOTE(review): "-inc position" presumably turns a 0-based position into
  # a 1-based one - confirm against the xtract documentation.
  efetch -db snp -id 268 -format json |
  transmute -j2x -set - -rec RS |
  xtract -pattern RS -pfx "rs" -RSID RS/refsnp_id \
    -group placements_with_allele \
      -block allele -if seq_id -starts-with "NC_" \
        -and inserted_sequence -differs-from deleted_sequence \
        -element "&RSID" seq_id deleted_sequence \
          inserted_sequence -tab "\n" -inc position |
  # order by accession, then position, then inserted allele, then rs number,
  # and drop exact duplicate lines
  sort -t $'\t' -k 2,2 -k 5,5n -k 4,4f -k 1.3n | uniq |
  while read rsid accn del ins pos
  do
    # 50 bases ending immediately before the variant position; the
    # "< /dev/null" presumably keeps efetch from consuming the loop's input
    lft=$(efetch -db nuccore -format fasta -id "$accn" \
            -seq_start "$((pos-50))" -seq_stop "$((pos-1))" < /dev/null |
          grep -v '>' | tr -d '\n')

    # lengths of the inserted and deleted sequences, used to offset the
    # start of the downstream flank
    ad=${#ins}
    sb=${#del}
    # 50 bases starting at pos+ad-sb+1, with the FASTA header removed and
    # the sequence lines joined
    rgt=$(efetch -db nuccore -format fasta -id "$accn" \
            -seq_start "$((pos+ad-sb+1))" -seq_stop "$((pos+ad-sb+50))" < /dev/null |
          grep -v '>' | tr -d '\n')

    # report header, then upstream (5') and downstream (3') flanks, then a
    # blank separator line
    echo "$rsid $accn $pos $del->$ins"
    echo "5': $lft"
    echo "3': $rgt"
    echo ""
  done

  rs268 NC_000008.10 19813529 A->G
  5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
  3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC

  rs268 NC_000008.11 19956018 A->G
  5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
  3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC

EXTERNAL

JSON Nested Array Expansion

  for ns in flat recurse plural depth
  do
    echo "  $ns"
    echo
    nquire -get "http://mygene.info/v3" gene 2652 |
    transmute -j2x -set - -rec GeneRec -nest "$ns" |
    grep position | head -n 4
    echo
  done

  "position": [
    [
      154182595,
      154182789
    ],
    [
      154187769,
      154188066
    ],

  flat

    <position>154182595</position>
    <position>154182789</position>
    <position>154187769</position>
    <position>154188066</position>

  recurse

    <position>
      <position>154182595</position>
      <position>154182789</position>
    </position>

  plural

    <positions>
      <position>154182595</position>
      <position>154182789</position>
    </positions>

  depth

    <position>
      <position_1>154182595</position_1>
      <position_1>154182789</position_1>
    </position>

Exon Interval Sets

  nquire -get "http://mygene.info/v3/gene/2652" |
  transmute -j2x -set - -rec GeneRec -nest plural |
  xtract -pattern GeneRec -group exons -lbl "" -clr \
    -block positions -pfc "\n" -sep ".." -tab "\n" -element position

  154182595..154182789
  154187769..154188066
  154190053..154190222
  154191687..154191853
  154193407..154193647
  154195929..154196861

  154219733..154219927
  154224907..154225204
  ...

Heterogeneous Object Names

  nquire -get "http://mygene.info/v3/gene/2652" |
  transmute -j2x -set - -rec GeneRec |
  xtract -pattern GeneRec -group "pathway/*" -pfx "\n" -element "?,name,id"

  <pathway>
    <reactome>
      <id>R-HSA-162582</id>
      <name>Signal Transduction</name>
    </reactome>
    ...
    <wikipathways>
      <id>WP455</id>
      <name>GPCRs, Class A Rhodopsin-like</name>
    </wikipathways>
  </pathway>

  reactome        Signal Transduction                              R-HSA-162582
  reactome        Disease                                          R-HSA-1643685
  reactome        The retinoid cycle in cones (daylight vision)    R-HSA-2187335
  reactome        Visual phototransduction                         R-HSA-2187338
  reactome        Retinoid cycle disease events                    R-HSA-2453864
  reactome        Diseases associated with visual transduction     R-HSA-2474795
  reactome        Signaling by GPCR                                R-HSA-372790
  reactome        Class A/1 (Rhodopsin-like receptors)             R-HSA-373076
  reactome        GPCR downstream signalling                       R-HSA-388396
  reactome        G alpha (i) signalling events                    R-HSA-418594
  reactome        Opsins                                           R-HSA-419771
  reactome        GPCR ligand binding                              R-HSA-500792
  reactome        Diseases of signal transduction                  R-HSA-5663202
  wikipathways    GPCRs, Class A Rhodopsin-like                    WP455

XML Namespace Prefixes

  nquire -url "http://webservice.wikipathways.org" getPathway -pwId WP455 |
  xtract -pattern "ns1:getPathwayResponse" -decode ":gpml" |
  xtract -pattern Pathway -block Xref \
    -if @Database -equals "Entrez Gene" \
      -tab "\n" -element @ID |
  sort -n

  134
  135
  136
  140
  146
  ...

LOCAL ARCHIVE

Entrez Indexing

  efetch -db pubmed -id 12857958,2981625 -format xml |
  xtract -e2index |
  xtract -pattern IdxDocument -UID IdxUid \
    -block NORM -pfc "\n" -element "&UID",NORM,"@pos"

  12857958    allow       205
  12857958    assays      147
  12857958    binding     146
  12857958    braid       187,215
  12857958    braiding    153
  ...

Author Frequency

  esearch -db pubmed -query "rattlesnake phospholipase" |
  efetch -format uid | fetch-pubmed |
  xtract -pattern PubmedArticle -block Author \
    -sep " " -tab "\n" -element LastName,Initials |
  sort-uniq-count-rank

  40    Marangoni S
  33    Toyama MH
  28    Soares AM
  25    Bon C
  ...

Author Counts

  esearch -db pubmed -query "conotoxin" |
  efetch -format uid | fetch-pubmed |
  xtract -pattern PubmedArticle -num Author |
  sort-uniq-count -n |
  reorder-columns 2 1 |
  head -n 15 |
  tee /dev/tty |
  xy-plot auth.png

  0     11
  1     193
  2     854
  3     844
  4     699
  5     588
  6     439
  7     291
  8     187
  9     124
  10    122
  11    58
  12    33
  13    18

  900 +
      |           ********
  800 +           *       **
      |          *          *
  700 +          *          ***
      |          *             **
  600 +         *                *
      |         *                ***
  500 +         *                   **
      |        *                      ***
  400 +       *                          **
      |       *                            *
  300 +       *                            ***
      |      *                                *
  200 +      *                                 ******
      |     *                                        *********
  100 +   **                                                  *
      |  *                                                     **********
    0 + *                                                                ******
        +---------+---------+---------+---------+---------+---------+---------+
        0         2         4         6         8        10        12        14

Title and Abstract Word Counts

  esearch -db pubmed -query "conotoxin" -pub structured |
  efetch -format uid | fetch-pubmed |
  xtract -stops -wrp "Set,Rec" \
    -pattern PubmedArticle -wrp "PMID" -element MedlineCitation/PMID \
      -wrp "Titl" -words ArticleTitle \
      -block Abstract/AbstractText -wrp "Grp,Abst" -words AbstractText |
  xtract -pattern Rec -element PMID -num Titl -block Grp -tab ", " -num Abst

  29194563    21    63, 84, 89, 26
  28882644    23    87, 34, 115, 25
  28877214    10    12, 42, 315, 94
  28825343    15    169
  28482835    9     75, 123, 42, 37
  28479398    15    170, 130
  ...

Verbosity Per Year

  esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
  efetch -format uid | stream-pubmed | gunzip -c |
  xtract -stops -wrp Set,Rec -pattern PubmedArticle \
    -wrp "Year" -year "PubDate/*" \
    -wrp "Abst" -words Abstract/AbstractText |
  xtract -wrp Set,Pub -pattern Rec \
    -wrp "Year" -element Year \
    -wrp "Num" -num Abst > countsByYear.xml
  for yr in {1960..2020}
  do
    cat countsByYear.xml |
    xtract -wrp Raw -pattern Pub -select Year -eq "$yr" |
    xtract -pattern Raw -lbl "$yr" -avg Num
  done |
  tee /dev/tty |
  xy-plot verbosity.png
  rm countsByYear.xml

Appending Metadata

  esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
  efetch -format uid | fetch-pubmed > pnas.xml

  cat pnas.xml |
  xtract -stops -wrp Set,Rec -pattern PubmedArticle \
    -wrp ID -element MedlineCitation/PMID \
    -wrp Abst -words Abstract/AbstractText |

    <Set>
    <Rec><ID>31822623</ID>    <Abst>foxp3</Abst><Abst>cd4</Abst><Abst>regulatory...</Abst></Rec>
    ...

  xtract -pattern Rec -element ID -wrp Num -num Abst > counts.txt

    31822623    <Num>243</Num>
    31822622    <Num>132</Num>
    31822621    <Num>252</Num>
    31822620    <Num>238</Num>
    ...

  xtract -input pnas.xml -wrp PubmedArticleSet -pattern PubmedArticle \
    -select MedlineCitation/PMID -appending counts.txt > merged.xml

LOCAL INDEX

Histogram Shortcut

  cat $EDIRECT_PUBMED_MASTER/Current/*.xml |
  xtract -timer -pattern PubmedArticle -histogram PubDate/Month

  26         8
  37         9
  121475     01
  114579     02
  111137     03
  109794     04
  120169     05
  130062     06
  125107     07
  126246     08
  123191     09
  120957     10
  109657     11
  110854     12
  1958892    Apr
  1809730    Aug
  2086169    Dec
  1844717    Feb
  1851803    Jan
  1784258    Jul
  2015942    Jun
  1943325    Mar
  1815691    May
  1889194    Nov
  2035632    Oct
  1          October
  1956569    Sep

Month Format Per Year

  cat $EDIRECT_PUBMED_MASTER/Current/*.xml |
  xtract -wrp Set,Rec -pattern PubmedArticle \
    -if PubDate/Month -wrp YR -year "PubDate/*" -wrp MN -len PubDate/Month |
  xtract -wrp Set,Rec -pattern Rec \
    -pfx "<DT>" -sep "+-" -sfx "-</DT>" -element YR,MN |
  xtract -pattern Rec -histogram DT |
  reorder-columns 2 1 | tr '+' '\t' |
  sed -e 's/-3-/1/g' -e 's/-2-/2/g' -e 's/-1-/3/g' -e 's/-[0-9]-/4/g' |
  sort -k 1,1n -k 2,2n > rawMonthCounts.txt

  # Pivot rawMonthCounts.txt (tab-separated: year, month-format code 1-4,
  # count) into one row per year, joining on the count column for each
  # format code in turn.
  result=$( cat rawMonthCounts.txt | cut -f 1 | uniq )
  for i in {1..4}
  do
    # $'\t' expands to a real tab character; a plain "\t" inside double
    # quotes reaches grep as backslash-t, which GNU grep does not read as a tab
    current=$( cat rawMonthCounts.txt | grep $'\t'"$i"$'\t' | cut -f 1,3 )
    result=$(join -a 1 -t $'\t' <(echo "$result") <(echo "$current"))
  done
  echo "$result" > plotme.txt

  cat plotme.txt | xy-plot

Phrase Query Automation

  # ascend_mesh_tree CODE
  #
  # Starting from a dotted tree code (a trailing "*" is ignored), runs
  #   phrase-search -count "<code>* [TREE]"
  # for the code itself and then for each successively shorter prefix,
  # dropping one dot-delimited component per step, until no dot remains.
  ascend_mesh_tree() {
    # local, so a variable named "var" in the calling shell is not clobbered
    local var="${1%\*}"
    while :
    do
      phrase-search -count "$var* [TREE]"
      case "$var" in
        # strip the last ".component" regardless of its width; this always
        # removes at least the dot, so the loop is guaranteed to terminate
        *.* ) var="${var%.*}" ;;
        *   ) break           ;;
      esac
    done
  }

  ascend_mesh_tree "C01.925.782.417.415"

  5598       c01 925 782 417 415*
  28400      c01 925 782 417*
  658188     c01 925 782*
  928201     c01 925*
  2639368    c01*

Medical Subject Heading Code Viewers

  https://meshb.nlm.nih.gov/treeView
  https://meshb-prev.nlm.nih.gov/treeView

MISCELLANEOUS

Indexed Fields

  einfo -db pubmed |
  xtract -pattern Field \
    -if IsDate -equals Y -and IsHidden -equals N \
      -pfx "[" -sep "]\t" -element Name,FullName |
  sort -t $'\t' -k 2f

  [CDAT]    Date - Completion
  [CRDT]    Date - Create
  [EDAT]    Date - Entrez
  [MHDA]    Date - MeSH
  [MDAT]    Date - Modification
  [PDAT]    Date - Publication

Pseudocode Prototype

  for each PubmedArticle {
    for each Author {
      print Initials LastName
    }
    for each MeshHeading {
      print DescriptorName
      for each QualifierName {
        print QualifierName
      }
    }
  }

  xtract -pattern PubmedArticle \
    -block Author -element Initials LastName \
    -block MeshHeading -element DescriptorName \
      -subset QualifierName -element QualifierName

Processing in Groups

  ...
  efetch -format acc |
  join-into-groups-of 200 |
  xargs -n 1 sh -c 'epost -db nuccore -format acc -id "$0" |
  efetch -format gb'