summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README6
-rwxr-xr-xaccn-at-a-time4
-rwxr-xr-xalign-columns3
-rwxr-xr-xamino-acid-composition4
-rwxr-xr-xarchive-pubmed3
-rwxr-xr-xbetween-two-genes4
-rw-r--r--cmd/rchive.go441
-rw-r--r--cmd/transmute.go431
-rw-r--r--cmd/xtract.go414
-rwxr-xr-xdownload-ncbi-data3
-rwxr-xr-xdownload-pubmed3
-rwxr-xr-xdownload-sequence3
-rwxr-xr-xeblast159
-rwxr-xr-xefetch6
-rwxr-xr-xefilter6
-rwxr-xr-xeinfo16
-rwxr-xr-xelink6
-rwxr-xr-xepost6
-rwxr-xr-xesample3
-rwxr-xr-xesearch31
-rwxr-xr-xesummary6
-rw-r--r--eutils/align.go310
-rw-r--r--eutils/chan.go113
-rw-r--r--eutils/format.go4
-rw-r--r--eutils/misc.go203
-rw-r--r--eutils/normal.go2
-rw-r--r--eutils/parse.go184
-rw-r--r--eutils/spdi.go34
-rw-r--r--eutils/split.go30
-rw-r--r--eutils/utils.go131
-rw-r--r--eutils/valid.go2
-rw-r--r--eutils/xml.go131
-rwxr-xr-xexclude-uid-lists3
-rwxr-xr-xexpand-current3
-rwxr-xr-xfetch-pubmed3
-rwxr-xr-xfilter-stop-words3
-rwxr-xr-xfilter-table8
-rwxr-xr-xgbf2xml3
-rw-r--r--hlp-xtract.txt13
-rwxr-xr-xindex-extras3
-rwxr-xr-xindex-pubmed3
-rwxr-xr-xintersect-uid-lists3
-rwxr-xr-xjoin-into-groups-of4
-rwxr-xr-xpm-collect3
-rwxr-xr-xpm-index3
-rwxr-xr-xpm-invert3
-rwxr-xr-xpm-merge3
-rwxr-xr-xpm-prepare3
-rwxr-xr-xpm-promote3
-rwxr-xr-xpm-refresh3
-rwxr-xr-xpm-stash3
-rwxr-xr-xprint-columns8
-rwxr-xr-xreorder-columns5
-rwxr-xr-xrun-ncbi-converter4
-rwxr-xr-xsetup-deps.pl4
-rwxr-xr-xsetup.sh3
-rwxr-xr-xskip-if-file-exists4
-rwxr-xr-xsort-table6
-rwxr-xr-xsort-uniq-count7
-rwxr-xr-xsort-uniq-count-rank7
-rwxr-xr-xstream-pubmed3
-rwxr-xr-xtest-edirect22
-rwxr-xr-xtest-pubmed-index3
-rwxr-xr-xtheme-aliases3
-rw-r--r--tst-elink.txt1
-rwxr-xr-xword-at-a-time4
-rwxr-xr-xxml2tbl3
-rwxr-xr-xxy-plot4
68 files changed, 1555 insertions, 1311 deletions
diff --git a/README b/README
index 0192953..b1d51c7 100644
--- a/README
+++ b/README
@@ -64,7 +64,7 @@ Transmute converts a concatenated stream of JSON objects or other structured for
Xtract can use waypoints to navigate a complex XML hierarchy and obtain data values by field name:
- xtract -pattern entities -group P527 -block datavalue -element id |
+ xtract -pattern entities -group P527/mainsnak -block datavalue -element id |
The resulting output can be post-processed by Unix utilities or scripts:
@@ -924,6 +924,10 @@ Information on how to obtain an API Key is described in this NCBI blogpost:
https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities
+The Public Domain Notice for all NCBI EDirect scripts is located at:
+
+ https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
Questions or comments on EDirect may be sent to info@ncbi.nlm.nih.gov.
This research was supported by the Intramural Research Program of the National Library of Medicine at the NIH.
diff --git a/accn-at-a-time b/accn-at-a-time
index 000c68b..adc6994 100755
--- a/accn-at-a-time
+++ b/accn-at-a-time
@@ -1,4 +1,8 @@
#!/bin/bash -norc
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
sed 's/[^a-zA-Z0-9_.]/ /g; s/^ *//' |
tr 'A-Z' 'a-z' |
fmt -w 1
diff --git a/align-columns b/align-columns
index b149804..87b31fa 100755
--- a/align-columns
+++ b/align-columns
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
# inspired by Steve Kinzler's align script - see http://kinzler.com/me/align/
# requires tab-delimited input, output aligned by padding with spaces
diff --git a/amino-acid-composition b/amino-acid-composition
index 177276a..cc84e2c 100755
--- a/amino-acid-composition
+++ b/amino-acid-composition
@@ -1,4 +1,8 @@
#!/bin/bash -norc
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \
Xle Lys Leu Met Asn Pyl Pro Gln Arg \
Ser Thr Sec Val Trp Xxx Tyr Glx )
diff --git a/archive-pubmed b/archive-pubmed
index b7e2141..706f72a 100755
--- a/archive-pubmed
+++ b/archive-pubmed
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
while [ $# -gt 0 ]
do
case "$1" in
diff --git a/between-two-genes b/between-two-genes
index b2c78f2..673f159 100755
--- a/between-two-genes
+++ b/between-two-genes
@@ -1,2 +1,6 @@
#!/bin/bash -norc
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
awk -F '\t' -v 'OFS=\t' "/^$1\t/{a++}/^$2\t/{a++}a>0{print}a>1{exit}"
diff --git a/cmd/rchive.go b/cmd/rchive.go
index e443822..af3db22 100644
--- a/cmd/rchive.go
+++ b/cmd/rchive.go
@@ -387,6 +387,11 @@ Execution Profiling
go tool pprof --pdf ./cpu.pprof > ./callgraph.pdf
`
+var (
+ doStem bool
+ deStop bool
+)
+
var idxFields = [12]string{
"CHEM",
"CODE",
@@ -402,18 +407,20 @@ var idxFields = [12]string{
"YEAR",
}
+// Master points to a term and to its postings data
type Master struct {
TermOffset int32
PostOffset int32
}
+// Arrays contains postings lists and word offsets
type Arrays struct {
Data []int32
Ofst [][]int16
Dist int
}
-func ReportEncodedMarkup(typ, id, str string) {
+func reportEncodedMarkup(typ, id, str string) {
var buffer strings.Builder
@@ -599,8 +606,8 @@ func ReportEncodedMarkup(typ, id, str string) {
// DIRECTORY PATH UTILITIES
-// MakeArchiveTrie allows a short prefix of letters with an optional underscore, and splits the remainder into character pairs
-func MakeArchiveTrie(str string, arry [132]rune) string {
+// makeArchiveTrie allows a short prefix of letters with an optional underscore, and splits the remainder into character pairs
+func makeArchiveTrie(str string, arry [132]rune) string {
if len(str) > 64 {
return ""
@@ -697,8 +704,8 @@ func MakeArchiveTrie(str string, arry [132]rune) string {
return strings.ToUpper(res)
}
-// MakePostingsTrie splits a string into characters, separated by path delimiting slashes
-func MakePostingsTrie(str string, arry [516]rune) string {
+// makePostingsTrie splits a string into characters, separated by path delimiting slashes
+func makePostingsTrie(str string, arry [516]rune) string {
if len(str) > 256 {
return ""
@@ -889,7 +896,7 @@ var mergLen = map[string]int{
"tre": 4,
}
-func PostingDir(term string) string {
+func postingDir(term string) string {
if len(term) < 3 {
return term
@@ -910,7 +917,7 @@ func PostingDir(term string) string {
return term[:3]
}
-func IdentifierKey(term string) string {
+func identifierKey(term string) string {
// remove punctuation from term
key := strings.Map(func(c rune) rune {
@@ -924,17 +931,17 @@ func IdentifierKey(term string) string {
key = strings.Replace(key, "-", "_", -1)
// use first 2, 3, or 4 characters of identifier for directory
- key = PostingDir(key)
+ key = postingDir(key)
return key
}
-func PostingPath(prom, field, term string, arry [516]rune) (string, string) {
+func postingPath(prom, field, term string, arry [516]rune) (string, string) {
// use first few characters of identifier for directory
- dir := IdentifierKey(term)
+ dir := identifierKey(term)
- trie := MakePostingsTrie(dir, arry)
+ trie := makePostingsTrie(dir, arry)
if trie == "" {
return "", ""
}
@@ -944,7 +951,7 @@ func PostingPath(prom, field, term string, arry [516]rune) (string, string) {
return dpath, dir
}
-func CommonOpenFile(dpath, fname string) (*os.File, int64) {
+func commonOpenFile(dpath, fname string) (*os.File, int64) {
fpath := path.Join(dpath, fname)
if fpath == "" {
@@ -971,9 +978,9 @@ func CommonOpenFile(dpath, fname string) (*os.File, int64) {
return inFile, size
}
-func ReadMasterIndex(dpath, key, field string) []Master {
+func readMasterIndex(dpath, key, field string) []Master {
- inFile, size := CommonOpenFile(dpath, key+"."+field+".mst")
+ inFile, size := commonOpenFile(dpath, key+"."+field+".mst")
if inFile == nil {
return nil
}
@@ -994,9 +1001,9 @@ func ReadMasterIndex(dpath, key, field string) []Master {
return data
}
-func ReadTermList(dpath, key, field string) []byte {
+func readTermList(dpath, key, field string) []byte {
- inFile, size := CommonOpenFile(dpath, key+"."+field+".trm")
+ inFile, size := commonOpenFile(dpath, key+"."+field+".trm")
if inFile == nil {
return nil
}
@@ -1017,9 +1024,9 @@ func ReadTermList(dpath, key, field string) []byte {
return data
}
-func ReadPostingData(dpath, key, field string, offset int32, size int32) []int32 {
+func readPostingData(dpath, key, field string, offset int32, size int32) []int32 {
- inFile, _ := CommonOpenFile(dpath, key+"."+field+".pst")
+ inFile, _ := commonOpenFile(dpath, key+"."+field+".pst")
if inFile == nil {
return nil
}
@@ -1046,9 +1053,9 @@ func ReadPostingData(dpath, key, field string, offset int32, size int32) []int32
return data
}
-func ReadPositionIndex(dpath, key, field string, offset int32, size int32) []int32 {
+func readPositionIndex(dpath, key, field string, offset int32, size int32) []int32 {
- inFile, _ := CommonOpenFile(dpath, key+"."+field+".uqi")
+ inFile, _ := commonOpenFile(dpath, key+"."+field+".uqi")
if inFile == nil {
return nil
}
@@ -1075,9 +1082,9 @@ func ReadPositionIndex(dpath, key, field string, offset int32, size int32) []int
return data
}
-func ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16 {
+func readOffsetData(dpath, key, field string, offset int32, size int32) []int16 {
- inFile, _ := CommonOpenFile(dpath, key+"."+field+".ofs")
+ inFile, _ := commonOpenFile(dpath, key+"."+field+".ofs")
if inFile == nil {
return nil
}
@@ -1104,7 +1111,7 @@ func ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16
return data
}
-func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master {
+func readMasterIndexFuture(dpath, key, field string) <-chan []Master {
out := make(chan []Master, eutils.ChanDepth())
if out == nil {
@@ -1115,7 +1122,7 @@ func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master {
// masterIndexFuture asynchronously gets the master file and sends results through channel
masterIndexFuture := func(dpath, key, field string, out chan<- []Master) {
- data := ReadMasterIndex(dpath, key, field)
+ data := readMasterIndex(dpath, key, field)
out <- data
@@ -1128,7 +1135,7 @@ func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master {
return out
}
-func ReadTermListFuture(dpath, key, field string) <-chan []byte {
+func readTermListFuture(dpath, key, field string) <-chan []byte {
out := make(chan []byte, eutils.ChanDepth())
if out == nil {
@@ -1139,7 +1146,7 @@ func ReadTermListFuture(dpath, key, field string) <-chan []byte {
// termListFuture asynchronously gets posting IDs and sends results through channel
termListFuture := func(dpath, key, field string, out chan<- []byte) {
- data := ReadTermList(dpath, key, field)
+ data := readTermList(dpath, key, field)
out <- data
@@ -1152,21 +1159,21 @@ func ReadTermListFuture(dpath, key, field string) <-chan []byte {
return out
}
-func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
+func getPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
var (
arry [516]rune
)
- dpath, key := PostingPath(prom, field, term, arry)
+ dpath, key := postingPath(prom, field, term, arry)
if dpath == "" {
return nil, nil
}
// schedule asynchronous fetching
- mi := ReadMasterIndexFuture(dpath, key, field)
+ mi := readMasterIndexFuture(dpath, key, field)
- tl := ReadTermListFuture(dpath, key, field)
+ tl := readTermListFuture(dpath, key, field)
// fetch master index and term list
indx := <-mi
@@ -1215,7 +1222,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
tlen := len(term)
isWildCard = true
term = strings.TrimSuffix(term, "*")
- pdlen := len(PostingDir(term))
+ pdlen := len(postingDir(term))
if tlen < pdlen {
fmt.Fprintf(os.Stderr, "Wildcard term '%s' must be at least %d characters long - ignoring this word\n", term, pdlen)
return nil, nil
@@ -1243,7 +1250,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
size := indx[R].PostOffset - offset
// read relevant postings list section
- data := ReadPostingData(dpath, key, field, offset, size)
+ data := readPostingData(dpath, key, field, offset, size)
if data == nil || len(data) < 1 {
return nil, nil
}
@@ -1272,7 +1279,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
}
// read relevant word position section, includes phantom offset at end
- uqis := ReadPositionIndex(dpath, key, field, offset, size+4)
+ uqis := readPositionIndex(dpath, key, field, offset, size+4)
if uqis == nil {
return nil, nil
}
@@ -1285,7 +1292,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
to := uqis[ulen-1]
// read offset section
- ofst := ReadOffsetData(dpath, key, field, from, to-from)
+ ofst := readOffsetData(dpath, key, field, from, to-from)
if ofst == nil {
return nil, nil
}
@@ -1353,7 +1360,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
size := indx[R+1].PostOffset - offset
// read relevant postings list section
- data := ReadPostingData(dpath, key, field, offset, size)
+ data := readPostingData(dpath, key, field, offset, size)
if data == nil || len(data) < 1 {
return nil, nil
}
@@ -1363,7 +1370,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
}
// read relevant word position section, includes phantom offset at end
- uqis := ReadPositionIndex(dpath, key, field, offset, size+4)
+ uqis := readPositionIndex(dpath, key, field, offset, size+4)
if uqis == nil {
return nil, nil
}
@@ -1376,7 +1383,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
to := uqis[ulen-1]
// read offset section
- ofst := ReadOffsetData(dpath, key, field, from, to-from)
+ ofst := readOffsetData(dpath, key, field, from, to-from)
if ofst == nil {
return nil, nil
}
@@ -1401,18 +1408,18 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) {
return nil, nil
}
-func PrintTermCount(base, term, field string) int {
+func printTermCount(base, term, field string) int {
- data, _ := GetPostingIDs(base, term, field, true)
+ data, _ := getPostingIDs(base, term, field, true)
size := len(data)
fmt.Fprintf(os.Stdout, "%d\t%s\n", size, term)
return size
}
-func PrintTermCounts(base, term, field string) int {
+func printTermCounts(base, term, field string) int {
- pdlen := len(PostingDir(term))
+ pdlen := len(postingDir(term))
if len(term) < pdlen {
fmt.Fprintf(os.Stderr, "\nERROR: Term count argument must be at least %d characters\n", pdlen)
@@ -1425,15 +1432,15 @@ func PrintTermCounts(base, term, field string) int {
}
var arry [516]rune
- dpath, key := PostingPath(base, field, term, arry)
+ dpath, key := postingPath(base, field, term, arry)
if dpath == "" {
return 0
}
// schedule asynchronous fetching
- mi := ReadMasterIndexFuture(dpath, key, field)
+ mi := readMasterIndexFuture(dpath, key, field)
- tl := ReadTermListFuture(dpath, key, field)
+ tl := readTermListFuture(dpath, key, field)
// fetch master index and term list
indx := <-mi
@@ -1497,9 +1504,9 @@ func PrintTermCounts(base, term, field string) int {
return count
}
-func PrintTermPositions(base, term, field string) int {
+func printTermPositions(base, term, field string) int {
- data, ofst := GetPostingIDs(base, term, field, false)
+ data, ofst := getPostingIDs(base, term, field, false)
size := len(data)
fmt.Fprintf(os.Stdout, "\n%d\t%s\n\n", size, term)
@@ -1519,7 +1526,7 @@ func PrintTermPositions(base, term, field string) int {
// BOOLEAN OPERATIONS FOR POSTINGS LISTS
-func ExtendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta int, proc func(pn, pm []int16, dlt int16) []int16) ([]int32, [][]int16) {
+func extendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta int, proc func(pn, pm []int16, dlt int16) []int16) ([]int32, [][]int16) {
if proc == nil {
return nil, nil
@@ -1596,7 +1603,7 @@ func ExtendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta
return res, ofs
}
-func IntersectIDs(N, M []int32) []int32 {
+func intersectIDs(N, M []int32) []int32 {
if N == nil {
return M
@@ -1661,7 +1668,7 @@ func IntersectIDs(N, M []int32) []int32 {
// if m * log(n) < m + n, binary search has fewer comparisons, but processor memory caches make linear algorithm faster
/*
-func IntersectBinary(N, M []int32) []int32 {
+func intersectBinary(N, M []int32) []int32 {
if N == nil {
return M
@@ -1713,7 +1720,7 @@ func IntersectBinary(N, M []int32) []int32 {
}
*/
-func CombineIDs(N, M []int32) []int32 {
+func combineIDs(N, M []int32) []int32 {
if N == nil {
return M
@@ -1770,7 +1777,7 @@ func CombineIDs(N, M []int32) []int32 {
return res
}
-func ExcludeIDs(N, M []int32) []int32 {
+func excludeIDs(N, M []int32) []int32 {
if N == nil {
return nil
@@ -1877,7 +1884,7 @@ func decodeFields(str string) string {
return str
}
-func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays {
+func postingIDsFuture(base, term, field string, dist int) <-chan Arrays {
out := make(chan Arrays, eutils.ChanDepth())
if out == nil {
@@ -1888,7 +1895,7 @@ func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays {
// postingFuture asynchronously gets posting IDs and sends results through channel
postingFuture := func(base, term, field string, dist int, out chan<- Arrays) {
- data, ofst := GetPostingIDs(base, term, field, false)
+ data, ofst := getPostingIDs(base, term, field, false)
out <- Arrays{Data: data, Ofst: ofst, Dist: dist}
@@ -1901,7 +1908,7 @@ func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays {
return out
}
-func EvaluateQuery(base string, clauses []string) int {
+func evaluateQuery(base string, clauses []string) int {
if clauses == nil || clauses[0] == "" {
return 0
@@ -2021,7 +2028,7 @@ func EvaluateQuery(base string, clauses []string) int {
// efetch -format uid | phrase-search -query "[PIPE] AND L [THME]"
var data []int32
// read UIDs from stdin
- uidq := CreateUIDReader(os.Stdin)
+ uidq := createUIDReader(os.Stdin)
for ext := range uidq {
val, err := strconv.Atoi(ext.Text)
@@ -2055,7 +2062,7 @@ func EvaluateQuery(base string, clauses []string) int {
return nil, nil, 0
}
term = strings.Replace(term, "_", " ", -1)
- data, _ := GetPostingIDs(base, term, field, true)
+ data, _ := getPostingIDs(base, term, field, true)
count++
return data, nil, 1
}
@@ -2077,7 +2084,7 @@ func EvaluateQuery(base string, clauses []string) int {
continue
}
- fetch := PostingIDsFuture(base, term, field, dist)
+ fetch := postingIDsFuture(base, term, field, dist)
futures = append(futures, fetch)
@@ -2116,7 +2123,7 @@ func EvaluateQuery(base string, clauses []string) int {
for i := 1; i < len(intersect); i++ {
// add subsequent words, keep starting positions of phrases that contain all words in proper position
- data, ofst = ExtendPositionalIDs(data, ofst, intersect[i].Data, intersect[i].Ofst, intersect[i].Dist, phrasePositions)
+ data, ofst = extendPositionalIDs(data, ofst, intersect[i].Data, intersect[i].Ofst, intersect[i].Dist, phrasePositions)
if len(data) < 1 {
// bail if phrase not present
return nil, nil, 0
@@ -2222,7 +2229,7 @@ func EvaluateQuery(base string, clauses []string) int {
return nil, ""
}
// next phrase must be within specified distance after the previous phrase
- data, ofst = ExtendPositionalIDs(data, ofst, next, noff, delta+dist, proximityPositions)
+ data, ofst = extendPositionalIDs(data, ofst, next, noff, delta+dist, proximityPositions)
if len(data) < 1 {
return nil, ""
}
@@ -2239,7 +2246,7 @@ func EvaluateQuery(base string, clauses []string) int {
data, tkn := prox()
for tkn == "!" {
next, tkn = prox()
- data = ExcludeIDs(data, next)
+ data = excludeIDs(data, next)
}
return data, tkn
@@ -2252,7 +2259,7 @@ func EvaluateQuery(base string, clauses []string) int {
data, tkn := excl()
for tkn == "&" {
next, tkn = excl()
- data = IntersectIDs(data, next)
+ data = intersectIDs(data, next)
}
return data, tkn
@@ -2265,7 +2272,7 @@ func EvaluateQuery(base string, clauses []string) int {
data, tkn := term()
for tkn == "|" {
next, tkn = term()
- data = CombineIDs(data, next)
+ data = combineIDs(data, next)
}
return data, tkn
@@ -2308,7 +2315,7 @@ func EvaluateQuery(base string, clauses []string) int {
// QUERY PARSING FUNCTIONS
-func PrepareQuery(str string) string {
+func prepareQuery(str string) string {
if str == "" {
return ""
@@ -2377,8 +2384,59 @@ func PrepareQuery(str string) string {
str = strings.Replace(str, "_", " ", -1)
- if eutils.HasPlusOrMinus(str) {
- str = eutils.FixThemeCases(str)
+ hasPlusOrMinus := func(str string) bool {
+
+ for _, ch := range str {
+ if ch == '-' || ch == '+' {
+ return true
+ }
+ }
+
+ return false
+ }
+
+ fixThemeCases := func(str string) string {
+
+ if !strings.Contains(str, "[thme]") && !strings.Contains(str, "[conv]") {
+ return str
+ }
+
+ var arry []string
+
+ terms := strings.Fields(str)
+
+ for _, item := range terms {
+
+ switch item {
+ case "a+":
+ arry = append(arry, "ap")
+ case "e+":
+ arry = append(arry, "ep")
+ case "ec+":
+ arry = append(arry, "ecp")
+ case "eg+":
+ arry = append(arry, "egp")
+ case "v+":
+ arry = append(arry, "vp")
+ case "a-":
+ arry = append(arry, "am")
+ case "e-":
+ arry = append(arry, "em")
+ case "ec-":
+ arry = append(arry, "ecm")
+ default:
+ arry = append(arry, item)
+ }
+ }
+
+ // reconstruct string from transformed words
+ str = strings.Join(arry, " ")
+
+ return str
+ }
+
+ if hasPlusOrMinus(str) {
+ str = fixThemeCases(str)
}
if eutils.HasHyphenOrApostrophe(str) {
@@ -2407,7 +2465,7 @@ func PrepareQuery(str string) string {
return tmp
}
-func PrepareExact(str string) string {
+func prepareExact(str string) string {
if str == "" {
return ""
@@ -2514,7 +2572,7 @@ func PrepareExact(str string) string {
}
// optional stop word removal
- if eutils.DeStop() && eutils.IsStopWord(item) {
+ if deStop && eutils.IsStopWord(item) {
chain = append(chain, "+")
continue
}
@@ -2535,7 +2593,7 @@ func PrepareExact(str string) string {
return tmp
}
-func ProcessStopWords(str string) string {
+func processStopWords(str string) string {
if str == "" {
return ""
@@ -2591,7 +2649,7 @@ func ProcessStopWords(str string) string {
}
// skip if stop word, breaking phrase chain
- if eutils.DeStop() && eutils.IsStopWord(item) {
+ if deStop && eutils.IsStopWord(item) {
chain = append(chain, "+")
continue
}
@@ -2622,7 +2680,7 @@ func ProcessStopWords(str string) string {
return tmp
}
-func PartitionQuery(str string) []string {
+func partitionQuery(str string) []string {
if str == "" {
return nil
@@ -2674,7 +2732,7 @@ func PartitionQuery(str string) []string {
return tmp
}
-func SetFieldQualifiers(clauses []string, rlxd bool) []string {
+func setFieldQualifiers(clauses []string, rlxd bool) []string {
var res []string
@@ -2708,7 +2766,7 @@ func SetFieldQualifiers(clauses []string, rlxd bool) []string {
}
// skip if stop word, breaking phrase chain
- if eutils.DeStop() && eutils.IsStopWord(item) {
+ if deStop && eutils.IsStopWord(item) {
chain = append(chain, "+")
continue
}
@@ -2921,60 +2979,60 @@ func SetFieldQualifiers(clauses []string, rlxd bool) []string {
// SEARCH TERM LISTS FOR PHRASES OR NORMALIZED TERMS, OR MATCH BY PATTERN
-func ProcessSearch(base, phrase string, xact, rlxd bool) int {
+func processSearch(base, phrase string, xact, rlxd bool) int {
if phrase == "" {
return 0
}
if xact {
- phrase = PrepareExact(phrase)
+ phrase = prepareExact(phrase)
} else {
- phrase = PrepareQuery(phrase)
+ phrase = prepareQuery(phrase)
}
- phrase = ProcessStopWords(phrase)
+ phrase = processStopWords(phrase)
- clauses := PartitionQuery(phrase)
+ clauses := partitionQuery(phrase)
- clauses = SetFieldQualifiers(clauses, rlxd)
+ clauses = setFieldQualifiers(clauses, rlxd)
- return EvaluateQuery(base, clauses)
+ return evaluateQuery(base, clauses)
}
-func ProcessMock(base, phrase string, xact, rlxd bool) int {
+func processMock(base, phrase string, xact, rlxd bool) int {
if phrase == "" {
return 0
}
- fmt.Fprintf(os.Stdout, "ProcessSearch:\n\n%s\n\n", phrase)
+ fmt.Fprintf(os.Stdout, "processSearch:\n\n%s\n\n", phrase)
if xact {
- phrase = PrepareExact(phrase)
+ phrase = prepareExact(phrase)
- fmt.Fprintf(os.Stdout, "PrepareExact:\n\n%s\n\n", phrase)
+ fmt.Fprintf(os.Stdout, "prepareExact:\n\n%s\n\n", phrase)
} else {
- phrase = PrepareQuery(phrase)
+ phrase = prepareQuery(phrase)
- fmt.Fprintf(os.Stdout, "PrepareQuery:\n\n%s\n\n", phrase)
+ fmt.Fprintf(os.Stdout, "prepareQuery:\n\n%s\n\n", phrase)
}
- phrase = ProcessStopWords(phrase)
+ phrase = processStopWords(phrase)
- fmt.Fprintf(os.Stdout, "ProcessStopWords:\n\n%s\n\n", phrase)
+ fmt.Fprintf(os.Stdout, "processStopWords:\n\n%s\n\n", phrase)
- clauses := PartitionQuery(phrase)
+ clauses := partitionQuery(phrase)
- fmt.Fprintf(os.Stdout, "PartitionQuery:\n\n")
+ fmt.Fprintf(os.Stdout, "partitionQuery:\n\n")
for _, tkn := range clauses {
fmt.Fprintf(os.Stdout, "%s\n", tkn)
}
fmt.Fprintf(os.Stdout, "\n")
- clauses = SetFieldQualifiers(clauses, rlxd)
+ clauses = setFieldQualifiers(clauses, rlxd)
- fmt.Fprintf(os.Stdout, "SetFieldQualifiers:\n\n")
+ fmt.Fprintf(os.Stdout, "setFieldQualifiers:\n\n")
for _, tkn := range clauses {
fmt.Fprintf(os.Stdout, "%s\n", tkn)
}
@@ -2983,19 +3041,19 @@ func ProcessMock(base, phrase string, xact, rlxd bool) int {
return 0
}
-func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int {
+func processCount(base, phrase string, plrl, psns, rlxd bool) int {
if phrase == "" {
return 0
}
- phrase = PrepareQuery(phrase)
+ phrase = prepareQuery(phrase)
- phrase = ProcessStopWords(phrase)
+ phrase = processStopWords(phrase)
- clauses := PartitionQuery(phrase)
+ clauses := partitionQuery(phrase)
- clauses = SetFieldQualifiers(clauses, rlxd)
+ clauses = setFieldQualifiers(clauses, rlxd)
if clauses == nil {
return 0
@@ -3075,11 +3133,11 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int {
term = strings.Replace(term, "_", " ", -1)
if psns {
- count += PrintTermPositions(base, term, field)
+ count += printTermPositions(base, term, field)
} else if plrl {
- count += PrintTermCounts(base, term, field)
+ count += printTermCounts(base, term, field)
} else {
- count += PrintTermCount(base, term, field)
+ count += printTermCount(base, term, field)
}
}
}
@@ -3106,7 +3164,7 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int {
// processes with single goroutine call defer close(out) so consumer(s) can range over channel
// processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out)
-func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord {
+func createUIDReader(in io.Reader) <-chan eutils.XMLRecord {
if in == nil {
return nil
@@ -3139,7 +3197,7 @@ func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord {
file = file[:pos]
}
- out <- eutils.XMLRecord{idx, "", file, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: file}
}
}
@@ -3149,7 +3207,7 @@ func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord {
return out
}
-func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int, inp <-chan eutils.XMLRecord) <-chan string {
+func createStashers(stash, parent, indx, sfx string, hash, zipp bool, report int, inp <-chan eutils.XMLRecord) <-chan string {
if inp == nil {
return nil
@@ -3246,7 +3304,7 @@ func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int
}
var arry [132]rune
- trie := MakeArchiveTrie(id, arry)
+ trie := makeArchiveTrie(id, arry)
if trie == "" {
return ""
}
@@ -3406,7 +3464,7 @@ func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int
return out
}
-func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+func createFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -3425,7 +3483,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <
fetchRecord := func(file string, buf bytes.Buffer) string {
var arry [132]rune
- trie := MakeArchiveTrie(file, arry)
+ trie := makeArchiveTrie(file, arry)
if file == "" || trie == "" {
return ""
@@ -3499,7 +3557,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <
runtime.Gosched()
- out <- eutils.XMLRecord{ext.Index, "", str, nil}
+ out <- eutils.XMLRecord{Index: ext.Index, Text: str}
}
}
@@ -3520,7 +3578,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <
return out
}
-func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+func createStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -3537,7 +3595,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM
getRecord := func(file string, buf bytes.Buffer) []byte {
var arry [132]rune
- trie := MakeArchiveTrie(file, arry)
+ trie := makeArchiveTrie(file, arry)
if file == "" || trie == "" {
return nil
@@ -3586,7 +3644,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM
runtime.Gosched()
- out <- eutils.XMLRecord{ext.Index, "", "", data}
+ out <- eutils.XMLRecord{Index: ext.Index, Data: data}
}
}
@@ -3607,7 +3665,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM
return out
}
-func CreateDispensers(inp <-chan eutils.XMLRecord) <-chan []string {
+func createDispensers(inp <-chan eutils.XMLRecord) <-chan []string {
if inp == nil {
return nil
@@ -3726,7 +3784,7 @@ func CreateDispensers(inp <-chan eutils.XMLRecord) <-chan []string {
return out
}
-func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord {
+func createInverters(inp <-chan []string) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -3845,7 +3903,7 @@ func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord {
str := printPosting(key, data)
- out <- eutils.XMLRecord{0, key, str, nil}
+ out <- eutils.XMLRecord{Ident: key, Text: str}
runtime.Gosched()
}
@@ -3868,7 +3926,7 @@ func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord {
return out
}
-func CreateResolver(inp <-chan eutils.XMLRecord) <-chan string {
+func createResolver(inp <-chan eutils.XMLRecord) <-chan string {
if inp == nil {
return nil
@@ -3923,6 +3981,7 @@ func CreateResolver(inp <-chan eutils.XMLRecord) <-chan string {
return out
}
+// Plex allows distribution of indexing
type Plex struct {
Which int
Ident string
@@ -3931,22 +3990,22 @@ type Plex struct {
Sibs []string
}
-type PlexHeap []Plex
+type plexHeap []Plex
// methods that satisfy heap.Interface
-func (h PlexHeap) Len() int {
+func (h plexHeap) Len() int {
return len(h)
}
-func (h PlexHeap) Less(i, j int) bool {
+func (h plexHeap) Less(i, j int) bool {
return h[i].Ident < h[j].Ident
}
-func (h PlexHeap) Swap(i, j int) {
+func (h plexHeap) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
}
-func (h *PlexHeap) Push(x interface{}) {
+func (h *plexHeap) Push(x interface{}) {
*h = append(*h, x.(Plex))
}
-func (h *PlexHeap) Pop() interface{} {
+func (h *plexHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
@@ -3954,7 +4013,7 @@ func (h *PlexHeap) Pop() interface{} {
return x
}
-func CreatePresenters(args []string) []<-chan Plex {
+func createPresenters(args []string) []<-chan Plex {
if args == nil {
return nil
@@ -4053,7 +4112,7 @@ func CreatePresenters(args []string) []<-chan Plex {
return chns
}
-func CreateManifold(inp []<-chan Plex) <-chan Plex {
+func createManifold(inp []<-chan Plex) <-chan Plex {
if inp == nil {
return nil
@@ -4072,7 +4131,7 @@ func CreateManifold(inp []<-chan Plex) <-chan Plex {
defer close(out)
// initialize empty heap
- hp := &PlexHeap{}
+ hp := &plexHeap{}
heap.Init(hp)
// read first object from all input channels in turn
@@ -4146,7 +4205,7 @@ func CreateManifold(inp []<-chan Plex) <-chan Plex {
return out
}
-func CreateFusers(inp <-chan eutils.XMLRecord) <-chan Plex {
+func createFusers(inp <-chan eutils.XMLRecord) <-chan Plex {
if inp == nil {
return nil
@@ -4235,7 +4294,7 @@ func CreateFusers(inp <-chan eutils.XMLRecord) <-chan Plex {
return out
}
-func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord {
+func createMergers(inp <-chan Plex) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -4368,7 +4427,7 @@ func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord {
str := fusePostings(key, data)
- out <- eutils.XMLRecord{rec, key, str, nil}
+ out <- eutils.XMLRecord{Index: rec, Ident: key, Text: str}
runtime.Gosched()
}
@@ -4391,7 +4450,7 @@ func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord {
return out
}
-func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan string {
+func createSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan string {
if inp == nil {
return nil
@@ -4489,7 +4548,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan
for curr := range inp {
// use first few characters of identifier
- currTag = IdentifierKey(curr.Ident)
+ currTag = identifierKey(curr.Ident)
if currTag == "" {
continue
}
@@ -4531,7 +4590,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan
// compare keys from adjacent term lists
if prev.Text != "" && prevTag != currTag {
- // after IdentifierKey converts space to underscore,
+ // after identifierKey converts space to underscore,
// okay that x_ and x0 will be out of alphabetical order
// send closing tag
@@ -4594,7 +4653,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan
return out
}
-func CreatePromoters(args []string, prom, field string) <-chan string {
+func createPromoters(args []string, prom, field string) <-chan string {
if args == nil {
return nil
@@ -4815,7 +4874,7 @@ func CreatePromoters(args []string, prom, field string) <-chan string {
writeFiveFiles := func(key string) {
var arry [516]rune
- dpath, key := PostingPath(prom, field, key, arry)
+ dpath, key := postingPath(prom, field, key, arry)
if dpath == "" {
return
}
@@ -4859,11 +4918,11 @@ func CreatePromoters(args []string, prom, field string) <-chan string {
ok = true
// use first few characters of identifier
- currTag = IdentifierKey(term)
+ currTag = identifierKey(term)
if prevTag != currTag {
- // after IdentifierKey converts space to underscore,
+ // after identifierKey converts space to underscore,
// okay that xxx_ and xxx0 will be out of alphabetical order
// directory prefix changed from last posting
@@ -4917,7 +4976,7 @@ func CreatePromoters(args []string, prom, field string) <-chan string {
return out
}
-func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+func createMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -4928,13 +4987,13 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch
return inp
}
- phrs = PrepareQuery(phrs)
+ phrs = prepareQuery(phrs)
- phrs = ProcessStopWords(phrs)
+ phrs = processStopWords(phrs)
- clauses := PartitionQuery(phrs)
+ clauses := partitionQuery(phrs)
- clauses = SetFieldQualifiers(clauses, false)
+ clauses = setFieldQualifiers(clauses, false)
if clauses == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse phrase\n")
@@ -5011,13 +5070,13 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch
}
// skip if stop word, breaking word pair chain
- if eutils.DeStop() && eutils.IsStopWord(item) {
+ if deStop && eutils.IsStopWord(item) {
chain = append(chain, "+")
continue
}
// apply stemming algorithm
- if eutils.DoStem() {
+ if doStem {
isWildCard := strings.HasSuffix(item, "*")
if isWildCard {
// temporarily remove trailing asterisk
@@ -5202,7 +5261,7 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch
if text == "" {
// should never see empty input data
- out <- eutils.XMLRecord{idx, "", text, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: text}
continue
}
@@ -5212,12 +5271,12 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch
if exclude != ok {
// send text of record if phrase match succeeded with -require, or failed with -exclude
- out <- eutils.XMLRecord{idx, "", text, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: text}
continue
}
// otherwise send empty text so unshuffler does not have to deal with record index gaps
- out <- eutils.XMLRecord{idx, "", "", nil}
+ out <- eutils.XMLRecord{Index: idx}
}
}
@@ -5238,7 +5297,7 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch
return out
}
-func CreateExternalIndexer(args []string, zipp bool, in io.Reader) int {
+func createExternalIndexer(args []string, zipp bool, in io.Reader) int {
recordCount := 0
@@ -6193,9 +6252,9 @@ func CreateExternalIndexer(args []string, zipp bool, in io.Reader) int {
return 0
}
-func CreateExternalArchive(stash string, args []string) <-chan string {
+func createExternalArchive(stash string, args []string) <-chan string {
- createPresenters := func(args []string) []<-chan Plex {
+ makePresenters := func(args []string) []<-chan Plex {
if args == nil {
return nil
@@ -6294,7 +6353,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string {
return chns
}
- createManifold := func(inp []<-chan Plex) <-chan Plex {
+ makeManifold := func(inp []<-chan Plex) <-chan Plex {
if inp == nil {
return nil
@@ -6313,7 +6372,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string {
defer close(out)
// initialize empty heap
- hp := &PlexHeap{}
+ hp := &plexHeap{}
heap.Init(hp)
// read first object from all input channels in turn
@@ -6385,7 +6444,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string {
return out
}
- createMergers := func(inp <-chan Plex) <-chan eutils.XMLRecord {
+ makeMergers := func(inp <-chan Plex) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -6518,7 +6577,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string {
str := fusePostings(key, data)
- out <- eutils.XMLRecord{rec, key, str, nil}
+ out <- eutils.XMLRecord{Index: rec, Ident: key, Text: str}
runtime.Gosched()
}
@@ -6541,10 +6600,10 @@ func CreateExternalArchive(stash string, args []string) <-chan string {
return out
}
- chns := createPresenters(args)
- mfld := createManifold(chns)
- mrgr := createMergers(mfld)
- stsq := CreateStashers(stash, "IdxDocument", "IdxDocument/IdxUid", ".e2x", false, true, 50000, mrgr)
+ chns := makePresenters(args)
+ mfld := makeManifold(chns)
+ mrgr := makeMergers(mfld)
+ stsq := createStashers(stash, "IdxDocument", "IdxDocument/IdxUid", ".e2x", false, true, 50000, mrgr)
if chns == nil || mfld == nil || mrgr == nil || stsq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create extra index stasher\n")
@@ -6580,8 +6639,8 @@ func main() {
doMixed := false
deAccent := false
doASCII := false
- doStem := false
- deStop := false
+ doStem = false
+ deStop = true
// CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS
@@ -6880,7 +6939,7 @@ func main() {
case "-stems", "-stem":
doStem = true
case "-stops", "-stop":
- deStop = true
+ deStop = false
case "-unicode":
// DoUnicode = true
@@ -6962,7 +7021,7 @@ func main() {
case "stems", "stem":
doStem = true
case "stops", "stop":
- deStop = true
+ deStop = false
case "none", "default":
default:
if flgs != "" {
@@ -7005,7 +7064,7 @@ func main() {
eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
- eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup, doStem, deStop, false)
+ eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup)
// -stats prints number of CPUs and performance tuning values if no other arguments (undocumented)
if stts && len(args) < 1 {
@@ -7184,7 +7243,7 @@ func main() {
if len(args) > 0 {
switch args[0] {
case "-bioconcepts", "-generif", "-generifs":
- recordCount = CreateExternalIndexer(args, zipp, in)
+ recordCount = createExternalIndexer(args, zipp, in)
debug.FreeOSMemory()
@@ -7194,7 +7253,7 @@ func main() {
return
case "-theme", "-themes", "-dpath", "-dpaths", "-thesis":
- recordCount = CreateExternalIndexer(args, zipp, in)
+ recordCount = createExternalIndexer(args, zipp, in)
debug.FreeOSMemory()
@@ -7229,7 +7288,7 @@ func main() {
// remaining arguments are *.e2x files
// e.g., rchive -timer -distribute archive_directory *.e2x
args = args[1:]
- stsq := CreateExternalArchive(path, args)
+ stsq := createExternalArchive(path, args)
if stsq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create extra index stasher\n")
@@ -7294,9 +7353,9 @@ func main() {
}
}
- chns := CreatePresenters(args)
- mfld := CreateManifold(chns)
- mrgr := CreateMergers(mfld)
+ chns := createPresenters(args)
+ mfld := createManifold(chns)
+ mrgr := createMergers(mfld)
unsq := eutils.CreateXMLUnshuffler(mrgr)
if chns == nil || mfld == nil || mrgr == nil || unsq == nil {
@@ -7424,11 +7483,11 @@ func main() {
}
}
- chns := CreatePresenters(args)
- mfld := CreateManifold(chns)
- mrgr := CreateMergers(mfld)
+ chns := createPresenters(args)
+ mfld := createManifold(chns)
+ mrgr := createMergers(mfld)
unsq := eutils.CreateXMLUnshuffler(mrgr)
- sptr := CreateSplitter(merg, zipp, unsq)
+ sptr := createSplitter(merg, zipp, unsq)
if chns == nil || mfld == nil || mrgr == nil || unsq == nil || sptr == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create inverted index merger\n")
@@ -7515,7 +7574,7 @@ func main() {
if prom != "" && fild != "" {
- prmq := CreatePromoters(args, prom, fild)
+ prmq := createPromoters(args, prom, fild)
if prmq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create new postings file generator\n")
@@ -7564,7 +7623,7 @@ func main() {
txt := scanr.Text()
// deStop should match value used in building the indices
- recordCount += ProcessSearch(base, txt, true, false)
+ recordCount += processSearch(base, txt, true, false)
}
debug.FreeOSMemory()
@@ -7580,9 +7639,9 @@ func main() {
// deStop should match value used in building the indices
if mock {
- recordCount = ProcessMock(base, phrs, xact, rlxd)
+ recordCount = processMock(base, phrs, xact, rlxd)
} else {
- recordCount = ProcessSearch(base, phrs, xact, rlxd)
+ recordCount = processSearch(base, phrs, xact, rlxd)
}
debug.FreeOSMemory()
@@ -7597,7 +7656,7 @@ func main() {
if base != "" && trms != "" {
// deStop should match value used in building the indices
- recordCount = ProcessCount(base, trms, plrl, psns, rlxd)
+ recordCount = processCount(base, trms, plrl, psns, rlxd)
debug.FreeOSMemory()
@@ -7698,7 +7757,7 @@ func main() {
file := scanr.Text()
var arry [132]rune
- trie := MakeArchiveTrie(file, arry)
+ trie := makeArchiveTrie(file, arry)
if trie == "" || file == "" {
continue
}
@@ -7739,7 +7798,7 @@ func main() {
}
var arry [132]rune
- trie := MakeArchiveTrie(file, arry)
+ trie := makeArchiveTrie(file, arry)
if file == "" || trie == "" {
continue
@@ -7805,7 +7864,7 @@ func main() {
}
var arry [132]rune
- trie := MakeArchiveTrie(file, arry)
+ trie := makeArchiveTrie(file, arry)
if file == "" || trie == "" {
continue
@@ -7910,8 +7969,8 @@ func main() {
// -fetch without -index retrieves XML files in trie-based directory structure
if ftch != "" && indx == "" {
- uidq := CreateUIDReader(in)
- strq := CreateFetchers(ftch, ".xml", zipp, uidq)
+ uidq := createUIDReader(in)
+ strq := createFetchers(ftch, ".xml", zipp, uidq)
unsq := eutils.CreateXMLUnshuffler(strq)
if uidq == nil || strq == nil || unsq == nil {
@@ -7980,8 +8039,8 @@ func main() {
// -stream without -index retrieves compressed XML files in trie-based directory structure
if strm != "" && indx == "" {
- uidq := CreateUIDReader(in)
- strq := CreateStreamers(strm, uidq)
+ uidq := createUIDReader(in)
+ strq := createStreamers(strm, uidq)
unsq := eutils.CreateXMLUnshuffler(strq)
if uidq == nil || strq == nil || unsq == nil {
@@ -8019,8 +8078,8 @@ func main() {
// -summon retrieves link files in trie-based directory structure
if smmn != "" && indx == "" {
- uidq := CreateUIDReader(in)
- strq := CreateFetchers(smmn, ".e2x", zipp, uidq)
+ uidq := createUIDReader(in)
+ strq := createFetchers(smmn, ".e2x", zipp, uidq)
unsq := eutils.CreateXMLUnshuffler(strq)
if uidq == nil || strq == nil || unsq == nil {
@@ -8127,9 +8186,9 @@ func main() {
}
colq := eutils.CreateXMLProducer("IdxDocument", "", rdr)
- dspq := CreateDispensers(colq)
- invq := CreateInverters(dspq)
- rslq := CreateResolver(invq)
+ dspq := createDispensers(colq)
+ invq := createInverters(dspq)
+ rslq := createResolver(invq)
if colq == nil || dspq == nil || invq == nil || rslq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create inverter\n")
@@ -8251,8 +8310,8 @@ func main() {
}
chns := eutils.CreateXMLProducer("InvDocument", "", rdr)
- fusr := CreateFusers(chns)
- mrgr := CreateMergers(fusr)
+ fusr := createFusers(chns)
+ mrgr := createMergers(fusr)
unsq := eutils.CreateXMLUnshuffler(mrgr)
if chns == nil || fusr == nil || mrgr == nil || unsq == nil {
@@ -8393,7 +8452,7 @@ func main() {
}
xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
- mchq := CreateMatchers(phrs, exclude, xmlq)
+ mchq := createMatchers(phrs, exclude, xmlq)
unsq := eutils.CreateXMLUnshuffler(mchq)
if xmlq == nil || mchq == nil || unsq == nil {
@@ -8471,7 +8530,7 @@ func main() {
id = id[:idlen-2]
}
- ReportEncodedMarkup(dmgdType, id, str)
+ reportEncodedMarkup(dmgdType, id, str)
})
if timr {
@@ -8517,7 +8576,7 @@ func main() {
}
var arry [132]rune
- trie := MakeArchiveTrie(id, arry)
+ trie := makeArchiveTrie(id, arry)
if id == "" || trie == "" {
return
@@ -8636,7 +8695,7 @@ func main() {
if stsh != "" && indx != "" {
xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
- stsq := CreateStashers(stsh, parent, indx, ".xml", hshv, zipp, 1000, xmlq)
+ stsq := createStashers(stsh, parent, indx, ".xml", hshv, zipp, 1000, xmlq)
if xmlq == nil || stsq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create stash generator\n")
diff --git a/cmd/transmute.go b/cmd/transmute.go
index 603e498..85fd2e5 100644
--- a/cmd/transmute.go
+++ b/cmd/transmute.go
@@ -31,7 +31,6 @@
package main
import (
- "bufio"
"encoding/base64"
"eutils"
"fmt"
@@ -47,7 +46,6 @@ import (
"strings"
"sync"
"unicode"
- "unicode/utf8"
)
// TRANSMUTE HELP MESSAGE TEXT
@@ -337,8 +335,8 @@ Mismatch Detection (RefSeq Proteins with 3 Residue Differences from RefSeq Genom
// XML FORMATTING FUNCTIONS
-// CreateFormatters does concurrent reformatting, using flush-left to remove leading spaces
-func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+// createFormatters does concurrent reformatting, using flush-left to remove leading spaces
+func createFormatters(parent string, format string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -368,7 +366,7 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord)
if text == "" {
// should never see empty input data
- out <- eutils.XMLRecord{idx, "", text, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: text}
continue
}
@@ -378,7 +376,7 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord)
str := eutils.ChanToString(frm)
// send even if empty to get all record counts for reordering
- out <- eutils.XMLRecord{idx, "", str, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: str}
}
}
@@ -399,8 +397,8 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord)
return out
}
-// ProcessFormat reformats XML for ease of reading
-func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) {
+// processFormat reformats XML for ease of reading
+func processFormat(rdr <-chan eutils.XMLBlock, args []string) {
if rdr == nil || args == nil {
return
@@ -437,12 +435,21 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) {
switch args[0] {
case "-xml":
- xml = eutils.GetStringArg(args, "-xml argument")
- args = args[2:]
+ args = args[1:]
+ // -xml argument must be followed by value to use in xml line
+ if len(args) < 1 || strings.HasPrefix(args[0], "-") {
+ fmt.Fprintf(os.Stderr, "\nERROR: -xml argument is missing\n")
+ os.Exit(1)
+ }
+ xml = args[0]
+ args = args[1:]
case "-doctype":
- doctype = eutils.GetStringArg(args, "-doctype argument")
- args = args[2:]
-
+ args = args[1:]
+ if len(args) > 0 {
+ // if -doctype argument followed by value, use instead of DOCTYPE line
+ doctype = args[0]
+ args = args[1:]
+ }
/*
// allow setting of unicode, script, and mathml flags within -format
case "-unicode":
@@ -487,7 +494,7 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) {
}
}
- tknq := eutils.CreateTokenizer("", "", rdr)
+ tknq := eutils.CreateTokenizer(rdr)
frgs := eutils.FormatArgs{
Format: format, XML: xml, Doctype: doctype,
@@ -499,14 +506,14 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) {
eutils.ChanToStdout(frm)
}
-// ProcessTokens shows individual tokens in stream (undocumented)
-func ProcessTokens(rdr <-chan eutils.XMLBlock) {
+// processTokens shows individual tokens in stream (undocumented)
+func processTokens(rdr <-chan eutils.XMLBlock) {
if rdr == nil {
return
}
- tknq := eutils.CreateTokenizer("", "", rdr)
+ tknq := eutils.CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create debug tokenizer\n")
@@ -650,14 +657,14 @@ func ProcessTokens(rdr <-chan eutils.XMLBlock) {
}
}
-// ProcessOutline displays outline of XML structure
-func ProcessOutline(rdr <-chan eutils.XMLBlock) {
+// processOutline displays outline of XML structure
+func processOutline(rdr <-chan eutils.XMLBlock) {
if rdr == nil {
return
}
- tknq := eutils.CreateTokenizer("", "", rdr)
+ tknq := eutils.CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create outline tokenizer\n")
@@ -725,14 +732,14 @@ func ProcessOutline(rdr <-chan eutils.XMLBlock) {
}
}
-// ProcessSynopsis displays paths to XML elements
-func ProcessSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) {
+// processSynopsis displays paths to XML elements
+func processSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) {
if rdr == nil {
return
}
- tknq := eutils.CreateTokenizer("", "", rdr)
+ tknq := eutils.CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create synopsis tokenizer\n")
@@ -840,14 +847,14 @@ func ProcessSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) {
}
}
-// ProcessFilter modifies XML content, comments, or CDATA
-func ProcessFilter(rdr <-chan eutils.XMLBlock, args []string) {
+// processFilter modifies XML content, comments, or CDATA
+func processFilter(rdr <-chan eutils.XMLBlock, args []string) {
if rdr == nil || args == nil {
return
}
- tknq := eutils.CreateTokenizer("", "", rdr)
+ tknq := eutils.CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create filter tokenizer\n")
@@ -1155,7 +1162,7 @@ func ProcessFilter(rdr <-chan eutils.XMLBlock, args []string) {
// STRING CONVERTERS
-func EncodeURL(inp io.Reader) {
+func encodeURL(inp io.Reader) {
if inp == nil {
return
@@ -1173,7 +1180,7 @@ func EncodeURL(inp io.Reader) {
}
}
-func DecodeURL(inp io.Reader) {
+func decodeURL(inp io.Reader) {
if inp == nil {
return
@@ -1191,7 +1198,7 @@ func DecodeURL(inp io.Reader) {
}
}
-func EncodeB64(inp io.Reader) {
+func encodeB64(inp io.Reader) {
if inp == nil {
return
@@ -1207,7 +1214,7 @@ func EncodeB64(inp io.Reader) {
}
}
-func DecodeB64(inp io.Reader) {
+func decodeB64(inp io.Reader) {
if inp == nil {
return
@@ -1224,7 +1231,7 @@ func DecodeB64(inp io.Reader) {
}
}
-func DecodeHGVS(inp io.Reader) {
+func decodeHGVS(inp io.Reader) {
if inp == nil {
return
@@ -1243,8 +1250,8 @@ func DecodeHGVS(inp io.Reader) {
// COLUMN ALIGNMENT FORMATTER
-// ProcessAlign aligns a tab-delimited table by individual column widths
-func ProcessAlign(inp io.Reader, args []string) {
+// processAlign aligns a tab-delimited table by individual column widths
+func processAlign(inp io.Reader, args []string) {
// tab-delimited-table to padded-by-spaces alignment inspired by
// Steve Kinzler's align script - see http://kinzler.com/me/align/
@@ -1253,13 +1260,9 @@ func ProcessAlign(inp io.Reader, args []string) {
return
}
- spcs := " "
-
- mrg := ""
- pad := " "
-
- lettrs := make(map[int]rune)
- lst := 'l'
+ mrg := 0
+ pdg := 0
+ aln := ""
// skip past command name
args = args[1:]
@@ -1268,19 +1271,13 @@ func ProcessAlign(inp io.Reader, args []string) {
switch args[0] {
case "-g":
- val := eutils.GetNumericArg(args, "-g spacing between columns", 0, 1, 30)
- pad = spcs[0:val]
+ pdg = eutils.GetNumericArg(args, "-g spacing between columns", 0, 1, 30)
args = args[2:]
case "-h":
- val := eutils.GetNumericArg(args, "-i indent before columns", 0, 1, 30)
- mrg = spcs[0:val]
+ mrg = eutils.GetNumericArg(args, "-i indent before columns", 0, 1, 30)
args = args[2:]
case "-a":
- val := eutils.GetStringArg(args, "-a column alignment code string")
- for i, ch := range val {
- lettrs[i] = ch
- lst = ch
- }
+ aln = eutils.GetStringArg(args, "-a column alignment code string")
args = args[2:]
default:
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -align command\n")
@@ -1288,232 +1285,21 @@ func ProcessAlign(inp io.Reader, args []string) {
}
}
- var arry []string
-
- width := make(map[int]int)
- whole := make(map[int]int)
- fract := make(map[int]int)
-
- scanr := bufio.NewScanner(inp)
-
- row := 0
- numCols := 0
-
- // allows leading plus or minus, digits interspersed with optional commas, decimal point, and digits
- isNumeric := func(str string) bool {
+ algn := eutils.AlignColumns(inp, mrg, pdg, aln)
- has_num := false
- has_period := false
-
- for i, ch := range str {
- switch ch {
- case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
- has_num = true
- case '+', '-':
- if i > 0 {
- return false
- }
- case '.':
- has_period = true
- case ',':
- if has_period {
- return false
- }
- default:
- return false
- }
- }
-
- return has_num
- }
-
- processLine := func(line string) string {
-
- var flds []string
-
- cols := strings.Split(line, "\t")
- if numCols == 0 {
- numCols = len(cols)
- } else if numCols != len(cols) {
- fmt.Fprintf(os.Stderr, "ERROR: Mismatched number of columns in row ")
- fmt.Fprintf(os.Stderr, strconv.Itoa(row))
- fmt.Fprintf(os.Stderr, ": actual ")
- fmt.Fprintf(os.Stderr, strconv.Itoa(len(cols)))
- fmt.Fprintf(os.Stderr, ", expected ")
- fmt.Fprintf(os.Stderr, strconv.Itoa(numCols))
- fmt.Fprintf(os.Stderr, "\n")
- // os.Exit(1)
- }
-
- for i, str := range cols {
-
- str = eutils.CompressRunsOfSpaces(str)
- str = strings.TrimSpace(str)
-
- flds = append(flds, str)
-
- // determine maximum length in each column
- ln := utf8.RuneCountInString(str)
- if ln > width[i] {
- width[i] = ln
- }
-
- code, ok := lettrs[i]
- if !ok {
- code = lst
- }
-
- switch code {
- case 'n', 'N', 'z', 'Z':
- if isNumeric(str) {
- // determine maximum length of decimal number parts
- wh, fr := eutils.SplitInTwoLeft(str, ".")
- if fr != "" {
- fr = "." + fr
- }
-
- lf := utf8.RuneCountInString(wh)
- if lf > whole[i] {
- whole[i] = lf
- }
- rt := utf8.RuneCountInString(fr)
- if rt > fract[i] {
- fract[i] = rt
- }
- ln = whole[i] + fract[i]
- if ln > width[i] {
- width[i] = ln
- }
- }
- }
- }
-
- return strings.Join(flds, "\t")
- }
-
- for i := 0; i < numCols; i++ {
-
- code, ok := lettrs[i]
- if !ok {
- code = lst
- }
-
- switch code {
- case 'n', 'N', 'z', 'Z':
- // adjust maximum widths with aligned decimal points
- ln := whole[i] + fract[i]
- if ln > width[i] {
- width[i] = ln
- }
- }
- }
-
- // clean up spaces, calculate column widths
- for scanr.Scan() {
-
- row++
- line := scanr.Text()
- if line == "" {
- continue
- }
-
- line = processLine(line)
- arry = append(arry, line)
+ if algn == nil {
+ fmt.Fprintf(os.Stderr, "\nERROR: Unable to create alignment function\n")
+ os.Exit(1)
}
- var buffer strings.Builder
-
- for _, line := range arry {
-
- buffer.Reset()
+ eutils.ChanToStdout(algn)
- cols := strings.Split(line, "\t")
-
- btwn := mrg
- for i, str := range cols {
-
- buffer.WriteString(btwn)
-
- code, ok := lettrs[i]
- if !ok {
- code = lst
- }
-
- ln := utf8.RuneCountInString(str)
- mx := width[i]
- diff := mx - ln
- lft := 0
- rgt := 0
- lft_pad := " "
- rgt_pad := " "
-
- if diff > 0 {
- switch code {
- case 'l':
- rgt = diff
- case 'c':
- lft = diff / 2
- rgt = diff - lft
- case 'r':
- lft = diff
- case 'n', 'N', 'z', 'Z':
- lft = diff
- if isNumeric(str) {
- switch code {
- case 'N':
- rgt_pad = "0"
- case 'z':
- lft_pad = "0"
- case 'Z':
- lft_pad = "0"
- rgt_pad = "0"
- }
- sn := whole[i]
- rc := fract[i]
- wh, fr := eutils.SplitInTwoLeft(str, ".")
- if fract[i] > 0 {
- if fr == "" {
- fr = "."
- } else {
- fr = "." + fr
- }
- lf := utf8.RuneCountInString(wh)
- lft = sn - lf
- rt := utf8.RuneCountInString(fr)
- rgt = rc - rt
- str = wh + fr
- }
- }
- default:
- rgt = diff
- }
- }
-
- for lft > 0 {
- lft--
- buffer.WriteString(lft_pad)
- }
-
- buffer.WriteString(str)
- btwn = pad
-
- for rgt > 0 {
- rgt--
- buffer.WriteString(rgt_pad)
- }
- }
-
- txt := buffer.String()
- txt = strings.TrimRight(txt, " ")
-
- os.Stdout.WriteString(txt)
- os.Stdout.WriteString("\n")
- }
+ return
}
// SEQUENCE EDITING
-func SequenceRemove(inp io.Reader, args []string) {
+func sequenceRemove(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1552,7 +1338,7 @@ func SequenceRemove(inp io.Reader, args []string) {
}
}
-func SequenceRetain(inp io.Reader, args []string) {
+func sequenceRetain(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1589,7 +1375,7 @@ func SequenceRetain(inp io.Reader, args []string) {
}
}
-func SequenceReplace(inp io.Reader, args []string) {
+func sequenceReplace(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1636,7 +1422,7 @@ func SequenceReplace(inp io.Reader, args []string) {
}
}
-func SequenceExtract(inp io.Reader, args []string) {
+func sequenceExtract(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1665,8 +1451,8 @@ func SequenceExtract(inp io.Reader, args []string) {
// REVERSE SEQUENCE
-// SeqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order
-func SeqFlip(inp io.Reader) {
+// seqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order
+func seqFlip(inp io.Reader) {
if inp == nil {
return
@@ -1684,7 +1470,7 @@ func SeqFlip(inp io.Reader) {
// REVERSE COMPLEMENT
-func NucRevComp(inp io.Reader) {
+func nucRevComp(inp io.Reader) {
if inp == nil {
return
@@ -1702,7 +1488,7 @@ func NucRevComp(inp io.Reader) {
// FASTA DIFFERENCES
-func PrintFastaPairs(frst, scnd string) {
+func printFastaPairs(frst, scnd string) {
frst = strings.ToLower(frst)
scnd = strings.ToLower(scnd)
@@ -1782,7 +1568,7 @@ func PrintFastaPairs(frst, scnd string) {
}
}
-func FastaDiff(inp io.Reader, args []string) {
+func fastaDiff(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1807,18 +1593,18 @@ func FastaDiff(inp io.Reader, args []string) {
}
// sequences are assumed to be aligned, this code highlight mismatches
- PrintFastaPairs(frstFasta, scndFasta)
+ printFastaPairs(frstFasta, scndFasta)
}
// PROTEIN WEIGHT
-func ProtWeight(inp io.Reader, args []string) {
+func protWeight(inp io.Reader, args []string) {
if inp == nil {
return
}
- trim_leading_met := true
+ trimLeadingMet := true
// skip past command name
args = args[1:]
@@ -1827,7 +1613,7 @@ func ProtWeight(inp io.Reader, args []string) {
switch args[0] {
case "-met":
- trim_leading_met = false
+ trimLeadingMet = false
args = args[1:]
default:
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -molwt command\n")
@@ -1837,7 +1623,7 @@ func ProtWeight(inp io.Reader, args []string) {
str := eutils.ReadAllIntoSequence(inp)
- str = eutils.ProteinWeight(str, trim_leading_met)
+ str = eutils.ProteinWeight(str, trimLeadingMet)
os.Stdout.WriteString(str)
if !strings.HasSuffix(str, "\n") {
@@ -1845,8 +1631,8 @@ func ProtWeight(inp io.Reader, args []string) {
}
}
-// CdRegionToProtein reads all of stdin as sequence data
-func CdRegionToProtein(inp io.Reader, args []string) {
+// cdRegionToProtein reads all of stdin as sequence data
+func cdRegionToProtein(inp io.Reader, args []string) {
if inp == nil {
return
@@ -1854,11 +1640,11 @@ func CdRegionToProtein(inp io.Reader, args []string) {
genCode := 1
frame := 0
- include_stop := false
- do_every_codon := false
- remove_trailing_X := false
- is_5prime_complete := true
- is_3prime_complete := true
+ includeStop := false
+ doEveryCodon := false
+ removeTrailingX := false
+ is5primeComplete := true
+ is3primeComplete := true
repeat := 1
@@ -1875,19 +1661,19 @@ func CdRegionToProtein(inp io.Reader, args []string) {
frame = eutils.GetNumericArg(args, "offset into coding sequence", 0, 1, 30)
args = args[2:]
case "-stop", "-stops":
- include_stop = true
+ includeStop = true
args = args[1:]
case "-every", "-all":
- do_every_codon = true
+ doEveryCodon = true
args = args[1:]
case "-trim", "-trailing":
- remove_trailing_X = true
+ removeTrailingX = true
args = args[1:]
case "-part5", "-partial5", "-lt5":
- is_5prime_complete = false
+ is5primeComplete = false
args = args[1:]
case "-part3", "-partial3", "-gt3":
- is_3prime_complete = false
+ is3primeComplete = false
args = args[1:]
case "-repeat":
repeat = eutils.GetNumericArg(args, "number of repetitions for testing", 1, 1, 100)
@@ -1903,7 +1689,7 @@ func CdRegionToProtein(inp io.Reader, args []string) {
for i := 0; i < repeat; i++ {
// repeat multiple times for performance testing (undocumented)
- str := eutils.TranslateCdRegion(txt, genCode, frame, include_stop, do_every_codon, remove_trailing_X, is_5prime_complete, is_3prime_complete)
+ str := eutils.TranslateCdRegion(txt, genCode, frame, includeStop, doEveryCodon, removeTrailingX, is5primeComplete, is3primeComplete)
os.Stdout.WriteString(str)
if !strings.HasSuffix(str, "\n") {
@@ -1938,8 +1724,6 @@ func main() {
doMixed := false
deAccent := false
doASCII := false
- doStem := false
- deStop := false
/*
doUnicode := false
@@ -2048,9 +1832,9 @@ func main() {
// previously visible processing flags (undocumented)
case "-stems", "-stem":
- doStem = true
+ // ignore
case "-stops", "-stop":
- deStop = true
+ // ignore
// allow setting of unicode, script, and mathml flags (undocumented)
case "-unicode":
@@ -2115,9 +1899,9 @@ func main() {
case "mixed":
doMixed = true
case "stems", "stem":
- doStem = true
+ // ignore
case "stops", "stop":
- deStop = true
+ // ignore
case "none", "default":
default:
if flgs != "" {
@@ -2160,7 +1944,7 @@ func main() {
eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
- eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup, doStem, deStop, false)
+ eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup)
// -stats prints number of CPUs and performance tuning values if no other arguments (undocumented)
if stts && len(args) < 1 {
@@ -2279,6 +2063,9 @@ func main() {
return nxt, true
}
+ // The several converter functions that follow must be called
+ // before CreateXMLStreamer starts draining stdin
+
// JSON TO XML CONVERTER
if args[0] == "-j2x" || args[0] == "-json2xml" {
@@ -2441,7 +2228,6 @@ func main() {
// READ TAB-DELIMITED FILE AND WRAP IN XML FIELDS
- // must be called before CreateXMLStreamer starts draining stdin
doTable := func(delim string) {
// skip past command name
@@ -2567,7 +2353,6 @@ func main() {
// READ GENBANK FLATFILE AND TRANSLATE TO INSDSEQ XML
- // must be called before CreateXMLStreamer starts draining stdin
if len(args) > 0 && args[0] == "-g2x" {
gbk := eutils.GenBankConverter(in)
@@ -2627,35 +2412,35 @@ func main() {
switch args[0] {
case "-encodeURL":
- EncodeURL(in)
+ encodeURL(in)
case "-decodeURL":
- DecodeURL(in)
+ decodeURL(in)
case "-encode64", "-encodeB64", "-encodeBase64":
- EncodeB64(in)
+ encodeB64(in)
case "-decode64", "-decodeB64", "-decodeBase64":
- DecodeB64(in)
+ decodeB64(in)
case "-hgvs":
- DecodeHGVS(in)
+ decodeHGVS(in)
case "-align":
- ProcessAlign(in, args)
+ processAlign(in, args)
case "-remove":
- SequenceRemove(in, args)
+ sequenceRemove(in, args)
case "-retain":
- SequenceRetain(in, args)
+ sequenceRetain(in, args)
case "-replace":
- SequenceReplace(in, args)
+ sequenceReplace(in, args)
case "-extract":
- SequenceExtract(in, args)
+ sequenceExtract(in, args)
case "-revcomp":
- NucRevComp(in)
+ nucRevComp(in)
case "-reverse":
- SeqFlip(in)
+ seqFlip(in)
case "-molwt":
- ProtWeight(in, args)
+ protWeight(in, args)
case "-cds2prot":
- CdRegionToProtein(in, args)
+ cdRegionToProtein(in, args)
case "-diff":
- FastaDiff(in, args)
+ fastaDiff(in, args)
default:
// if not any of the conversion commands, keep going
inSwitch = false
@@ -2701,9 +2486,9 @@ func main() {
switch args[0] {
case "-format":
- ProcessFormat(rdr, args)
+ processFormat(rdr, args)
case "-filter":
- ProcessFilter(rdr, args)
+ processFilter(rdr, args)
case "-normalize", "-normal":
if len(args) < 2 {
fmt.Fprintf(os.Stderr, "\nERROR: No database supplied to -normalize\n")
@@ -2713,7 +2498,7 @@ func main() {
nrm := eutils.NormalizeXML(rdr, db)
eutils.ChanToStdout(nrm)
case "-outline":
- ProcessOutline(rdr)
+ processOutline(rdr)
case "-contour":
leaf = true
fallthrough
@@ -2726,9 +2511,9 @@ func main() {
delim = "/"
}
}
- ProcessSynopsis(rdr, leaf, delim)
+ processSynopsis(rdr, leaf, delim)
case "-tokens":
- ProcessTokens(rdr)
+ processTokens(rdr)
default:
// if not any of the formatting commands, keep going
inSwitch = false
@@ -2893,7 +2678,7 @@ func main() {
}
xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
- fchq := CreateFormatters(topPattern, format, xmlq)
+ fchq := createFormatters(topPattern, format, xmlq)
unsq := eutils.CreateXMLUnshuffler(fchq)
if xmlq == nil || fchq == nil || unsq == nil {
diff --git a/cmd/xtract.go b/cmd/xtract.go
index 45e6cf9..b06bcbe 100644
--- a/cmd/xtract.go
+++ b/cmd/xtract.go
@@ -774,10 +774,19 @@ Remove Suffix
%% ${FILE%%.*} -> example
`
+// GLOBAL VARIABLES
+
+var (
+ doStem bool
+ deStop bool
+)
+
// TYPED CONSTANTS
+// LevelType is the integer type for exploration arguments
type LevelType int
+// LevelType keys for exploration arguments
const (
_ LevelType = iota
UNIT
@@ -791,8 +800,10 @@ const (
PATTERN
)
+// IndentType is the integer type for XML formatting
type IndentType int
+// IndentType keys for XML formatting
const (
SINGULARITY IndentType = iota
COMPACT
@@ -802,8 +813,10 @@ const (
WRAPPED
)
+// OpType is the integer type for operations
type OpType int
+// OpType keys for operations
const (
UNSET OpType = iota
ELEMENT
@@ -920,8 +933,10 @@ const (
UNRECOGNIZED
)
+// ArgumentType is the integer type for argument classification
type ArgumentType int
+// ArgumentType keys for argument classification
const (
_ ArgumentType = iota
EXPLORATION
@@ -930,8 +945,10 @@ const (
CUSTOMIZATION
)
+// RangeType is the integer type for element range choices
type RangeType int
+// RangeType keys for element range choices
const (
NORANGE RangeType = iota
STRINGRANGE
@@ -939,8 +956,10 @@ const (
INTEGERRANGE
)
+// SeqEndType is used for -ucsc-based decisions
type SeqEndType int
+// SeqEndType keys for -ucsc-based decisions
const (
_ SeqEndType = iota
ISSTART
@@ -948,6 +967,7 @@ const (
ISPOS
)
+// SequenceType is used to record XML tag and position for -ucsc-based
type SequenceType struct {
Based int
Which SeqEndType
@@ -1776,6 +1796,7 @@ var ncbi4naToIupac = map[int]string{
// DATA OBJECTS
+// Step contains parameters for executing a single command step
type Step struct {
Type OpType
Value string
@@ -1792,12 +1813,14 @@ type Step struct {
Wild bool
}
+// Operation breaks commands into sequential steps
type Operation struct {
Type OpType
Value string
Stages []*Step
}
+// Block contains nested instructions for executing commands
type Block struct {
Visit string
Parent string
@@ -1814,6 +1837,7 @@ type Block struct {
Subtasks []*Block
}
+// Limiter is used for collecting specific nodes (e.g., first and last)
type Limiter struct {
Obj *eutils.XMLNode
Idx int
@@ -1822,18 +1846,58 @@ type Limiter struct {
// UTILITIES
-func ParseFlag(str string) OpType {
+func hasSpaceOrHyphen(str string) bool {
+
+ for _, ch := range str {
+ if ch == ' ' || ch == '-' {
+ return true
+ }
+ }
+
+ return false
+}
+
+func isAllCapsOrDigits(str string) bool {
+
+ for _, ch := range str {
+ if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) {
+ return false
+ }
+ }
+
+ return true
+}
+
+// sortStringByWords sorts the individual words in a string
+func sortStringByWords(str string) string {
+
+ str = eutils.RemoveCommaOrSemicolon(str)
+
+ // check for multiple words
+ if hasSpaceOrHyphen(str) {
+ flds := strings.Fields(str)
+ sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] })
+ str = strings.Join(flds, " ")
+ str = strings.Replace(str, "-", " ", -1)
+ str = eutils.CompressRunsOfSpaces(str)
+ str = strings.TrimRight(str, ".?:")
+ }
+
+ return str
+}
+
+func parseFlag(str string) OpType {
op, ok := opTypeIs[str]
if ok {
return op
}
- if len(str) > 1 && str[0] == '-' && eutils.IsAllCapsOrDigits(str[1:]) {
+ if len(str) > 1 && str[0] == '-' && isAllCapsOrDigits(str[1:]) {
return VARIABLE
}
- if len(str) > 2 && strings.HasPrefix(str, "--") && eutils.IsAllCapsOrDigits(str[2:]) {
+ if len(str) > 2 && strings.HasPrefix(str, "--") && isAllCapsOrDigits(str[2:]) {
return ACCUMULATOR
}
@@ -1844,7 +1908,7 @@ func ParseFlag(str string) OpType {
return UNSET
}
-func ParseMarkup(str, cmd string) int {
+func parseMarkup(str, cmd string) int {
switch str {
case "fuse", "fused":
@@ -1907,8 +1971,8 @@ func DebugBlock(blk *Block, depth int) {
// PARSE COMMAND-LINE ARGUMENTS
-// ParseArguments parses nested exploration instruction from command-line arguments
-func ParseArguments(cmdargs []string, pttrn string) *Block {
+// parseArguments parses nested exploration instruction from command-line arguments
+func parseArguments(cmdargs []string, pttrn string) *Block {
// different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line
// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
@@ -2272,7 +2336,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
if len(str) > 1 {
switch str[0] {
case '&':
- if eutils.IsAllCapsOrDigits(str[1:]) {
+ if isAllCapsOrDigits(str[1:]) {
status = VARIABLE
str = str[1:]
} else if strings.Contains(str, ":") {
@@ -2373,7 +2437,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
switch status {
case UNSET:
- status = ParseFlag(str)
+ status = parseFlag(str)
case POSITION:
if cmds.Position != "" {
fmt.Fprintf(os.Stderr, "\nERROR: -position '%s' conflicts with existing '%s'\n", str, cmds.Position)
@@ -2424,7 +2488,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
// first character may be backslash protecting dash (undocumented)
str = str[1:]
}
- str = eutils.SortStringByWords(str)
+ str = sortStringByWords(str)
tsk := &Step{Type: status, Value: str}
op.Stages = append(op.Stages, tsk)
op = nil
@@ -2560,7 +2624,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
// parse next argument
nextStatus := func(str string) OpType {
- status := ParseFlag(str)
+ status := parseFlag(str)
switch status {
case VARIABLE:
@@ -2629,7 +2693,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
if len(item) > 1 {
switch item[0] {
case '&':
- if eutils.IsAllCapsOrDigits(item[1:]) {
+ if isAllCapsOrDigits(item[1:]) {
status = VARIABLE
item = item[1:]
} else {
@@ -2901,7 +2965,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
}
}
- // ParseArguments
+ // parseArguments
head := &Block{}
@@ -2955,140 +3019,8 @@ func ParseArguments(cmdargs []string, pttrn string) *Block {
return head
}
-// ExploreElements returns matching element values to callback
-func ExploreElements(curr *eutils.XMLNode, mask, prnt, match, attrib string, wildcard, unescape bool, level int, proc func(string, int)) {
-
- if curr == nil || proc == nil {
- return
- }
-
- // **/Object performs deep exploration of recursive data (*/Object also supported)
- deep := false
- if prnt == "**" || prnt == "*" {
- prnt = ""
- deep = true
- }
-
- var exploreChildren func(curr *eutils.XMLNode, acc func(string))
-
- exploreChildren = func(curr *eutils.XMLNode, acc func(string)) {
-
- if curr.Contents != "" {
- acc(curr.Contents)
- }
- for chld := curr.Children; chld != nil; chld = chld.Next {
- if chld.Name != "" {
- acc("<" + chld.Name + ">")
- }
- exploreChildren(chld, acc)
- if chld.Name != "" {
- acc("</" + chld.Name + ">")
- }
- }
- }
-
- // exploreElements recursive definition
- var exploreElements func(curr *eutils.XMLNode, skip string, lev int)
-
- exploreElements = func(curr *eutils.XMLNode, skip string, lev int) {
-
- if !deep && curr.Name == skip {
- // do not explore within recursive object
- return
- }
-
- if curr.Name == match ||
- // parent/* matches any subfield
- (match == "*" && prnt != "") ||
- // wildcard (internal colon) matches any namespace prefix
- (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) ||
- (match == "" && attrib != "") {
-
- if prnt == "" ||
- curr.Parent == prnt ||
- (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) {
-
- if attrib != "" {
- if curr.Attributes != "" && curr.Attribs == nil {
- // parse attributes on-the-fly if queried
- curr.Attribs = eutils.ParseAttributes(curr.Attributes)
- }
- for i := 0; i < len(curr.Attribs)-1; i += 2 {
- // attributes now parsed into array as [ tag, value, tag, value, tag, value, ... ]
- if curr.Attribs[i] == attrib ||
- (wildcard && strings.HasPrefix(attrib, ":") && strings.HasSuffix(curr.Attribs[i], attrib)) {
- proc(curr.Attribs[i+1], level)
- return
- }
- }
-
- } else if curr.Contents != "" {
-
- str := curr.Contents[:]
-
- if unescape && eutils.HasAmpOrNotASCII(str) {
- // processing of <, >, &, ", and ' characters is now delayed until element contents is requested
- str = html.UnescapeString(str)
- }
-
- proc(str, level)
- return
-
- } else if curr.Children != nil {
-
- if eutils.DoMixed() {
- // match with mixed contents - send all child strings
- var buffr strings.Builder
- exploreChildren(curr, func(str string) {
- if str != "" {
- buffr.WriteString(str)
- }
- })
- str := buffr.String()
-
- // clean up reconstructed mixed content
- str = eutils.DoTrimFlankingHTML(str)
- if eutils.HasBadSpace(str) {
- str = eutils.CleanupBadSpaces(str)
- }
- if eutils.HasAdjacentSpaces(str) {
- str = eutils.CompressRunsOfSpaces(str)
- }
- if eutils.NeedsTightening(str) {
- str = eutils.TightenParentheses(str)
- }
- if unescape && eutils.HasAmpOrNotASCII(str) {
- str = html.UnescapeString(str)
- }
-
- proc(str, level)
- return
- }
-
- // for XML container object, send empty string to callback to increment count
- proc("", level)
- // and continue exploring
-
- } else if curr.Attributes != "" {
-
- // for self-closing object, indicate presence by sending empty string to callback
- proc("", level)
- return
- }
- }
- }
-
- for chld := curr.Children; chld != nil; chld = chld.Next {
- // inner exploration is subject to recursive object exclusion
- exploreElements(chld, mask, lev+1)
- }
- }
-
- exploreElements(curr, "", level)
-}
-
-// PrintSubtree supports compression styles selected by -element "*" through "****"
-func PrintSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) {
+// printSubtree supports compression styles selected by -element "*" through "****"
+func printSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) {
if node == nil || proc == nil {
return
@@ -3258,8 +3190,8 @@ var (
replx map[string]*regexp.Regexp
)
-// ProcessClause handles comma-separated -element arguments
-func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) {
+// processClause handles comma-separated -element arguments
+func processClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) {
if curr == nil || stages == nil {
return "", false
@@ -3298,9 +3230,9 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
wildcard := stage.Wild
unescape := (stat != INDICES)
- // exploreElements is a wrapper for ExploreElements, obtaining most arguments as closures
+ // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures
exploreElements := func(proc func(string, int)) {
- ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
+ eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
}
// sendSlice applies optional [min:max] range restriction and sends result to accumulator
@@ -3579,7 +3511,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
case ORDER:
exploreElements(func(str string, lvl int) {
if str != "" {
- str = eutils.SortStringByWords(str)
+ str = sortStringByWords(str)
sendSlice(str)
}
})
@@ -3729,7 +3661,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
var buffer strings.Builder
- PrintSubtree(curr, style, printAttrs,
+ printSubtree(curr, style, printAttrs,
func(str string) {
if str != "" {
buffer.WriteString(str)
@@ -4223,7 +4155,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
str = eutils.RepairTableMarkup(str, eutils.SPACE)
str = eutils.RepairScriptMarkup(str, eutils.SPACE)
str = eutils.RepairMathMLMarkup(str, eutils.SPACE)
- // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in ExploreElements
+ // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in eutils.ExploreElements
str = eutils.RemoveEmbeddedMarkup(str)
}
@@ -4301,7 +4233,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
}
// optional stop word removal
- if eutils.DeStop() && eutils.IsStopWord(item) {
+ if deStop && eutils.IsStopWord(item) {
continue
}
@@ -4404,12 +4336,12 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
})
for _, item := range words {
item = strings.ToLower(item)
- if eutils.DeStop() {
+ if deStop {
if eutils.IsStopWord(item) {
continue
}
}
- if eutils.DoStem() {
+ if doStem {
item = porter2.Stem(item)
item = strings.TrimSpace(item)
}
@@ -4448,13 +4380,13 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
continue
}
item = strings.ToLower(item)
- if eutils.DeStop() {
+ if deStop {
if eutils.IsStopWord(item) {
past = ""
continue
}
}
- if eutils.DoStem() {
+ if doStem {
item = porter2.Stem(item)
item = strings.TrimSpace(item)
}
@@ -4485,12 +4417,12 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
}
for _, item := range words {
item = strings.ToLower(item)
- if eutils.DeStop() {
+ if deStop {
if eutils.IsStopWord(item) {
continue
}
}
- if eutils.DoStem() {
+ if doStem {
item = porter2.Stem(item)
item = strings.TrimSpace(item)
}
@@ -4680,8 +4612,8 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p
return txt, true
}
-// ProcessInstructions performs extraction commands on a subset of XML
-func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
+// processInstructions performs extraction commands on a subset of XML
+func processInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
if accum == nil {
return tab, ret
@@ -4731,7 +4663,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab,
TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, ACCENTED,
NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED,
REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS:
- txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
+ txt, ok := processClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
if ok {
plg = ""
lst = elg
@@ -4744,7 +4676,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab,
}
}
case HISTOGRAM:
- txt, ok := ProcessClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram)
+ txt, ok := processClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram)
if ok {
accum(txt)
}
@@ -4878,7 +4810,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab,
// -if "&VARIABLE" will fail if initialized with empty string ""
delete(variables, varname)
} else {
- txt, ok := ProcessClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
+ txt, ok := processClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
if ok {
plg = ""
lst = elg
@@ -4910,8 +4842,8 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab,
// CONDITIONAL EXECUTION USES -if AND -unless STATEMENT, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS
-// ConditionsAreSatisfied tests a set of conditions to determine if extraction should proceed
-func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool {
+// conditionsAreSatisfied tests a set of conditions to determine if extraction should proceed
+func conditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool {
if curr == nil {
return false
@@ -4955,9 +4887,9 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
found := false
number := ""
- // exploreElements is a wrapper for ExploreElements, obtaining most arguments as closures
+ // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures
exploreElements := func(proc func(string, int)) {
- ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
+ eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
}
// test string or numeric constraints
@@ -5014,7 +4946,7 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
return true
}
case RESEMBLES:
- if eutils.SortStringByWords(str) == strings.ToLower(val) {
+ if sortStringByWords(str) == strings.ToLower(val) {
return true
}
default:
@@ -5027,13 +4959,13 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
switch ch {
case '#':
count := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
count++
})
val = strconv.Itoa(count)
case '%':
length := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
if stn != "" {
length += len(stn)
}
@@ -5041,12 +4973,12 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
val = strconv.Itoa(length)
case '^':
depth := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
depth = lvl
})
val = strconv.Itoa(depth)
default:
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
if stn != "" {
val = stn
}
@@ -5075,13 +5007,13 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
switch ch {
case '#':
count := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
count++
})
val = strconv.Itoa(count)
case '%':
length := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
if stn != "" {
length += len(stn)
}
@@ -5089,12 +5021,12 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
val = strconv.Itoa(length)
case '^':
depth := 0
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
depth = lvl
})
val = strconv.Itoa(depth)
default:
- ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
+ eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
if stn != "" {
_, errz := strconv.Atoi(stn)
if errz == nil {
@@ -5391,8 +5323,8 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask
// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE
-// ProcessCommands visits XML nodes, performs conditional tests, and executes data extraction instructions
-func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
+// processCommands visits XML nodes, performs conditional tests, and executes data extraction instructions
+func processCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
if accum == nil {
return tab, ret
@@ -5424,23 +5356,23 @@ func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index,
processNode := func(node *eutils.XMLNode, idx, lvl int) {
// apply -if or -unless tests
- if ConditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) {
+ if conditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) {
// execute data extraction commands
if len(cmds.Commands) > 0 {
- tab, ret = ProcessInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
+ tab, ret = processInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
}
// process sub commands on child node
for _, sub := range cmds.Subtasks {
- tab, ret = ProcessCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum)
+ tab, ret = processCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum)
}
} else {
// execute commands after -else statement
if len(cmds.Failure) > 0 {
- tab, ret = ProcessInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
+ tab, ret = processInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
}
}
}
@@ -5677,8 +5609,8 @@ func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index,
// PROCESS ONE XML COMPONENT RECORD
-// ProcessQuery perform data extraction driven by command-line arguments
-func ProcessQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string {
+// processQuery perform data extraction driven by command-line arguments
+func processQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string {
if text == "" || cmds == nil {
return ""
@@ -5706,7 +5638,7 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s
if cmds.Position == "select" {
- if ConditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) {
+ if conditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) {
ok = true
buffer.WriteString(text)
ret = "\n"
@@ -5715,7 +5647,7 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s
} else {
// start processing at top of command tree and top of XML subregion selected by -pattern
- _, ret = ProcessCommands(cmds, pat, "", "", index, 1, variables, transform, histogram,
+ _, ret = processCommands(cmds, pat, "", "", index, 1, variables, transform, histogram,
func(str string) {
if str != "" {
ok = true
@@ -5752,8 +5684,8 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s
// e.g., xtract -insd complete mat_peptide "%peptide" product peptide
-// ProcessINSD generates extraction commands for GenBank/RefSeq records in INSDSet format
-func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
+// processINSD generates extraction commands for GenBank/RefSeq records in INSDSet format
+func processINSD(args []string, isPipe, addDash, doIndex bool) []string {
// legal GenBank / GenPept / RefSeq features
@@ -6457,8 +6389,8 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
// BIOTHINGS EXTRACTION COMMAND GENERATOR
-// ProcessBiopath generates extraction commands for BioThings resources (undocumented)
-func ProcessBiopath(args []string, isPipe bool) []string {
+// processBiopath generates extraction commands for BioThings resources (undocumented)
+func processBiopath(args []string, isPipe bool) []string {
// nquire -get "http://myvariant.info/v1/variant/chr6:g.26093141G>A" \
// -fields clinvar.rcv.conditions.identifiers \
@@ -6515,8 +6447,8 @@ func ProcessBiopath(args []string, isPipe bool) []string {
// HYDRA CITATION MATCHER COMMAND GENERATOR
-// ProcessHydra generates extraction commands for NCBI's in-house citation matcher (undocumented)
-func ProcessHydra(isPipe bool) []string {
+// processHydra generates extraction commands for NCBI's in-house citation matcher (undocumented)
+func processHydra(isPipe bool) []string {
var acc []string
@@ -6533,8 +6465,8 @@ func ProcessHydra(isPipe bool) []string {
// ENTREZ2INDEX COMMAND GENERATOR
-// ProcessE2Index generates extraction commands to create input for Entrez2Index
-func ProcessE2Index(args []string, tform string, isPipe bool) []string {
+// processE2Index generates extraction commands to create input for Entrez2Index
+func processE2Index(args []string, tform string, isPipe bool) []string {
var acc []string
@@ -6548,7 +6480,29 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string {
patrn := args[0]
args = args[1:]
- if eutils.IsAllNumeric(patrn) {
+ isAllNumeric := func(str string) bool {
+
+ for _, ch := range str {
+ if !unicode.IsDigit(ch) &&
+ ch != '.' &&
+ ch != '+' &&
+ ch != '-' &&
+ ch != '*' &&
+ ch != '/' &&
+ ch != ',' &&
+ ch != '$' &&
+ ch != '#' &&
+ ch != '%' &&
+ ch != '(' &&
+ ch != ')' {
+ return false
+ }
+ }
+
+ return true
+ }
+
+ if isAllNumeric(patrn) {
year = patrn
patrn = args[0]
args = args[1:]
@@ -6558,10 +6512,10 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string {
args = args[1:]
if !isPipe {
- if !eutils.DeStop() {
+ if !deStop {
acc = append(acc, "-stops")
}
- if eutils.DoStem() {
+ if doStem {
acc = append(acc, "-stems")
}
}
@@ -6635,7 +6589,7 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string {
// processes with single goroutine call defer close(out) so consumer(s) can range over channel
// processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out)
-func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+func createConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if inp == nil {
return nil
@@ -6661,14 +6615,14 @@ func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]st
if text == "" {
// should never see empty input data
- out <- eutils.XMLRecord{idx, "", text, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: text}
continue
}
- str := ProcessQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds)
+ str := processQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds)
// send even if empty to get all record counts for reordering
- out <- eutils.XMLRecord{idx, "", str, nil}
+ out <- eutils.XMLRecord{Index: idx, Text: str}
}
}
@@ -6689,7 +6643,7 @@ func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]st
return out
}
-func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
+func createSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
if parent == "" || indx == "" || order == nil || inp == nil {
return nil
@@ -6718,7 +6672,7 @@ func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan euti
eutils.FindIdentifiers(text[:], parent, find,
func(id string) {
- id = eutils.SortStringByWords(id)
+ id = sortStringByWords(id)
_, ok := order[id]
if ok {
found = true
@@ -6727,12 +6681,12 @@ func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan euti
if !found {
// identifier field not found or not in identifier list, send empty placeholder for unshuffler
- out <- eutils.XMLRecord{ext.Index, "", "", nil}
+ out <- eutils.XMLRecord{Index: ext.Index}
continue
}
// send selected record
- out <- eutils.XMLRecord{ext.Index, "", text, nil}
+ out <- eutils.XMLRecord{Index: ext.Index, Text: text}
}
}
@@ -6781,8 +6735,8 @@ func main() {
doMixed := false
deAccent := false
doASCII := false
- doStem := false
- deStop := false
+ doStem = false
+ deStop = true
/*
doUnicode := false
@@ -6892,7 +6846,7 @@ func main() {
case "-stems", "-stem":
doStem = true
case "-stops", "-stop":
- deStop = true
+ deStop = false
// allow setting of unicode, script, and mathml flags (undocumented)
case "-unicode":
@@ -6951,7 +6905,7 @@ func main() {
case "stems", "stem":
doStem = true
case "stops", "stop":
- deStop = true
+ deStop = false
case "none", "default":
default:
if flgs != "" {
@@ -6961,9 +6915,9 @@ func main() {
}
/*
- UnicodeFix = ParseMarkup(unicodePolicy, "-unicode")
- ScriptFix = ParseMarkup(scriptPolicy, "-script")
- MathMLFix = ParseMarkup(mathmlPolicy, "-mathml")
+ UnicodeFix = parseMarkup(unicodePolicy, "-unicode")
+ ScriptFix = parseMarkup(scriptPolicy, "-script")
+ MathMLFix = parseMarkup(mathmlPolicy, "-mathml")
if UnicodeFix != NOMARKUP {
doUnicode = true
@@ -6994,7 +6948,7 @@ func main() {
eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
- eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup, doStem, deStop, false)
+ eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup)
// -stats prints number of CPUs and performance tuning values if no other arguments (undocumented)
if stts && len(args) < 1 {
@@ -7180,7 +7134,7 @@ func main() {
args = args[1:]
- insd := ProcessINSD(args, isPipe || usingFile, addDash, doIndex)
+ insd := processINSD(args, isPipe || usingFile, addDash, doIndex)
if !isPipe && !usingFile {
// no piped input, so write output instructions
@@ -7201,7 +7155,7 @@ func main() {
// -hydra filters HydraResponse output by relevance score (undocumented)
if args[0] == "-hydra" {
- hydra := ProcessHydra(isPipe || usingFile)
+ hydra := processHydra(isPipe || usingFile)
if !isPipe && !usingFile {
// no piped input, so write output instructions
@@ -7224,7 +7178,7 @@ func main() {
args = args[1:]
- biopath := ProcessBiopath(args, isPipe || usingFile)
+ biopath := processBiopath(args, isPipe || usingFile)
if !isPipe && !usingFile {
// no piped input, so write output instructions
@@ -7281,7 +7235,7 @@ func main() {
}
}
- res := ProcessE2Index(args, tform, isPipe || usingFile)
+ res := processE2Index(args, tform, isPipe || usingFile)
if !isPipe && !usingFile {
// no piped input, so write output instructions
@@ -7414,7 +7368,7 @@ func main() {
if args[0] == "-token" {
- eutils.StreamTokens("", "", rdr,
+ eutils.StreamTokens(rdr,
func(tkn eutils.XMLToken) {
recordCount++
byteCount += len(tkn.Name) + len(tkn.Attr)
@@ -7609,7 +7563,7 @@ func main() {
line := scanr.Text()
id, _ := eutils.SplitInTwoLeft(line, "\t")
- id = eutils.SortStringByWords(id)
+ id = sortStringByWords(id)
// add identifier to map
order[id] = true
@@ -7618,7 +7572,7 @@ func main() {
fl.Close()
xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
- fchq := CreateSelectors(topPattern, indx, order, xmlq)
+ fchq := createSelectors(topPattern, indx, order, xmlq)
unsq := eutils.CreateXMLUnshuffler(fchq)
if xmlq == nil || fchq == nil || unsq == nil {
@@ -7993,7 +7947,7 @@ func main() {
// PARSE AND VALIDATE EXTRACTION ARGUMENTS
// parse nested exploration instruction from command-line arguments
- cmds := ParseArguments(args, topPattern)
+ cmds := parseArguments(args, topPattern)
if cmds == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n")
os.Exit(1)
@@ -8016,7 +7970,7 @@ func main() {
func(str string) {
rec++
beginTime := time.Now()
- ProcessQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds)
+ processQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds)
endTime := time.Now()
duration := endTime.Sub(beginTime)
micro := int(float64(duration.Nanoseconds()) / 1e3)
@@ -8066,7 +8020,7 @@ func main() {
}
xmlq := eutils.CreateXMLProducer(topPattern, star, trdr)
- tblq := CreateConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
+ tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
if xmlq == nil || tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
@@ -8181,7 +8135,7 @@ func main() {
cmds.Position = ""
// process single selected record
- res := ProcessQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds)
+ res := processQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds)
if res != "" {
fmt.Printf("%s", res)
@@ -8196,7 +8150,7 @@ func main() {
xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
// launch consumer goroutines to parse and explore partitioned XML objects
- tblq := CreateConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
+ tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
// launch unshuffler goroutine to restore order of results
unsq := eutils.CreateXMLUnshuffler(tblq)
@@ -8312,9 +8266,9 @@ func main() {
for curr := range unsq {
if beg == nil {
- beg = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil}
+ beg = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
} else {
- end = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil}
+ end = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
}
recordCount++
@@ -8340,7 +8294,7 @@ func main() {
first = false
} else {
prev = next
- next = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil}
+ next = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
}
if prev != nil {
diff --git a/download-ncbi-data b/download-ncbi-data
index 798fc50..992df6d 100755
--- a/download-ncbi-data
+++ b/download-ncbi-data
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
cmd="$1"
shift
diff --git a/download-pubmed b/download-pubmed
index 4bf60cf..22b6f65 100755
--- a/download-pubmed
+++ b/download-pubmed
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
download() {
dir="$1"
nquire -lst ftp.ncbi.nlm.nih.gov "pubmed" "$dir" |
diff --git a/download-sequence b/download-sequence
index 56f3a17..0e7039f 100755
--- a/download-sequence
+++ b/download-sequence
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
filt=""
while [ "$#" -gt 0 ]
do
diff --git a/eblast b/eblast
deleted file mode 100755
index 96f6dc8..0000000
--- a/eblast
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/sh
-
-# ===========================================================================
-#
-# PUBLIC DOMAIN NOTICE
-# National Center for Biotechnology Information (NCBI)
-#
-# This software/database is a "United States Government Work" under the
-# terms of the United States Copyright Act. It was written as part of
-# the author's official duties as a United States Government employee and
-# thus cannot be copyrighted. This software/database is freely available
-# to the public for use. The National Library of Medicine and the U.S.
-# Government do not place any restriction on its use or reproduction.
-# We would, however, appreciate having the NCBI and the author cited in
-# any work or product based on this material.
-#
-# Although all reasonable efforts have been taken to ensure the accuracy
-# and reliability of the software and data, the NLM and the U.S.
-# Government do not and cannot warrant the performance or results that
-# may be obtained by using this software or data. The NLM and the U.S.
-# Government disclaim all warranties, express or implied, including
-# warranties of performance, merchantability or fitness for any particular
-# purpose.
-#
-# ===========================================================================
-#
-# File Name: eblast
-#
-# Author: Jonathan Kans
-#
-# Version Creation Date: 03/05/2021
-#
-# ==========================================================================
-
-# read sequence from stdin
-seq=$( cat )
-
-# remove FASTA definition line, all whitespace including newlines
-seq=$(
- echo "$seq" |
- grep -v '>' |
- tr -d " \t\n\r"
-)
-
-# send BLAST request
-blst=$(
- nquire -url https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \
- -CMD Put -PROGRAM blastp -DATABASE nr -QUERY "$seq"
-)
-
-# extract request id from result
-rid=$(
- echo "$blst" |
- grep "^ RID" |
- sed -e 's/^ RID = //g' |
- tr -d " \t\n\r"
-)
-
-if [ -z "$rid" ]
-then
- echo "ERROR Unable to create RID" >&2
- exit 1
-fi
-
-# ASCII terminal character color variables
-RD='\033[0;31m'
-BL='\033[0;34m'
-NC='\033[0m'
-
-# echo RID (to stderr)
-echo "${RD}RID:${BL} $rid${NC}" >&2
-
-goOn=true
-count=0
-
-# polling loop
-while [ "$goOn" = true ]
-do
-
- # check for result every 60 seconds to avoid server overuse
- for i in $(seq 1 4)
- do
- count=$((count + 1))
- sleep 15
- # but print progress indicator every 15 seconds (to stderr)
- printf "${RD}.${NC}" >&2
- done
-
- # send polling request
- poll=$(
- nquire -get https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \
- -CMD Get -FORMAT_OBJECT SearchInfo -RID "$rid"
- )
-
- # obtain the polling status
- stts=$(
- echo "$poll" |
- grep " Status=" |
- sed -e 's/ Status=//g' |
- tr -d " \t\n\r"
- )
-
- case "$stts" in
- WAITING )
- if [ "$count" -gt 40 ]
- then
- echo "ERROR Search $rid timed out" >&2
- exit 1
- fi
- # continue
- ;;
- FAILED )
- echo "ERROR Search $rid failed" >&2
- exit 4
- ;;
- UNKNOWN )
- echo "ERROR Search $rid expired" >&2
- exit 3
- ;;
- READY )
- hits=$(
- echo "$poll" |
- grep "ThereAreHits=yes" |
- sed -e 's/ThereAreHits=//g' |
- tr -d " \t\n\r"
- )
- # end row of progress dots with newline (to stderr)
- printf "\n" >&2
- if [ "$hits" = "yes" ]
- then
- # set flag to exit loop
- goOn=false
- else
- echo "ERROR No hits found for $rid" >&2
- exit 2
- fi
- ;;
- * )
- echo "ERROR Unknown status: $stts" >&2
- exit 5
- ;;
- esac
-done
-
-sleep 10
-
-#fetch result
-res=$(
- nquire -get https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \
- -CMD Get -FORMAT_TYPE XML -RID "$rid" |
- transmute -format indent -doctype "-"
-)
-
-# sort accessions by score, remove score column
-echo "$res" |
-xtract -pattern Hit -element Hsp_score Hit_accession |
-sort -nr |
-cut -f 2
-
diff --git a/efetch b/efetch
index 905bf2b..b4020cc 100755
--- a/efetch
+++ b/efetch
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
diff --git a/efilter b/efilter
index 0ff4612..11a5049 100755
--- a/efilter
+++ b/efilter
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
diff --git a/einfo b/einfo
index 55193fe..d0d3219 100755
--- a/einfo
+++ b/einfo
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
@@ -114,6 +114,16 @@ fi
PrintHelp() {
echo "einfo $version"
+ echo ""
+ sfx=""
+ if [ "$external" = true ]
+ then
+ sfx=" - external"
+ elif [ "$internal" = true ]
+ then
+ sfx=" - internal"
+ fi
+ echo "$( uname -s ) - $( uname -m )${sfx}"
cat << "EOF"
Database Selection
diff --git a/elink b/elink
index 8ba82eb..2e8f184 100755
--- a/elink
+++ b/elink
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
diff --git a/epost b/epost
index 437fdc8..c3064fc 100755
--- a/epost
+++ b/epost
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
diff --git a/esample b/esample
index b6ff65e..8d33342 100755
--- a/esample
+++ b/esample
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
do_help() {
cat <<EOF
diff --git a/esearch b/esearch
index bc248cb..f180d46 100755
--- a/esearch
+++ b/esearch
@@ -47,12 +47,12 @@ esac
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -68,7 +68,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
@@ -1418,17 +1418,13 @@ fi
# although only 'or' and 'not' actually cause misinterpretation of:
# -db biosample -query "package metagenome or environmental version 1 0 [PROP]"
+# changed to replace all internal spaces with underscore, except leaving one
+# space before field bracket, and added assembly to list of databases given
+# this special processing of FILT, PROP, and ORGN controlled vocabularies
+
ProtectWithUnderscores() {
- item="$1"
- case "$item" in
- *" and "* | *" or "* | *" not "* )
- echo "$item" | sed -e "s/ and /_and_/g; s/ or /_or_/g; s/ not /_not_/g"
- ;;
- * )
- echo "$item"
- ;;
- esac
+ echo "$1" | sed -e 's/ \[/\[/g; s/ /_/g; s/\[/ \[/g; s/\[/ \[/g; s/_ \[/ \[/g'
}
ProcessEntrezQuery() {
@@ -1443,16 +1439,17 @@ ProcessEntrezQuery() {
while read item
do
item=$( echo "$item" | sed -e 's/^ *//g; s/ *$//g; s/ */ /g' )
- case "$item" in
+ opt=$( echo "$item" | tr '[:upper:]' '[:lower:]' )
+ case "$opt" in
"" )
;;
- *"[FILT]" | *"[Filter]" | *"[filter]" )
+ *"[filt]" | *"[filter]" )
ProtectWithUnderscores "$item"
;;
- *"[PROP]" | *"[Properties]" | *"[properties]" )
+ *"[prop]" | *"[properties]" )
ProtectWithUnderscores "$item"
;;
- *"[ORGN]" | *"[Organism]" | *"[organism]" )
+ *"[orgn]" | *"[organism]" )
ProtectWithUnderscores "$item"
;;
* )
@@ -1463,7 +1460,7 @@ ProcessEntrezQuery() {
}
case "$dbase" in
- nuc* | prot* | gene | genome | popset | taxonomy | clinvar | cdd | sra | ipg | bio* )
+ nuc* | prot* | gene | genome | popset | taxonomy | assembly | clinvar | cdd | sra | ipg | bio* )
case "$query" in
*\|* )
# skip if query contains an embedded vertical bar, reserved for splitting in ProcessEntrezQuery
diff --git a/esummary b/esummary
index efe3d6c..13c9790 100755
--- a/esummary
+++ b/esummary
@@ -91,12 +91,12 @@ fi
PERL=""
-internal=no
+internal=false
while [ "$#" -ne 0 ]
do
case "$1" in
-internal )
- internal=yes
+ internal=true
shift
;;
-newmode )
@@ -112,7 +112,7 @@ do
;;
esac
done
-if [ "$internal" = yes ]
+if [ "$internal" = true ]
then
set _ -internal "$@"
shift
diff --git a/eutils/align.go b/eutils/align.go
new file mode 100644
index 0000000..9e20a9a
--- /dev/null
+++ b/eutils/align.go
@@ -0,0 +1,310 @@
+// ===========================================================================
+//
+// PUBLIC DOMAIN NOTICE
+// National Center for Biotechnology Information (NCBI)
+//
+// This software/database is a "United States Government Work" under the
+// terms of the United States Copyright Act. It was written as part of
+// the author's official duties as a United States Government employee and
+// thus cannot be copyrighted. This software/database is freely available
+// to the public for use. The National Library of Medicine and the U.S.
+// Government do not place any restriction on its use or reproduction.
+// We would, however, appreciate having the NCBI and the author cited in
+// any work or product based on this material.
+//
+// Although all reasonable efforts have been taken to ensure the accuracy
+// and reliability of the software and data, the NLM and the U.S.
+// Government do not and cannot warrant the performance or results that
+// may be obtained by using this software or data. The NLM and the U.S.
+// Government disclaim all warranties, express or implied, including
+// warranties of performance, merchantability or fitness for any particular
+// purpose.
+//
+// ===========================================================================
+//
+// File Name: align.go
+//
+// Author: Jonathan Kans
+//
+// ==========================================================================
+
+package eutils
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strconv"
+ "strings"
+ "unicode/utf8"
+)
+
+// AlignColumns aligns a tab-delimited table by individual column widths
+func AlignColumns(inp io.Reader, margin, padding int, align string) <-chan string {
+
+ if inp == nil {
+ return nil
+ }
+
+ out := make(chan string, chanDepth)
+ if out == nil {
+ fmt.Fprintf(os.Stderr, "Unable to create alignment channel\n")
+ os.Exit(1)
+ }
+
+ spcs := " "
+
+ mrg := ""
+ pad := " "
+
+ lettrs := make(map[int]rune)
+ lst := 'l'
+
+ if margin > 0 && margin < 30 {
+ mrg = spcs[0:margin]
+ }
+
+ if padding > 0 && padding < 30 {
+ pad = spcs[0:padding]
+ }
+
+ for i, ch := range align {
+ lettrs[i] = ch
+ lst = ch
+ }
+
+ alignTable := func(inp io.Reader, out chan<- string) {
+
+ // close channel when all chunks have been sent
+ defer close(out)
+
+ var arry []string
+
+ width := make(map[int]int)
+ whole := make(map[int]int)
+ fract := make(map[int]int)
+
+ scanr := bufio.NewScanner(inp)
+
+ row := 0
+ numCols := 0
+
+ // allows leading plus or minus, digits interspersed with optional commas, decimal point, and digits
+ isNumeric := func(str string) bool {
+
+ hasNum := false
+ hasPeriod := false
+
+ for i, ch := range str {
+ switch ch {
+ case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+ hasNum = true
+ case '+', '-':
+ if i > 0 {
+ return false
+ }
+ case '.':
+ hasPeriod = true
+ case ',':
+ if hasPeriod {
+ return false
+ }
+ default:
+ return false
+ }
+ }
+
+ return hasNum
+ }
+
+ processLine := func(line string) string {
+
+ var flds []string
+
+ cols := strings.Split(line, "\t")
+ if numCols == 0 {
+ numCols = len(cols)
+ } else if numCols != len(cols) {
+ fmt.Fprintf(os.Stderr, "ERROR: Mismatched number of columns in row ")
+ fmt.Fprintf(os.Stderr, strconv.Itoa(row))
+ fmt.Fprintf(os.Stderr, ": actual ")
+ fmt.Fprintf(os.Stderr, strconv.Itoa(len(cols)))
+ fmt.Fprintf(os.Stderr, ", expected ")
+ fmt.Fprintf(os.Stderr, strconv.Itoa(numCols))
+ fmt.Fprintf(os.Stderr, "\n")
+ // os.Exit(1)
+ }
+
+ for i, str := range cols {
+
+ str = CompressRunsOfSpaces(str)
+ str = strings.TrimSpace(str)
+
+ flds = append(flds, str)
+
+ // determine maximum length in each column
+ ln := utf8.RuneCountInString(str)
+ if ln > width[i] {
+ width[i] = ln
+ }
+
+ code, ok := lettrs[i]
+ if !ok {
+ code = lst
+ }
+
+ switch code {
+ case 'n', 'N', 'z', 'Z':
+ if isNumeric(str) {
+ // determine maximum length of decimal number parts
+ wh, fr := SplitInTwoLeft(str, ".")
+ if fr != "" {
+ fr = "." + fr
+ }
+
+ lf := utf8.RuneCountInString(wh)
+ if lf > whole[i] {
+ whole[i] = lf
+ }
+ rt := utf8.RuneCountInString(fr)
+ if rt > fract[i] {
+ fract[i] = rt
+ }
+ ln = whole[i] + fract[i]
+ if ln > width[i] {
+ width[i] = ln
+ }
+ }
+ }
+ }
+
+ return strings.Join(flds, "\t")
+ }
+
+ for i := 0; i < numCols; i++ {
+
+ code, ok := lettrs[i]
+ if !ok {
+ code = lst
+ }
+
+ switch code {
+ case 'n', 'N', 'z', 'Z':
+ // adjust maximum widths with aligned decimal points
+ ln := whole[i] + fract[i]
+ if ln > width[i] {
+ width[i] = ln
+ }
+ }
+ }
+
+ // clean up spaces, calculate column widths
+ for scanr.Scan() {
+
+ row++
+ line := scanr.Text()
+ if line == "" {
+ continue
+ }
+
+ line = processLine(line)
+ arry = append(arry, line)
+ }
+
+ var buffer strings.Builder
+
+ for _, line := range arry {
+
+ buffer.Reset()
+
+ cols := strings.Split(line, "\t")
+
+ btwn := mrg
+ for i, str := range cols {
+
+ buffer.WriteString(btwn)
+
+ code, ok := lettrs[i]
+ if !ok {
+ code = lst
+ }
+
+ ln := utf8.RuneCountInString(str)
+ mx := width[i]
+ diff := mx - ln
+ lft := 0
+ rgt := 0
+ lftPad := " "
+ rgtPad := " "
+
+ if diff > 0 {
+ switch code {
+ case 'l':
+ rgt = diff
+ case 'c':
+ lft = diff / 2
+ rgt = diff - lft
+ case 'r':
+ lft = diff
+ case 'n', 'N', 'z', 'Z':
+ lft = diff
+ if isNumeric(str) {
+ switch code {
+ case 'N':
+ rgtPad = "0"
+ case 'z':
+ lftPad = "0"
+ case 'Z':
+ lftPad = "0"
+ rgtPad = "0"
+ }
+ sn := whole[i]
+ rc := fract[i]
+ wh, fr := SplitInTwoLeft(str, ".")
+ if fract[i] > 0 {
+ if fr == "" {
+ fr = "."
+ } else {
+ fr = "." + fr
+ }
+ lf := utf8.RuneCountInString(wh)
+ lft = sn - lf
+ rt := utf8.RuneCountInString(fr)
+ rgt = rc - rt
+ str = wh + fr
+ }
+ }
+ default:
+ rgt = diff
+ }
+ }
+
+ for lft > 0 {
+ lft--
+ buffer.WriteString(lftPad)
+ }
+
+ buffer.WriteString(str)
+ btwn = pad
+
+ for rgt > 0 {
+ rgt--
+ buffer.WriteString(rgtPad)
+ }
+ }
+
+ txt := buffer.String()
+ txt = strings.TrimRight(txt, " ") + "\n"
+
+ if txt != "" {
+ out <- txt
+ }
+ }
+ }
+
+ // launch single alignment goroutine
+ go alignTable(inp, out)
+
+ return out
+}
diff --git a/eutils/chan.go b/eutils/chan.go
new file mode 100644
index 0000000..3b6d2c1
--- /dev/null
+++ b/eutils/chan.go
@@ -0,0 +1,113 @@
+// ===========================================================================
+//
+// PUBLIC DOMAIN NOTICE
+// National Center for Biotechnology Information (NCBI)
+//
+// This software/database is a "United States Government Work" under the
+// terms of the United States Copyright Act. It was written as part of
+// the author's official duties as a United States Government employee and
+// thus cannot be copyrighted. This software/database is freely available
+// to the public for use. The National Library of Medicine and the U.S.
+// Government do not place any restriction on its use or reproduction.
+// We would, however, appreciate having the NCBI and the author cited in
+// any work or product based on this material.
+//
+// Although all reasonable efforts have been taken to ensure the accuracy
+// and reliability of the software and data, the NLM and the U.S.
+// Government do not and cannot warrant the performance or results that
+// may be obtained by using this software or data. The NLM and the U.S.
+// Government disclaim all warranties, express or implied, including
+// warranties of performance, merchantability or fitness for any particular
+// purpose.
+//
+// ===========================================================================
+//
+// File Name: chan.go
+//
+// Author: Jonathan Kans
+//
+// ==========================================================================
+
+package eutils
+
+import (
+ "io"
+ "os"
+ "strings"
+)
+
+// stringChanReader connects a string output channel to an io.Reader interface
+type stringChanReader struct {
+ c <-chan string
+ s string
+}
+
+func (r *stringChanReader) Read(b []byte) (n int, err error) {
+
+ if r.s != "" {
+ n = copy(b, []byte(r.s))
+ r.s = r.s[n:]
+ return
+ }
+
+ for str := range r.c {
+ r.s = str
+ n = copy(b, []byte(r.s))
+ r.s = r.s[n:]
+ return
+ }
+
+ return 0, io.EOF
+}
+
+// ChanToReader converts a string channel to an io.Reader
+func ChanToReader(inp <-chan string) io.Reader {
+
+ if inp == nil {
+ return nil
+ }
+
+ return &stringChanReader{c: inp}
+}
+
+// ChanToStdout sends a string channel to stdout
+func ChanToStdout(inp <-chan string) {
+
+ if inp == nil {
+ return
+ }
+
+ last := ""
+
+ for str := range inp {
+ last = str
+ os.Stdout.WriteString(str)
+ }
+
+ if !strings.HasSuffix(last, "\n") {
+ os.Stdout.WriteString("\n")
+ }
+}
+
+// ChanToString converts a string channel to a string
+func ChanToString(inp <-chan string) string {
+
+ if inp == nil {
+ return ""
+ }
+
+ var buffer strings.Builder
+
+ last := ""
+
+ for str := range inp {
+ last = str
+ buffer.WriteString(str)
+ }
+
+ if !strings.HasSuffix(last, "\n") {
+ buffer.WriteString("\n")
+ }
+
+ return buffer.String()
+}
diff --git a/eutils/format.go b/eutils/format.go
index 6dcb6fd..010cd1a 100644
--- a/eutils/format.go
+++ b/eutils/format.go
@@ -368,7 +368,7 @@ func xmlFormatter(rcrd, prnt string, inp <-chan XMLToken, offset int, doXML bool
return txt
}
- doCleanup := func(tkn XMLToken, nxtTag int, nxtName, nxtAttr string) {
+ cleanToken := func(tkn XMLToken, nxtTag int, nxtName, nxtAttr string) {
if skip > 0 {
skip--
@@ -541,7 +541,7 @@ func xmlFormatter(rcrd, prnt string, inp <-chan XMLToken, offset int, doXML bool
}
if primed {
- doCleanup(prev, tkn.Tag, tkn.Name, tkn.Attr)
+ cleanToken(prev, tkn.Tag, tkn.Name, tkn.Attr)
}
prev = XMLToken{tkn.Tag, tkn.Cont, tkn.Name, tkn.Attr, tkn.Index, tkn.Line}
diff --git a/eutils/misc.go b/eutils/misc.go
index fe06c09..61b073a 100644
--- a/eutils/misc.go
+++ b/eutils/misc.go
@@ -35,8 +35,8 @@ import (
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
+ "html"
"os"
- "sort"
"strconv"
"strings"
"sync"
@@ -768,6 +768,56 @@ func CleanupContents(str string, ascii, amper, mixed bool) string {
return str
}
+// CleanupQuery performs optional operations on XML query strings
+func CleanupQuery(str string, exactMatch, removeBrackets bool) string {
+
+ if exactMatch {
+ str = html.EscapeString(str)
+ }
+
+ // cleanup string
+ if IsNotASCII(str) {
+ str = DoAccentTransform(str)
+ if HasUnicodeMarkup(str) {
+ str = RepairUnicodeMarkup(str, SPACE)
+ }
+ }
+
+ if exactMatch {
+ str = strings.ToLower(str)
+ }
+
+ if HasBadSpace(str) {
+ str = CleanupBadSpaces(str)
+ }
+
+ if removeBrackets {
+ if HasAngleBracket(str) {
+ str = RepairEncodedMarkup(str)
+ str = RepairScriptMarkup(str, SPACE)
+ str = RepairMathMLMarkup(str, SPACE)
+ // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in ExploreElements
+ str = RemoveEmbeddedMarkup(str)
+ }
+ }
+
+ if HasAmpOrNotASCII(str) {
+ str = html.UnescapeString(str)
+ }
+
+ if IsNotASCII(str) {
+ if HasGreek(str) {
+ str = SpellGreek(str)
+ str = CompressRunsOfSpaces(str)
+ }
+ if !exactMatch {
+ str = UnicodeToASCII(str)
+ }
+ }
+
+ return str
+}
+
// CompressRunsOfSpaces turns runs of spaces into a single space
func CompressRunsOfSpaces(str string) string {
@@ -1012,47 +1062,6 @@ func FixSpecialCases(str string) string {
return str
}
-// FixThemeCases expands Global Network of Biomedical Relationships theme abbreviations
-func FixThemeCases(str string) string {
-
- if !strings.Contains(str, "[thme]") && !strings.Contains(str, "[conv]") {
- return str
- }
-
- var arry []string
-
- terms := strings.Fields(str)
-
- for _, item := range terms {
-
- switch item {
- case "a+":
- arry = append(arry, "ap")
- case "e+":
- arry = append(arry, "ep")
- case "ec+":
- arry = append(arry, "ecp")
- case "eg+":
- arry = append(arry, "egp")
- case "v+":
- arry = append(arry, "vp")
- case "a-":
- arry = append(arry, "am")
- case "e-":
- arry = append(arry, "em")
- case "ec-":
- arry = append(arry, "ecm")
- default:
- arry = append(arry, item)
- }
- }
-
- // reconstruct string from transformed words
- str = strings.Join(arry, " ")
-
- return str
-}
-
// FlattenMathML removes embedded MathML structure
func FlattenMathML(str string, policy int) string {
@@ -1285,30 +1294,6 @@ func HasHyphenOrApostrophe(str string) bool {
return false
}
-// HasPlusOrMinus reports on plus or minus symbols
-func HasPlusOrMinus(str string) bool {
-
- for _, ch := range str {
- if ch == '-' || ch == '+' {
- return true
- }
- }
-
- return false
-}
-
-// HasSpaceOrHyphen reports if multiple words exist in string
-func HasSpaceOrHyphen(str string) bool {
-
- for _, ch := range str {
- if ch == ' ' || ch == '-' {
- return true
- }
- }
-
- return false
-}
-
// HasUnicodeMarkup checks for Unicode superscript or subscript characters
func HasUnicodeMarkup(str string) bool {
@@ -1389,18 +1374,6 @@ func HTMLRepair(str string) (string, bool) {
return res, ok
}
-// IsAllCapsOrDigits matches upper-case letters or digits
-func IsAllCapsOrDigits(str string) bool {
-
- for _, ch := range str {
- if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) {
- return false
- }
- }
-
- return true
-}
-
// IsAllDigits matches only digits
func IsAllDigits(str string) bool {
@@ -1425,29 +1398,6 @@ func IsAllDigitsOrPeriod(str string) bool {
return true
}
-// IsAllNumeric accepts digits and arithmetic operator symbols
-func IsAllNumeric(str string) bool {
-
- for _, ch := range str {
- if !unicode.IsDigit(ch) &&
- ch != '.' &&
- ch != '+' &&
- ch != '-' &&
- ch != '*' &&
- ch != '/' &&
- ch != ',' &&
- ch != '$' &&
- ch != '#' &&
- ch != '%' &&
- ch != '(' &&
- ch != ')' {
- return false
- }
- }
-
- return true
-}
-
// IsNotASCII returns true for any character greater than 7-bits
func IsNotASCII(str string) bool {
@@ -2170,57 +2120,6 @@ func RepairUnicodeMarkup(str string, policy int) string {
return buffer.String()
}
-// ReverseComplement returns the reverse complement of a sequence
-func ReverseComplement(seq string) string {
-
- runes := []rune(seq)
- // reverse sequence letters - middle base in odd-length sequence is not touched
- for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
- runes[i], runes[j] = runes[j], runes[i]
- }
- found := false
- // now complement every base, also handling uracil, leaving case intact
- for i, ch := range runes {
- runes[i], found = revComp[ch]
- if !found {
- runes[i] = 'X'
- }
- }
- seq = string(runes)
-
- return seq
-}
-
-// SequenceReverse reverses a sequence, but does not complement the bases
-func SequenceReverse(seq string) string {
-
- runes := []rune(seq)
- // reverse sequence letters - middle base in odd-length sequence is not touched
- for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
- runes[i], runes[j] = runes[j], runes[i]
- }
- seq = string(runes)
-
- return seq
-}
-
-// SortStringByWords sorts the individual words in a string
-func SortStringByWords(str string) string {
-
- str = RemoveCommaOrSemicolon(str)
-
- if HasSpaceOrHyphen(str) {
- flds := strings.Fields(str)
- sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] })
- str = strings.Join(flds, " ")
- str = strings.Replace(str, "-", " ", -1)
- str = CompressRunsOfSpaces(str)
- str = strings.TrimRight(str, ".?:")
- }
-
- return str
-}
-
// SpellGreek spells Greek letters (e..g, alpha, beta) for easier searching
func SpellGreek(str string) string {
diff --git a/eutils/normal.go b/eutils/normal.go
index de06363..f6a4505 100644
--- a/eutils/normal.go
+++ b/eutils/normal.go
@@ -50,7 +50,7 @@ func NormalizeXML(rdr <-chan XMLBlock, db string) <-chan string {
os.Exit(1)
}
- tknq := CreateTokenizer("", "", rdr)
+ tknq := CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create normalize tokenizer\n")
diff --git a/eutils/parse.go b/eutils/parse.go
index 23c2403..407d0a2 100644
--- a/eutils/parse.go
+++ b/eutils/parse.go
@@ -32,6 +32,7 @@ package eutils
import (
"fmt"
+ "html"
"os"
"strings"
)
@@ -102,7 +103,7 @@ type XMLToken struct {
Line int
}
-// ParseAttributes produces tag/value pairs, only run on request
+// ParseAttributes produces tag/value pairs, only run on request.
func ParseAttributes(attrb string) []string {
if attrb == "" {
@@ -195,9 +196,9 @@ func ParseAttributes(attrb string) []string {
return arry
}
-// parseXML calls XML parser on a partitioned string or on an XML reader, optimized for
-// maximum processing speed, sending tokens for CDATA and COMMENT sections, and optionally
-// tracking line numbers
+// parseXML calls XML parser on a partitioned string or on an XMLBlock channel of trimmed strings.
+// It is optimized for maximum processing speed, sends tokens for CDATA and COMMENT sections (for
+// unpacking by NormalizeXML), and optionally tracks line numbers (for ValidateXML).
func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), find *XMLFind, ids func(string)) (*XMLNode, string) {
if record == "" && (inp == nil || tokens == nil) {
@@ -216,6 +217,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
which := NOTAG
skipTo := ""
+ // updateLineCount is used to keep track of the correct line count for XML validation
updateLineCount := func(max int) {
// count lines
for i := lag; i < max; i++ {
@@ -226,7 +228,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
lag = Idx
}
- // calculate for warning messages, do not update lineNum or lag variables
+ // currentLineCount calculates correct line for warning messages, does not update lineNum or lag variables
currentLineCount := func(max int) int {
line := lineNum
for i := lag; i < max; i++ {
@@ -237,7 +239,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
return line
}
- // get next XML token
+ // nextToken returns the type and content fields for the next XML token
nextToken := func(idx int) (int, int, string, string, int) {
if record == "" {
@@ -669,7 +671,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
farmMax := farmSize
farmItems := make([]XMLNode, farmMax)
- // allocate multiple nodes in a large array for memory management efficiency
+ // nextNode allocates multiple nodes in a large array for memory management efficiency
nextNode := func(strt, attr, prnt string) *XMLNode {
// if farm array slots used up, allocate new array
@@ -699,13 +701,13 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
// parseSpecial recursive definition
var parseSpecial func(string, string, string) (*XMLNode, bool)
- // parse XML tags into tree structure for searching, no contentMods flags set
+ // parseSpecial parses XML tags into tree structure for searching, no contentMods flags set
parseSpecial = func(strt, attr, prnt string) (*XMLNode, bool) {
var obj *XMLNode
ok := true
- // obtain next node from farm
+ // nextNode obtains next node from farm
node := nextNode(strt, attr, prnt)
if node == nil {
return nil, false
@@ -785,7 +787,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
// parseLevel recursive definition
var parseLevel func(string, string, string) (*XMLNode, bool)
- // parse XML tags into tree structure for searching, some contentMods flags set
+ // parseLevel parses XML tags into tree structure for searching, some contentMods flags set
parseLevel = func(strt, attr, prnt string) (*XMLNode, bool) {
var obj *XMLNode
@@ -902,7 +904,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
// parseIndex recursive definition
var parseIndex func(string, string, string) string
- // parse XML tags looking for trie index element
+ // parseIndex parses XML tags looking for trie index element
parseIndex = func(strt, attr, prnt string) string {
versn := ""
@@ -1098,7 +1100,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken),
return top, ""
}
-// ParseRecord is the main public access to parseXML
+// ParseRecord is the main public access to parseXML.
func ParseRecord(text, parent string) *XMLNode {
pat, _ := parseXML(text, parent, nil, nil, nil, nil)
@@ -1106,7 +1108,7 @@ func ParseRecord(text, parent string) *XMLNode {
return pat
}
-// FindIdentifier returns a single identifier
+// FindIdentifier returns a single identifier.
func FindIdentifier(text, parent string, find *XMLFind) string {
_, id := parseXML(text, parent, nil, nil, find, nil)
@@ -1114,19 +1116,19 @@ func FindIdentifier(text, parent string, find *XMLFind) string {
return id
}
-// FindIdentifiers returns a set of identifiers through a callback
+// FindIdentifiers returns a set of identifiers through a callback.
func FindIdentifiers(text, parent string, find *XMLFind, ids func(string)) {
parseXML(text, parent, nil, nil, find, ids)
}
-// StreamTokens streams tokens through a callback
-func StreamTokens(text, parent string, inp <-chan XMLBlock, streamer func(tkn XMLToken)) {
+// StreamTokens streams tokens from a reader through a callback.
+func StreamTokens(inp <-chan XMLBlock, streamer func(tkn XMLToken)) {
- parseXML(text, parent, inp, streamer, nil, nil)
+ parseXML("", "", inp, streamer, nil, nil)
}
-// StreamValues streams token values through a callback
+// StreamValues streams token values from a parsed record through a callback.
func StreamValues(text, parent string, stream func(string, string, string)) {
elementName := ""
@@ -1148,8 +1150,8 @@ func StreamValues(text, parent string, stream func(string, string, string)) {
parseXML(text, parent, nil, streamer, nil, nil)
}
-// CreateTokenizer streams tokens through a channel
-func CreateTokenizer(text, parent string, inp <-chan XMLBlock) <-chan XMLToken {
+// CreateTokenizer streams tokens through a channel.
+func CreateTokenizer(inp <-chan XMLBlock) <-chan XMLToken {
if inp == nil {
return nil
@@ -1162,17 +1164,153 @@ func CreateTokenizer(text, parent string, inp <-chan XMLBlock) <-chan XMLToken {
}
// xmlTokenizer sends XML tokens through channel
- xmlTokenizer := func(text, parent string, inp <-chan XMLBlock, out chan<- XMLToken) {
+ xmlTokenizer := func(inp <-chan XMLBlock, out chan<- XMLToken) {
// close channel when all records have been processed
defer close(out)
// parse XML and send tokens through channel
- parseXML(text, parent, inp, func(tkn XMLToken) { out <- tkn }, nil, nil)
+ parseXML("", "", inp, func(tkn XMLToken) { out <- tkn }, nil, nil)
}
// launch single tokenizer goroutine
- go xmlTokenizer(text, parent, inp, out)
+ go xmlTokenizer(inp, out)
return out
}
+
+// ExploreElements returns matching element values to callback.
+func ExploreElements(curr *XMLNode, mask, prnt, match, attrib string, wildcard, unescape bool, level int, proc func(string, int)) {
+
+ if curr == nil || proc == nil {
+ return
+ }
+
+ // **/Object performs deep exploration of recursive data (*/Object also supported)
+ deep := false
+ if prnt == "**" || prnt == "*" {
+ prnt = ""
+ deep = true
+ }
+
+ // exploreChildren recursive definition
+ var exploreChildren func(curr *XMLNode, acc func(string))
+
+ // exploreChildren handles mixed-content chains of embedded tags
+ exploreChildren = func(curr *XMLNode, acc func(string)) {
+
+ if curr.Contents != "" {
+ acc(curr.Contents)
+ }
+ for chld := curr.Children; chld != nil; chld = chld.Next {
+ if chld.Name != "" {
+ acc("<" + chld.Name + ">")
+ }
+ exploreChildren(chld, acc)
+ if chld.Name != "" {
+ acc("</" + chld.Name + ">")
+ }
+ }
+ }
+
+ // exploreElements recursive definition
+ var exploreElements func(curr *XMLNode, skip string, lev int)
+
+ // exploreElements visits nodes looking for matches to requested object
+ exploreElements = func(curr *XMLNode, skip string, lev int) {
+
+ if !deep && curr.Name == skip {
+ // do not explore within recursive object
+ return
+ }
+
+ if curr.Name == match ||
+ // parent/* matches any subfield
+ (match == "*" && prnt != "") ||
+ // wildcard (internal colon) matches any namespace prefix
+ (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) ||
+ (match == "" && attrib != "") {
+
+ if prnt == "" ||
+ curr.Parent == prnt ||
+ (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) {
+
+ if attrib != "" {
+ if curr.Attributes != "" && curr.Attribs == nil {
+ // parse attributes on-the-fly if queried
+ curr.Attribs = ParseAttributes(curr.Attributes)
+ }
+ for i := 0; i < len(curr.Attribs)-1; i += 2 {
+ // attributes now parsed into array as [ tag, value, tag, value, tag, value, ... ]
+ if curr.Attribs[i] == attrib ||
+ (wildcard && strings.HasPrefix(attrib, ":") && strings.HasSuffix(curr.Attribs[i], attrib)) {
+ proc(curr.Attribs[i+1], level)
+ return
+ }
+ }
+
+ } else if curr.Contents != "" {
+
+ str := curr.Contents[:]
+
+ if unescape && HasAmpOrNotASCII(str) {
+ // processing of <, >, &, ", and ' characters is now delayed until element contents is requested
+ str = html.UnescapeString(str)
+ }
+
+ proc(str, level)
+ return
+
+ } else if curr.Children != nil {
+
+ if doMixed {
+ // match with mixed contents - send all child strings
+ var buffr strings.Builder
+ exploreChildren(curr, func(str string) {
+ if str != "" {
+ buffr.WriteString(str)
+ }
+ })
+ str := buffr.String()
+
+ // clean up reconstructed mixed content
+ str = DoTrimFlankingHTML(str)
+ if HasBadSpace(str) {
+ str = CleanupBadSpaces(str)
+ }
+ if HasAdjacentSpaces(str) {
+ str = CompressRunsOfSpaces(str)
+ }
+ if NeedsTightening(str) {
+ str = TightenParentheses(str)
+ }
+ if unescape && HasAmpOrNotASCII(str) {
+ str = html.UnescapeString(str)
+ }
+
+ proc(str, level)
+ return
+ }
+
+ // for XML container object, send empty string to callback to increment count
+ proc("", level)
+ // and continue exploring
+
+ } else if curr.Attributes != "" {
+
+ // for self-closing object, indicate presence by sending empty string to callback
+ proc("", level)
+ return
+ }
+ }
+ }
+
+ for chld := curr.Children; chld != nil; chld = chld.Next {
+ // inner exploration is subject to recursive object exclusion
+ exploreElements(chld, mask, lev+1)
+ }
+ }
+
+ // start recursive exploration from current scope
+ exploreElements(curr, "", level)
+}
diff --git a/eutils/spdi.go b/eutils/spdi.go
index 40b7177..177e17d 100644
--- a/eutils/spdi.go
+++ b/eutils/spdi.go
@@ -320,3 +320,37 @@ func SequenceExtract(seq, featLoc string) string {
return buffer.String()
}
+
+// ReverseComplement returns the reverse complement of a sequence
+func ReverseComplement(seq string) string {
+
+ runes := []rune(seq)
+ // reverse sequence letters - middle base in odd-length sequence is not touched
+ for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
+ runes[i], runes[j] = runes[j], runes[i]
+ }
+ found := false
+ // now complement every base, also handling uracil, leaving case intact
+ for i, ch := range runes {
+ runes[i], found = revComp[ch]
+ if !found {
+ runes[i] = 'X'
+ }
+ }
+ seq = string(runes)
+
+ return seq
+}
+
+// SequenceReverse reverses a sequence, but does not complement the bases
+func SequenceReverse(seq string) string {
+
+ runes := []rune(seq)
+ // reverse sequence letters - middle base in odd-length sequence is not touched
+ for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
+ runes[i], runes[j] = runes[j], runes[i]
+ }
+ seq = string(runes)
+
+ return seq
+}
diff --git a/eutils/split.go b/eutils/split.go
index dcb940e..a01c866 100644
--- a/eutils/split.go
+++ b/eutils/split.go
@@ -34,20 +34,25 @@ import (
"strings"
)
-// PartitionPattern splits XML input from <pattern> to </pattern> and sends individual records to a callback
+// PartitionPattern splits XML input from <pattern> to </pattern> and sends
+// individual records to a callback. Requiring the input to be an XMLBlock
+// channel of trimmed strings, generated by CreateXMLStreamer, simplifies the
+// code by eliminating the need to check for an incomplete object tag at the end.
func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) {
if pat == "" || inp == nil || proc == nil {
return
}
+ // Scanner stores the precomputed Boyer-Moore-Horspool pattern matching table.
+ // By experiment, this was slightly (but reproducibly) faster than the Boyer-Moore-Sunday variant.
type Scanner struct {
Pattern string
PatLength int
CharSkip [256]int
}
- // initialize <pattern> to </pattern> scanner
+ // newScanner initializes <pattern> to </pattern> scanner.
newScanner := func(pattern string) *Scanner {
if pattern == "" {
@@ -74,7 +79,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
return scr
}
- // check surroundings of match candidate
+ // isAnElement checks surroundings of match candidate.
isAnElement := func(text string, lf, rt, mx int) bool {
if (lf >= 0 && text[lf] == '<') || (lf > 0 && text[lf] == '/' && text[lf-1] == '<') {
@@ -86,7 +91,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
return false
}
- // modified Boyer-Moore-Horspool search function
+ // findNextMatch is a modified Boyer-Moore-Horspool search function for maximum partitioning speed.
findNextMatch := func(scr *Scanner, text string, offset int) (int, int, int) {
if scr == nil || text == "" {
@@ -132,8 +137,10 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
return -1, -1, -1
}
+ // PatternType is the integer type for XML tag classification
type PatternType int
+ // PatternType keys for XML parsing
const (
NOPATTERN PatternType = iota
STARTPATTERN
@@ -141,7 +148,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
STOPPATTERN
)
- // find next element with pattern name
+ // nextPattern finds next element with pattern name.
nextPattern := func(scr *Scanner, text string, pos int) (PatternType, int, int, int) {
if scr == nil || text == "" {
@@ -168,8 +175,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
}
}
- // -pattern Object construct
-
+ // doNormal handles -pattern Object construct, keeping track of nesting level.
doNormal := func() {
// current depth of -pattern objects
@@ -239,9 +245,9 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
}
}
- // -pattern Parent/* construct now works with catenated files, but not if components
- // are recursive or self-closing objects, process those through -format first
-
+ // doStar handles -pattern Parent/* construct for heterogeneous objects. It now works
+ // with concatenated files, but not if components are recursive or self-closing objects.
+ // Process the latter through transmute -format -self first.
doStar := func() {
// current depth of -pattern objects
@@ -302,7 +308,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
}
if tag[0] == '/' {
if strings.HasPrefix(tag[1:], pat) {
- //should be </pattern> at end, want to continue if catenated files
+ // should be </pattern> at end, want to continue if concatenated files
return "/"
}
return ""
@@ -335,7 +341,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string))
return
}
- // check for catenated parent set files
+ // check for concatenated parent set files
if tag[0] == '/' {
scr = newScanner(pat)
if scr == nil {
diff --git a/eutils/utils.go b/eutils/utils.go
index 7ec3638..a2f8cb6 100644
--- a/eutils/utils.go
+++ b/eutils/utils.go
@@ -34,12 +34,10 @@ import (
"fmt"
"github.com/klauspost/cpuid"
"github.com/pbnjay/memory"
- "io"
"os"
"runtime"
"runtime/debug"
"strconv"
- "strings"
"time"
)
@@ -83,8 +81,6 @@ var (
doASCII bool
doCompress bool
doCleanup bool
- doStem bool
- deStop bool
)
// additional options
@@ -189,7 +185,7 @@ func SetTunings(nmProcs, nmServe, svRatio, chnDepth, frmSize, hepSize, gogc int)
}
// SetOptions sets processing options
-func SetOptions(strict, mixed, accent, ascii, compress, cleanup, stems, stops, count bool) {
+func SetOptions(strict, mixed, accent, ascii, compress, cleanup bool) {
doStrict = strict
doMixed = mixed
@@ -200,10 +196,7 @@ func SetOptions(strict, mixed, accent, ascii, compress, cleanup, stems, stops, c
doCompress = compress
doCleanup = cleanup
- doStem = stems
- deStop = !stops
-
- countLines = count
+ countLines = false
// set dependent flags
countLines = doMixed
@@ -223,122 +216,16 @@ func NumServe() int {
return numServe
}
-// DoStrict returns the -strict value
-func DoStrict() bool {
-
- return doStrict
-}
-
-// DoMixed returns the -mixed value
-func DoMixed() bool {
-
- return doMixed
-}
-
-// DeAccent returns the -accent value
-func DeAccent() bool {
-
- return deAccent
-}
-
-// DoASCII returns the -ascii value
-func DoASCII() bool {
-
- return doASCII
-}
-
-// DoCompress returns the -compress value
-func DoCompress() bool {
-
- return doCompress
-}
-
-// DeStop returns the -stops value
-func DeStop() bool {
-
- return deStop
-}
-
-// DoStem returns the -stems value
-func DoStem() bool {
-
- return doStem
-}
-
-// stringChanReader connect a string output channel to an io.Reader interface
-type stringChanReader struct {
- c <-chan string
- s string
-}
-
-func (r *stringChanReader) Read(b []byte) (n int, err error) {
-
- if r.s != "" {
- n = copy(b, []byte(r.s))
- r.s = r.s[n:]
- return
- }
-
- for str := range r.c {
- r.s = str
- n = copy(b, []byte(r.s))
- r.s = r.s[n:]
- return
- }
-
- return 0, io.EOF
-}
-
-// ChanToIoReader converts a string channel to an ioReader
-func ChanToIoReader(inp <-chan string) io.Reader {
+// GetTunings returns performance parameter values
+func GetTunings() (nmProcs, nmServe, svRatio, chnDepth, frmSize, hepSize, gogc int) {
- if inp == nil {
- return nil
- }
-
- return &stringChanReader{c: inp, s: ""}
+ return numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc
}
-// ChanToStdout sends a string channel to stdout
-func ChanToStdout(inp <-chan string) {
-
- if inp == nil {
- return
- }
-
- last := ""
-
- for str := range inp {
- last = str
- os.Stdout.WriteString(str)
- }
-
- if !strings.HasSuffix(last, "\n") {
- os.Stdout.WriteString("\n")
- }
-}
-
-// ChanToString converts a string channel to a string
-func ChanToString(inp <-chan string) string {
-
- if inp == nil {
- return ""
- }
-
- var buffer strings.Builder
-
- last := ""
-
- for str := range inp {
- last = str
- buffer.WriteString(str)
- }
-
- if !strings.HasSuffix(last, "\n") {
- buffer.WriteString("\n")
- }
+// GetOptions returns processing option values
+func GetOptions() (strict, mixed, accent, ascii, compress, cleanup bool) {
- return buffer.String()
+ return doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup
}
// GetNumericArg returns an integer argument, reporting an error if no remaining arguments
@@ -556,7 +443,7 @@ func init() {
inAsnBits['\''] = false
// initialize reading and cleaning options with default values
- SetOptions(false, false, false, false, false, false, false, false, false)
+ SetOptions(false, false, false, false, false, false)
// initialize performance tuning variables with default values
SetTunings(0, 0, 0, 0, 0, 0, 0)
diff --git a/eutils/valid.go b/eutils/valid.go
index 0c30530..9bddf45 100644
--- a/eutils/valid.go
+++ b/eutils/valid.go
@@ -45,7 +45,7 @@ func ValidateXML(rdr <-chan XMLBlock, fnd string, html bool) int {
countLines = true
- tknq := CreateTokenizer("", "", rdr)
+ tknq := CreateTokenizer(rdr)
if tknq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create validator tokenizer\n")
diff --git a/eutils/xml.go b/eutils/xml.go
index aed00f3..6a4be7c 100644
--- a/eutils/xml.go
+++ b/eutils/xml.go
@@ -38,10 +38,14 @@ import (
"os"
)
-// XMLBlock is a string with a leading left angle bracket and trailing right angle bracket
+// XMLBlock is a string that begins with a left angle bracket and is trimmed back to
+// end with a right angle bracket. The excluded characters are saved and prepended
+// to the next buffer. Providing complete object tags simplifies subsequent parsing.
type XMLBlock string
-// CreateXMLStreamer reads XML input file into a channel of trimmed blocks
+// CreateXMLStreamer reads XML input into a channel of trimmed strings that are
+// then split by PartitionPattern into individual records (which can be processed
+// concurrently), or parsed directly into a channel of tokens by CreateTokenizer.
func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
if in == nil {
@@ -54,15 +58,17 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
os.Exit(1)
}
- // xmlReader sends XML blocks through channel
+ // xmlReader sends trimmed XML blocks through the output channel.
xmlReader := func(in io.Reader, out chan<- XMLBlock) {
// close channel when all blocks have been processed
defer close(out)
- // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin
- // increasing size of buffer when input is from a file does not improve program performance
- // additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read
+ // 65536 appears to be the maximum number of characters presented to io.Reader
+ // when input is piped from stdin. Increasing the buffer size when input is from
+ // a file does not improve program performance. An additional 16384 bytes are
+ // reserved for copying the previous remainder to the beginning of the buffer
+ // before the next read.
const XMLBUFSIZE = 65536 + 16384
buffer := make([]byte, XMLBUFSIZE)
@@ -71,10 +77,15 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
delta := 0
isClosed := false
+ // htmlBehind is used in strict mode to trim back further when a lower-case tag
+ // is encountered. This may be a formatting decoration, such as <i> or </i> for
+ // italics. Processing HTML, which may have embedded mixed content, requires use
+ // of mixed mode.
htmlBehind := func(bufr []byte, pos, txtlen int) bool {
for pos >= 0 {
if bufr[pos] == '<' {
+ // detect lower-case markup tags, or DispFormula in PubMed
return HTMLAhead(string(bufr), pos, txtlen) != 0
}
pos--
@@ -83,7 +94,10 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return false
}
- // read one buffer, trim at last > and retain remainder for next call, signal if no > character
+ // nextBuffer reads one buffer, trims back to the right-most > character, and
+ // retains the remainder for prepending in the next call. It also signals if
+ // there was no > character, resulting in subsequent calls to nextBuffer to
+ // continue reading a large content string.
nextBuffer := func() ([]byte, bool, bool) {
if isClosed {
@@ -94,30 +108,34 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
m := copy(buffer, remainder)
remainder = ""
if m > 16384 {
- // previous remainder is larger than reserved section, write and signal need to continue reading
+ // previous remainder is larger than reserved section,
+ // write and signal the need to continue reading.
return buffer[:m], true, false
}
// read next block, append behind copied remainder from previous read
n, err := in.Read(buffer[m:])
- // with data piped through stdin, read function may not always return the same number of bytes each time
+ // with data piped through stdin, read function may not always return the
+ // same number of bytes each time
if err != nil {
if err != io.EOF {
- // real error
+ // real error.
fmt.Fprintf(os.Stderr, "\nERROR: %s\n", err.Error())
- // Ignore bytes - non-conforming implementations of io.Reader may returned mangled data on non-EOF errors
+ // ignore bytes - non-conforming implementations of io.Reader may
+ // return mangled data on non-EOF errors
isClosed = true
return nil, false, true
}
- // end of file
+ // end of file.
isClosed = true
if n == 0 {
- // if EOF and no more data, do not send final remainder (not terminated by right angle bracket that is used as a sentinel)
+ // if EOF and no more data, do not send final remainder (not terminated
+ // by right angle bracket that is used as a sentinel)
return nil, false, true
}
}
if n < 0 {
- // Reality check - non-conforming implementations of io.Reader may return -1
+ // reality check - non-conforming implementations of io.Reader may return -1
fmt.Fprintf(os.Stderr, "\nERROR: io.Reader returned negative count %d\n", n)
// treat as n == 0 in order to update file offset and avoid losing previous remainder
n = 0
@@ -130,13 +148,14 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
// slice of actual characters read
bufr := buffer[:n+m]
- // look for last > character
- // safe to back up on UTF-8 rune array when looking for 7-bit ASCII character
+ // Look for last > character. It is safe to back up on UTF-8 rune array when looking
+ // for a 7-bit ASCII character.
pos := -1
for pos = len(bufr) - 1; pos >= 0; pos-- {
if bufr[pos] == '>' {
if doStrict {
- // optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags, and MathML
+ // optionally skip backwards past embedded i, b, u, sub, and sup
+ // HTML open, close, and empty tags, and MathML instructions
if htmlBehind(bufr, pos, len(bufr)) {
continue
}
@@ -157,8 +176,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return bufr[:], true, false
}
- // nextBlock reads buffer, concatenates if necessary to place long element content into a single string
- // all result strings end in > character that is used as a sentinel in subsequent code
+ // nextBlock reads buffer, concatenates if necessary to place long element content
+ // into a single string. All result strings end in > character that is used as a
+ // sentinel in subsequent code.
nextBlock := func() string {
// read next buffer
@@ -169,8 +189,8 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return ""
}
- // if buffer does not end with > character
if cont {
+ // current line does not end with > character
var buff bytes.Buffer
// keep reading long content blocks
@@ -202,7 +222,7 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
// trimming spaces here would throw off line tracking
- // optionally compress/cleanup tags/attributes and contents (undocumented)
+ // optionally compress/cleanup tags/attributes and contents
if doCleanup {
if HasBadSpace(str) {
str = CleanupBadSpaces(str)
@@ -227,7 +247,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return out
}
-// XMLRecord structure wraps a numbered XML record sent down a channel
+// XMLRecord wraps a numbered XML record or the results of data extraction on
+// that record. The Index field stores the record's original position in the
+// input stream. The Data field is used for binary compressed PubmedArticle XML.
type XMLRecord struct {
Index int
Ident string
@@ -235,30 +257,9 @@ type XMLRecord struct {
Data []byte
}
-type xmlRecordHeap []XMLRecord
-
-// methods that satisfy heap.Interface
-func (h xmlRecordHeap) Len() int {
- return len(h)
-}
-func (h xmlRecordHeap) Less(i, j int) bool {
- return h[i].Index < h[j].Index
-}
-func (h xmlRecordHeap) Swap(i, j int) {
- h[i], h[j] = h[j], h[i]
-}
-func (h *xmlRecordHeap) Push(x interface{}) {
- *h = append(*h, x.(XMLRecord))
-}
-func (h *xmlRecordHeap) Pop() interface{} {
- old := *h
- n := len(old)
- x := old[n-1]
- *h = old[0 : n-1]
- return x
-}
-
-// CreateXMLProducer partitions an XML set and sends it down a channel
+// CreateXMLProducer partitions an XML set and sends records down a channel.
+// After processing asynchronously in multiple concurrent go routines, the
+// original order can be restored by passage through the XMLUnshuffler.
func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
if rdr == nil {
@@ -271,7 +272,7 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
os.Exit(1)
}
- // xmlProducer sends partitioned XML strings through channel
+ // xmlProducer sends partitioned XML strings through channel.
xmlProducer := func(pat, star string, rdr <-chan XMLBlock, out chan<- XMLRecord) {
// close channel when all records have been processed
@@ -293,7 +294,32 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
return out
}
-// CreateXMLUnshuffler uses heap to restore output of multiple consumers to original record order
+// xmlRecordHeap collects asynchronous processing results for presentation in the original order.
+type xmlRecordHeap []XMLRecord
+
+// methods that satisfy heap.Interface
+func (h xmlRecordHeap) Len() int {
+ return len(h)
+}
+func (h xmlRecordHeap) Less(i, j int) bool {
+ return h[i].Index < h[j].Index
+}
+func (h xmlRecordHeap) Swap(i, j int) {
+ h[i], h[j] = h[j], h[i]
+}
+func (h *xmlRecordHeap) Push(x interface{}) {
+ *h = append(*h, x.(XMLRecord))
+}
+func (h *xmlRecordHeap) Pop() interface{} {
+ old := *h
+ n := len(old)
+ x := old[n-1]
+ *h = old[0 : n-1]
+ return x
+}
+
+// CreateXMLUnshuffler passes the output of multiple concurrent processors to
+// a heap, which releases results in the same order as the original records.
func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
if inp == nil {
@@ -306,7 +332,7 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
os.Exit(1)
}
- // xmlUnshuffler restores original order with heap
+ // xmlUnshuffler restores original order with heap.
xmlUnshuffler := func(inp <-chan XMLRecord, out chan<- XMLRecord) {
// close channel when all records have been processed
@@ -326,7 +352,8 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
// push result onto heap
heap.Push(hp, ext)
- // read several values before checking to see if next record to print has been processed
+ // Read several values before checking to see if next record to print has been processed.
+ // The default heapSize value has been tuned by experiment for maximum performance.
if delay < heapSize {
delay++
continue
@@ -356,11 +383,11 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
next++
}
- // keep checking heap to see if next result is already available
+ // continue to check heap to see if next result is already available
}
}
- // send remainder of heap to output
+ // flush remainder of heap to output
for hp.Len() > 0 {
curr := heap.Pop(hp).(XMLRecord)
diff --git a/exclude-uid-lists b/exclude-uid-lists
index 3d21831..0c2792b 100755
--- a/exclude-uid-lists
+++ b/exclude-uid-lists
@@ -1,5 +1,8 @@
#!/bin/bash -norc
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
# Usage: exclude-uid-lists FILE1 FILE2
comm -23 <(sort -f "$1") <(sort -f "$2") | sort -n
diff --git a/expand-current b/expand-current
index 1d00fc2..2a7e1fa 100755
--- a/expand-current
+++ b/expand-current
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
while [ $# -gt 0 ]
do
case "$1" in
diff --git a/fetch-pubmed b/fetch-pubmed
index 4984cb0..c4346d2 100755
--- a/fetch-pubmed
+++ b/fetch-pubmed
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
doall=false
dofresh=false
flag="none"
diff --git a/filter-stop-words b/filter-stop-words
index 51a3fa2..82ed462 100755
--- a/filter-stop-words
+++ b/filter-stop-words
@@ -1,5 +1,8 @@
#!/bin/bash -norc
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
stop_words="#a#about#above#abs#accordingly#across#after#afterwards#again#\
against#all#almost#alone#along#already#also#although#always#am#among#\
amongst#an#analyze#and#another#any#anyhow#anyone#anything#anywhere#\
diff --git a/filter-table b/filter-table
new file mode 100755
index 0000000..8be2240
--- /dev/null
+++ b/filter-table
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
+# MUST be called with single quotes, e.g.:
+# filter-table '10 <= $2 && $2 <= 30'
+awk -F '\t' -v 'OFS=\t' "( $* ) {print}"
diff --git a/gbf2xml b/gbf2xml
index 61e431b..874c3e1 100755
--- a/gbf2xml
+++ b/gbf2xml
@@ -1,3 +1,6 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
transmute -g2x "$@"
diff --git a/hlp-xtract.txt b/hlp-xtract.txt
index a259e9f..df9944d 100644
--- a/hlp-xtract.txt
+++ b/hlp-xtract.txt
@@ -119,6 +119,14 @@ Citation Lookup
efilter -days 365 |
efetch -format abstract
+Stopwords and Stemming
+
+ pm=$( efetch -db pubmed -id 2005826 -format xml )
+ echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle
+
DOI Extraction
esearch -db pubmed -query "Rowley JD [AUTH]" |
@@ -347,7 +355,8 @@ Genome Range
-min ChrStart,ChrStop -element "&NAME" "&DESC" |
sort -k 1,1n | cut -f 2- |
grep -v pseudogene | grep -v uncharacterized |
- between-two-genes ASMT IL3RA
+ between-two-genes ASMT IL3RA |
+ align-columns -g 4
IL3RA interleukin 3 receptor subunit alpha
SLC25A6 solute carrier family 25 member 6
@@ -361,7 +370,7 @@ Genome Range
Centromere Position
nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
- grep acen | cut -f 1,2,6,7 | grep "^X\t"
+ grep acen | cut -f 1,2,6,7 | grep "^X"
X p 58100001 61000000
X q 61000001 63800000
diff --git a/index-extras b/index-extras
index adbe5cb..5359443 100755
--- a/index-extras
+++ b/index-extras
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
while [ $# -gt 0 ]
do
case "$1" in
diff --git a/index-pubmed b/index-pubmed
index ce6e7cf..003b33d 100755
--- a/index-pubmed
+++ b/index-pubmed
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
startat=0
while [ $# -gt 0 ]
diff --git a/intersect-uid-lists b/intersect-uid-lists
index 23cc2f0..61281b5 100755
--- a/intersect-uid-lists
+++ b/intersect-uid-lists
@@ -1,5 +1,8 @@
#!/bin/bash -norc
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
# Usage: intersect-uid-lists FILE1 FILE2
comm -12 <(sort -f "$1") <(sort -f "$2") | sort -n
diff --git a/join-into-groups-of b/join-into-groups-of
index 22bb6c4..7832a13 100755
--- a/join-into-groups-of
+++ b/join-into-groups-of
@@ -1,3 +1,7 @@
#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
xargs -n "$@" echo |
sed 's/ /,/g'
diff --git a/pm-collect b/pm-collect
index d8d2641..a59c78d 100755
--- a/pm-collect
+++ b/pm-collect
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for archive files"
diff --git a/pm-index b/pm-index
index 3255da9..38950c1 100755
--- a/pm-index
+++ b/pm-index
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for indexed files"
diff --git a/pm-invert b/pm-invert
index 8dd355b..acc0520 100755
--- a/pm-invert
+++ b/pm-invert
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for inverted files"
diff --git a/pm-merge b/pm-merge
index 662b52a..93e3588 100755
--- a/pm-merge
+++ b/pm-merge
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for merged files"
diff --git a/pm-prepare b/pm-prepare
index ef8a070..b8ee767 100755
--- a/pm-prepare
+++ b/pm-prepare
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path to archive files"
diff --git a/pm-promote b/pm-promote
index c98ead9..ce91ff0 100755
--- a/pm-promote
+++ b/pm-promote
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for postings files"
diff --git a/pm-refresh b/pm-refresh
index 339a35f..e67ef8f 100755
--- a/pm-refresh
+++ b/pm-refresh
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path for archive files"
diff --git a/pm-stash b/pm-stash
index cd651f0..919b34f 100755
--- a/pm-stash
+++ b/pm-stash
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
if [ "$#" -eq 0 ]
then
echo "Must supply path to archive files"
diff --git a/print-columns b/print-columns
new file mode 100755
index 0000000..42aeb6f
--- /dev/null
+++ b/print-columns
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
+# MUST be called with single quotes, e.g.:
+# print-columns '$1, $2+1, $3, $4-1, $5'
+awk -F '\t' -v 'OFS=\t' "{print $*}"
diff --git a/reorder-columns b/reorder-columns
index eaa0b9f..f8fc2dd 100755
--- a/reorder-columns
+++ b/reorder-columns
@@ -1,12 +1,15 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
# Usage: reorder-columns COLUMN NUMBERS...
cmd=""
com="$"
for col in "$@"
do
- cmd=`echo "$cmd$com$col"`
+ cmd=$( echo "$cmd$com$col" )
com=", $"
done
awk -F '\t' -v 'OFS=\t' "{print $cmd}"
diff --git a/run-ncbi-converter b/run-ncbi-converter
index c8a0f20..222e8c7 100755
--- a/run-ncbi-converter
+++ b/run-ncbi-converter
@@ -1,4 +1,8 @@
#!/usr/bin/env perl
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
use warnings;
use strict;
diff --git a/setup-deps.pl b/setup-deps.pl
index f2ac7d2..081a406 100755
--- a/setup-deps.pl
+++ b/setup-deps.pl
@@ -1,4 +1,8 @@
#!/usr/bin/env perl
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
use warnings;
use strict;
use CPAN::MyConfig;
diff --git a/setup.sh b/setup.sh
index 5976e46..4707f7c 100755
--- a/setup.sh
+++ b/setup.sh
@@ -1,5 +1,8 @@
#!/bin/bash -norc
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
PERL=perl
case "`uname -s`" in
Darwin )
diff --git a/skip-if-file-exists b/skip-if-file-exists
index 0cb450e..5759fb8 100755
--- a/skip-if-file-exists
+++ b/skip-if-file-exists
@@ -1,4 +1,8 @@
#!/bin/bash -norc
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
while IFS=$'\t' read fl
do
if [ ! -f "$fl" ]
diff --git a/sort-table b/sort-table
new file mode 100755
index 0000000..b632ef4
--- /dev/null
+++ b/sort-table
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
+sort -t "$(printf '\t')" "$@"
diff --git a/sort-uniq-count b/sort-uniq-count
index 1fcdd8e..69fc193 100755
--- a/sort-uniq-count
+++ b/sort-uniq-count
@@ -1,13 +1,18 @@
#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
flags="f"
if [ -n "$*" ]
then
- flags=`echo " $*" | sed 's/[^bfinrs]//g'`
+ flags=$( echo " $*" | sed 's/[^bfinrs]//g' )
if [ -z "$flags" ]
then
flags="s"
fi
fi
+grep '.' |
sort "-$flags" |
uniq -i -c |
awk '{ n=$1; sub(/[ \t]*[0-9]+[ \t]/, ""); print n "\t" $0 }'
diff --git a/sort-uniq-count-rank b/sort-uniq-count-rank
index 7aa67c5..b363daf 100755
--- a/sort-uniq-count-rank
+++ b/sort-uniq-count-rank
@@ -1,13 +1,18 @@
#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
flags="f"
if [ -n "$*" ]
then
- flags=`echo " $*" | sed 's/[^bfinrs]//g'`
+ flags=$( echo " $*" | sed 's/[^bfinrs]//g' )
if [ -z "$flags" ]
then
flags="s"
fi
fi
+grep '.' |
sort "-$flags" |
uniq -i -c |
awk '{ n=$1; sub(/[ \t]*[0-9]+[ \t]/, ""); print n "\t" $0 }' |
diff --git a/stream-pubmed b/stream-pubmed
index 0419ee3..b3d9e60 100755
--- a/stream-pubmed
+++ b/stream-pubmed
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
flag="none"
while [ $# -gt 0 ]
diff --git a/test-edirect b/test-edirect
index 3b1c5fa..0af9687 100755
--- a/test-edirect
+++ b/test-edirect
@@ -198,6 +198,14 @@ PrintTimeAndTitle "Citation Lookup"
efilter -days 365 |
efetch -format abstract
+PrintTimeAndTitle "Stopwords and Stemming"
+
+ pm=$( efetch -db pubmed -id 2005826 -format xml )
+ echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle
+ echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle
+
PrintTimeAndTitle "DOI Extraction"
esearch -db pubmed -query "Rowley JD [AUTH]" |
@@ -313,12 +321,13 @@ PrintTimeAndTitle "Genome Range"
-min ChrStart,ChrStop -element "&NAME" "&DESC" |
sort -k 1,1n | cut -f 2- |
grep -v pseudogene | grep -v uncharacterized |
- between-two-genes ASMT IL3RA
+ between-two-genes ASMT IL3RA |
+ align-columns -g 4
PrintTimeAndTitle "Centromere Position"
nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
- grep acen | cut -f 1,2,6,7 | grep "^X\t"
+ grep acen | cut -f 1,2,6,7 | grep "^X"
PrintTimeAndTitle "Gene Regions"
@@ -442,6 +451,15 @@ PrintTimeAndTitle "Structural Similarity"
-if PdbClass -equals Hydrolase \
-element PdbAcc PdbDescr
+PrintTimeAndTitle "Underscore Protection"
+
+ esearch -db biosample -query "package metagenome or environmental version 1 0 [PROP]" |
+ xtract -pattern ENTREZ_DIRECT -element Count
+
+ esearch -db assembly -query "algae [ORGN] AND complete genome [FILT]" |
+ efilter -query "refseq has annotation [PROP] NOT anomalous [FILT]" |
+ xtract -pattern ENTREZ_DIRECT -element Count
+
PrintTimeAndTitle "Amino Acid Substitutions"
esearch -db gene -query "OPN1MW [PREF] AND human [ORGN]" |
diff --git a/test-pubmed-index b/test-pubmed-index
index 554cc04..26675c6 100755
--- a/test-pubmed-index
+++ b/test-pubmed-index
@@ -1,5 +1,8 @@
#!/bin/bash
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
seconds_start=$(date "+%s")
for i in {1..100}
do
diff --git a/theme-aliases b/theme-aliases
index a903480..f35ca0f 100755
--- a/theme-aliases
+++ b/theme-aliases
@@ -1,5 +1,8 @@
#!/bin/bash
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
alias ChemCodeToName='phrase-search -convert chem Code Name'
alias ChemCodeToTerm='phrase-search -convert chem Code Term'
alias ChemCodeToTree='phrase-search -convert chem Code Tree'
diff --git a/tst-elink.txt b/tst-elink.txt
index aeff820..498dd81 100644
--- a/tst-elink.txt
+++ b/tst-elink.txt
@@ -1,3 +1,4 @@
+assembly nuccore 9513491
cdd pubmed 274590
gds pubmed 1336
gds taxonomy 1336
diff --git a/word-at-a-time b/word-at-a-time
index 53ed73c..b72a2b1 100755
--- a/word-at-a-time
+++ b/word-at-a-time
@@ -1,4 +1,8 @@
#!/bin/bash -norc
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
sed 's/[^a-zA-Z0-9]/ /g; s/^ *//' |
tr 'A-Z' 'a-z' |
fmt -w 1
diff --git a/xml2tbl b/xml2tbl
index e6c0470..c8fa488 100755
--- a/xml2tbl
+++ b/xml2tbl
@@ -1,5 +1,8 @@
#!/bin/sh
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
xtract -pattern INSDSeq -pfx ">Feature " \
-first INSDSeqid,INSDSeq_accession-version \
-group INSDFeature -FKEY INSDFeature_key \
diff --git a/xy-plot b/xy-plot
index 7990c4f..f5c733a 100755
--- a/xy-plot
+++ b/xy-plot
@@ -1,4 +1,8 @@
#!/bin/sh
+
+# Public domain notice for all NCBI EDirect scripts is located at:
+# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE
+
# For Mac, please obtain command-line-enabled Plot2x from http://apps.micw.org/apps/plot2/downloads.php
# For Unix or PC/Cygwin, please obtain gnuplot from http://gnuplot.sourceforge.net/download.html
plot2x=