diff options
68 files changed, 1555 insertions, 1311 deletions
@@ -64,7 +64,7 @@ Transmute converts a concatenated stream of JSON objects or other structured for Xtract can use waypoints to navigate a complex XML hierarchy and obtain data values by field name: - xtract -pattern entities -group P527 -block datavalue -element id | + xtract -pattern entities -group P527/mainsnak -block datavalue -element id | The resulting output can be post-processed by Unix utilities or scripts: @@ -924,6 +924,10 @@ Information on how to obtain an API Key is described in this NCBI blogpost: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities +The Public Domain Notice for all NCBI EDirect scripts is located at: + + https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + Questions or comments on EDirect may be sent to info@ncbi.nlm.nih.gov. This research was supported by the Intramural Research Program of the National Library of Medicine at the NIH. diff --git a/accn-at-a-time b/accn-at-a-time index 000c68b..adc6994 100755 --- a/accn-at-a-time +++ b/accn-at-a-time @@ -1,4 +1,8 @@ #!/bin/bash -norc + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + sed 's/[^a-zA-Z0-9_.]/ /g; s/^ *//' | tr 'A-Z' 'a-z' | fmt -w 1 diff --git a/align-columns b/align-columns index b149804..87b31fa 100755 --- a/align-columns +++ b/align-columns @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + # inspired by Steve Kinzler's align script - see http://kinzler.com/me/align/ # requires tab-delimited input, output aligned by padding with spaces diff --git a/amino-acid-composition b/amino-acid-composition index 177276a..cc84e2c 100755 --- a/amino-acid-composition +++ b/amino-acid-composition @@ -1,4 +1,8 @@ #!/bin/bash -norc + +# Public domain notice for all 
NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile \ Xle Lys Leu Met Asn Pyl Pro Gln Arg \ Ser Thr Sec Val Trp Xxx Tyr Glx ) diff --git a/archive-pubmed b/archive-pubmed index b7e2141..706f72a 100755 --- a/archive-pubmed +++ b/archive-pubmed @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + while [ $# -gt 0 ] do case "$1" in diff --git a/between-two-genes b/between-two-genes index b2c78f2..673f159 100755 --- a/between-two-genes +++ b/between-two-genes @@ -1,2 +1,6 @@ #!/bin/bash -norc + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + awk -F '\t' -v 'OFS=\t' "/^$1\t/{a++}/^$2\t/{a++}a>0{print}a>1{exit}" diff --git a/cmd/rchive.go b/cmd/rchive.go index e443822..af3db22 100644 --- a/cmd/rchive.go +++ b/cmd/rchive.go @@ -387,6 +387,11 @@ Execution Profiling go tool pprof --pdf ./cpu.pprof > ./callgraph.pdf ` +var ( + doStem bool + deStop bool +) + var idxFields = [12]string{ "CHEM", "CODE", @@ -402,18 +407,20 @@ var idxFields = [12]string{ "YEAR", } +// Master points to a term and to its postings data type Master struct { TermOffset int32 PostOffset int32 } +// Arrays contains postings lists and word offsets type Arrays struct { Data []int32 Ofst [][]int16 Dist int } -func ReportEncodedMarkup(typ, id, str string) { +func reportEncodedMarkup(typ, id, str string) { var buffer strings.Builder @@ -599,8 +606,8 @@ func ReportEncodedMarkup(typ, id, str string) { // DIRECTORY PATH UTILITIES -// MakeArchiveTrie allows a short prefix of letters with an optional underscore, and splits the remainder into character pairs -func MakeArchiveTrie(str string, arry [132]rune) string { +// makeArchiveTrie allows 
a short prefix of letters with an optional underscore, and splits the remainder into character pairs +func makeArchiveTrie(str string, arry [132]rune) string { if len(str) > 64 { return "" @@ -697,8 +704,8 @@ func MakeArchiveTrie(str string, arry [132]rune) string { return strings.ToUpper(res) } -// MakePostingsTrie splits a string into characters, separated by path delimiting slashes -func MakePostingsTrie(str string, arry [516]rune) string { +// makePostingsTrie splits a string into characters, separated by path delimiting slashes +func makePostingsTrie(str string, arry [516]rune) string { if len(str) > 256 { return "" @@ -889,7 +896,7 @@ var mergLen = map[string]int{ "tre": 4, } -func PostingDir(term string) string { +func postingDir(term string) string { if len(term) < 3 { return term @@ -910,7 +917,7 @@ func PostingDir(term string) string { return term[:3] } -func IdentifierKey(term string) string { +func identifierKey(term string) string { // remove punctuation from term key := strings.Map(func(c rune) rune { @@ -924,17 +931,17 @@ func IdentifierKey(term string) string { key = strings.Replace(key, "-", "_", -1) // use first 2, 3, or 4 characters of identifier for directory - key = PostingDir(key) + key = postingDir(key) return key } -func PostingPath(prom, field, term string, arry [516]rune) (string, string) { +func postingPath(prom, field, term string, arry [516]rune) (string, string) { // use first few characters of identifier for directory - dir := IdentifierKey(term) + dir := identifierKey(term) - trie := MakePostingsTrie(dir, arry) + trie := makePostingsTrie(dir, arry) if trie == "" { return "", "" } @@ -944,7 +951,7 @@ func PostingPath(prom, field, term string, arry [516]rune) (string, string) { return dpath, dir } -func CommonOpenFile(dpath, fname string) (*os.File, int64) { +func commonOpenFile(dpath, fname string) (*os.File, int64) { fpath := path.Join(dpath, fname) if fpath == "" { @@ -971,9 +978,9 @@ func CommonOpenFile(dpath, fname string) 
(*os.File, int64) { return inFile, size } -func ReadMasterIndex(dpath, key, field string) []Master { +func readMasterIndex(dpath, key, field string) []Master { - inFile, size := CommonOpenFile(dpath, key+"."+field+".mst") + inFile, size := commonOpenFile(dpath, key+"."+field+".mst") if inFile == nil { return nil } @@ -994,9 +1001,9 @@ func ReadMasterIndex(dpath, key, field string) []Master { return data } -func ReadTermList(dpath, key, field string) []byte { +func readTermList(dpath, key, field string) []byte { - inFile, size := CommonOpenFile(dpath, key+"."+field+".trm") + inFile, size := commonOpenFile(dpath, key+"."+field+".trm") if inFile == nil { return nil } @@ -1017,9 +1024,9 @@ func ReadTermList(dpath, key, field string) []byte { return data } -func ReadPostingData(dpath, key, field string, offset int32, size int32) []int32 { +func readPostingData(dpath, key, field string, offset int32, size int32) []int32 { - inFile, _ := CommonOpenFile(dpath, key+"."+field+".pst") + inFile, _ := commonOpenFile(dpath, key+"."+field+".pst") if inFile == nil { return nil } @@ -1046,9 +1053,9 @@ func ReadPostingData(dpath, key, field string, offset int32, size int32) []int32 return data } -func ReadPositionIndex(dpath, key, field string, offset int32, size int32) []int32 { +func readPositionIndex(dpath, key, field string, offset int32, size int32) []int32 { - inFile, _ := CommonOpenFile(dpath, key+"."+field+".uqi") + inFile, _ := commonOpenFile(dpath, key+"."+field+".uqi") if inFile == nil { return nil } @@ -1075,9 +1082,9 @@ func ReadPositionIndex(dpath, key, field string, offset int32, size int32) []int return data } -func ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16 { +func readOffsetData(dpath, key, field string, offset int32, size int32) []int16 { - inFile, _ := CommonOpenFile(dpath, key+"."+field+".ofs") + inFile, _ := commonOpenFile(dpath, key+"."+field+".ofs") if inFile == nil { return nil } @@ -1104,7 +1111,7 @@ func 
ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16 return data } -func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master { +func readMasterIndexFuture(dpath, key, field string) <-chan []Master { out := make(chan []Master, eutils.ChanDepth()) if out == nil { @@ -1115,7 +1122,7 @@ func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master { // masterIndexFuture asynchronously gets the master file and sends results through channel masterIndexFuture := func(dpath, key, field string, out chan<- []Master) { - data := ReadMasterIndex(dpath, key, field) + data := readMasterIndex(dpath, key, field) out <- data @@ -1128,7 +1135,7 @@ func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master { return out } -func ReadTermListFuture(dpath, key, field string) <-chan []byte { +func readTermListFuture(dpath, key, field string) <-chan []byte { out := make(chan []byte, eutils.ChanDepth()) if out == nil { @@ -1139,7 +1146,7 @@ func ReadTermListFuture(dpath, key, field string) <-chan []byte { // termListFuture asynchronously gets posting IDs and sends results through channel termListFuture := func(dpath, key, field string, out chan<- []byte) { - data := ReadTermList(dpath, key, field) + data := readTermList(dpath, key, field) out <- data @@ -1152,21 +1159,21 @@ func ReadTermListFuture(dpath, key, field string) <-chan []byte { return out } -func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { +func getPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { var ( arry [516]rune ) - dpath, key := PostingPath(prom, field, term, arry) + dpath, key := postingPath(prom, field, term, arry) if dpath == "" { return nil, nil } // schedule asynchronous fetching - mi := ReadMasterIndexFuture(dpath, key, field) + mi := readMasterIndexFuture(dpath, key, field) - tl := ReadTermListFuture(dpath, key, field) + tl := readTermListFuture(dpath, key, field) // fetch master index and term list indx := <-mi 
@@ -1215,7 +1222,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { tlen := len(term) isWildCard = true term = strings.TrimSuffix(term, "*") - pdlen := len(PostingDir(term)) + pdlen := len(postingDir(term)) if tlen < pdlen { fmt.Fprintf(os.Stderr, "Wildcard term '%s' must be at least %d characters long - ignoring this word\n", term, pdlen) return nil, nil @@ -1243,7 +1250,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { size := indx[R].PostOffset - offset // read relevant postings list section - data := ReadPostingData(dpath, key, field, offset, size) + data := readPostingData(dpath, key, field, offset, size) if data == nil || len(data) < 1 { return nil, nil } @@ -1272,7 +1279,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { } // read relevant word position section, includes phantom offset at end - uqis := ReadPositionIndex(dpath, key, field, offset, size+4) + uqis := readPositionIndex(dpath, key, field, offset, size+4) if uqis == nil { return nil, nil } @@ -1285,7 +1292,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { to := uqis[ulen-1] // read offset section - ofst := ReadOffsetData(dpath, key, field, from, to-from) + ofst := readOffsetData(dpath, key, field, from, to-from) if ofst == nil { return nil, nil } @@ -1353,7 +1360,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { size := indx[R+1].PostOffset - offset // read relevant postings list section - data := ReadPostingData(dpath, key, field, offset, size) + data := readPostingData(dpath, key, field, offset, size) if data == nil || len(data) < 1 { return nil, nil } @@ -1363,7 +1370,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { } // read relevant word position section, includes phantom offset at end - uqis := ReadPositionIndex(dpath, key, field, offset, size+4) + uqis := readPositionIndex(dpath, 
key, field, offset, size+4) if uqis == nil { return nil, nil } @@ -1376,7 +1383,7 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { to := uqis[ulen-1] // read offset section - ofst := ReadOffsetData(dpath, key, field, from, to-from) + ofst := readOffsetData(dpath, key, field, from, to-from) if ofst == nil { return nil, nil } @@ -1401,18 +1408,18 @@ func GetPostingIDs(prom, term, field string, simple bool) ([]int32, [][]int16) { return nil, nil } -func PrintTermCount(base, term, field string) int { +func printTermCount(base, term, field string) int { - data, _ := GetPostingIDs(base, term, field, true) + data, _ := getPostingIDs(base, term, field, true) size := len(data) fmt.Fprintf(os.Stdout, "%d\t%s\n", size, term) return size } -func PrintTermCounts(base, term, field string) int { +func printTermCounts(base, term, field string) int { - pdlen := len(PostingDir(term)) + pdlen := len(postingDir(term)) if len(term) < pdlen { fmt.Fprintf(os.Stderr, "\nERROR: Term count argument must be at least %d characters\n", pdlen) @@ -1425,15 +1432,15 @@ func PrintTermCounts(base, term, field string) int { } var arry [516]rune - dpath, key := PostingPath(base, field, term, arry) + dpath, key := postingPath(base, field, term, arry) if dpath == "" { return 0 } // schedule asynchronous fetching - mi := ReadMasterIndexFuture(dpath, key, field) + mi := readMasterIndexFuture(dpath, key, field) - tl := ReadTermListFuture(dpath, key, field) + tl := readTermListFuture(dpath, key, field) // fetch master index and term list indx := <-mi @@ -1497,9 +1504,9 @@ func PrintTermCounts(base, term, field string) int { return count } -func PrintTermPositions(base, term, field string) int { +func printTermPositions(base, term, field string) int { - data, ofst := GetPostingIDs(base, term, field, false) + data, ofst := getPostingIDs(base, term, field, false) size := len(data) fmt.Fprintf(os.Stdout, "\n%d\t%s\n\n", size, term) @@ -1519,7 +1526,7 @@ func 
PrintTermPositions(base, term, field string) int { // BOOLEAN OPERATIONS FOR POSTINGS LISTS -func ExtendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta int, proc func(pn, pm []int16, dlt int16) []int16) ([]int32, [][]int16) { +func extendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta int, proc func(pn, pm []int16, dlt int16) []int16) ([]int32, [][]int16) { if proc == nil { return nil, nil @@ -1596,7 +1603,7 @@ func ExtendPositionalIDs(N []int32, np [][]int16, M []int32, mp [][]int16, delta return res, ofs } -func IntersectIDs(N, M []int32) []int32 { +func intersectIDs(N, M []int32) []int32 { if N == nil { return M @@ -1661,7 +1668,7 @@ func IntersectIDs(N, M []int32) []int32 { // if m * log(n) < m + n, binary search has fewer comparisons, but processor memory caches make linear algorithm faster /* -func IntersectBinary(N, M []int32) []int32 { +func intersectBinary(N, M []int32) []int32 { if N == nil { return M @@ -1713,7 +1720,7 @@ func IntersectBinary(N, M []int32) []int32 { } */ -func CombineIDs(N, M []int32) []int32 { +func combineIDs(N, M []int32) []int32 { if N == nil { return M @@ -1770,7 +1777,7 @@ func CombineIDs(N, M []int32) []int32 { return res } -func ExcludeIDs(N, M []int32) []int32 { +func excludeIDs(N, M []int32) []int32 { if N == nil { return nil @@ -1877,7 +1884,7 @@ func decodeFields(str string) string { return str } -func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays { +func postingIDsFuture(base, term, field string, dist int) <-chan Arrays { out := make(chan Arrays, eutils.ChanDepth()) if out == nil { @@ -1888,7 +1895,7 @@ func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays { // postingFuture asynchronously gets posting IDs and sends results through channel postingFuture := func(base, term, field string, dist int, out chan<- Arrays) { - data, ofst := GetPostingIDs(base, term, field, false) + data, ofst := getPostingIDs(base, term, field, false) out <- 
Arrays{Data: data, Ofst: ofst, Dist: dist} @@ -1901,7 +1908,7 @@ func PostingIDsFuture(base, term, field string, dist int) <-chan Arrays { return out } -func EvaluateQuery(base string, clauses []string) int { +func evaluateQuery(base string, clauses []string) int { if clauses == nil || clauses[0] == "" { return 0 @@ -2021,7 +2028,7 @@ func EvaluateQuery(base string, clauses []string) int { // efetch -format uid | phrase-search -query "[PIPE] AND L [THME]" var data []int32 // read UIDs from stdin - uidq := CreateUIDReader(os.Stdin) + uidq := createUIDReader(os.Stdin) for ext := range uidq { val, err := strconv.Atoi(ext.Text) @@ -2055,7 +2062,7 @@ func EvaluateQuery(base string, clauses []string) int { return nil, nil, 0 } term = strings.Replace(term, "_", " ", -1) - data, _ := GetPostingIDs(base, term, field, true) + data, _ := getPostingIDs(base, term, field, true) count++ return data, nil, 1 } @@ -2077,7 +2084,7 @@ func EvaluateQuery(base string, clauses []string) int { continue } - fetch := PostingIDsFuture(base, term, field, dist) + fetch := postingIDsFuture(base, term, field, dist) futures = append(futures, fetch) @@ -2116,7 +2123,7 @@ func EvaluateQuery(base string, clauses []string) int { for i := 1; i < len(intersect); i++ { // add subsequent words, keep starting positions of phrases that contain all words in proper position - data, ofst = ExtendPositionalIDs(data, ofst, intersect[i].Data, intersect[i].Ofst, intersect[i].Dist, phrasePositions) + data, ofst = extendPositionalIDs(data, ofst, intersect[i].Data, intersect[i].Ofst, intersect[i].Dist, phrasePositions) if len(data) < 1 { // bail if phrase not present return nil, nil, 0 @@ -2222,7 +2229,7 @@ func EvaluateQuery(base string, clauses []string) int { return nil, "" } // next phrase must be within specified distance after the previous phrase - data, ofst = ExtendPositionalIDs(data, ofst, next, noff, delta+dist, proximityPositions) + data, ofst = extendPositionalIDs(data, ofst, next, noff, delta+dist, 
proximityPositions) if len(data) < 1 { return nil, "" } @@ -2239,7 +2246,7 @@ func EvaluateQuery(base string, clauses []string) int { data, tkn := prox() for tkn == "!" { next, tkn = prox() - data = ExcludeIDs(data, next) + data = excludeIDs(data, next) } return data, tkn @@ -2252,7 +2259,7 @@ func EvaluateQuery(base string, clauses []string) int { data, tkn := excl() for tkn == "&" { next, tkn = excl() - data = IntersectIDs(data, next) + data = intersectIDs(data, next) } return data, tkn @@ -2265,7 +2272,7 @@ func EvaluateQuery(base string, clauses []string) int { data, tkn := term() for tkn == "|" { next, tkn = term() - data = CombineIDs(data, next) + data = combineIDs(data, next) } return data, tkn @@ -2308,7 +2315,7 @@ func EvaluateQuery(base string, clauses []string) int { // QUERY PARSING FUNCTIONS -func PrepareQuery(str string) string { +func prepareQuery(str string) string { if str == "" { return "" @@ -2377,8 +2384,59 @@ func PrepareQuery(str string) string { str = strings.Replace(str, "_", " ", -1) - if eutils.HasPlusOrMinus(str) { - str = eutils.FixThemeCases(str) + hasPlusOrMinus := func(str string) bool { + + for _, ch := range str { + if ch == '-' || ch == '+' { + return true + } + } + + return false + } + + fixThemeCases := func(str string) string { + + if !strings.Contains(str, "[thme]") && !strings.Contains(str, "[conv]") { + return str + } + + var arry []string + + terms := strings.Fields(str) + + for _, item := range terms { + + switch item { + case "a+": + arry = append(arry, "ap") + case "e+": + arry = append(arry, "ep") + case "ec+": + arry = append(arry, "ecp") + case "eg+": + arry = append(arry, "egp") + case "v+": + arry = append(arry, "vp") + case "a-": + arry = append(arry, "am") + case "e-": + arry = append(arry, "em") + case "ec-": + arry = append(arry, "ecm") + default: + arry = append(arry, item) + } + } + + // reconstruct string from transformed words + str = strings.Join(arry, " ") + + return str + } + + if hasPlusOrMinus(str) { + 
str = fixThemeCases(str) } if eutils.HasHyphenOrApostrophe(str) { @@ -2407,7 +2465,7 @@ func PrepareQuery(str string) string { return tmp } -func PrepareExact(str string) string { +func prepareExact(str string) string { if str == "" { return "" @@ -2514,7 +2572,7 @@ func PrepareExact(str string) string { } // optional stop word removal - if eutils.DeStop() && eutils.IsStopWord(item) { + if deStop && eutils.IsStopWord(item) { chain = append(chain, "+") continue } @@ -2535,7 +2593,7 @@ func PrepareExact(str string) string { return tmp } -func ProcessStopWords(str string) string { +func processStopWords(str string) string { if str == "" { return "" @@ -2591,7 +2649,7 @@ func ProcessStopWords(str string) string { } // skip if stop word, breaking phrase chain - if eutils.DeStop() && eutils.IsStopWord(item) { + if deStop && eutils.IsStopWord(item) { chain = append(chain, "+") continue } @@ -2622,7 +2680,7 @@ func ProcessStopWords(str string) string { return tmp } -func PartitionQuery(str string) []string { +func partitionQuery(str string) []string { if str == "" { return nil @@ -2674,7 +2732,7 @@ func PartitionQuery(str string) []string { return tmp } -func SetFieldQualifiers(clauses []string, rlxd bool) []string { +func setFieldQualifiers(clauses []string, rlxd bool) []string { var res []string @@ -2708,7 +2766,7 @@ func SetFieldQualifiers(clauses []string, rlxd bool) []string { } // skip if stop word, breaking phrase chain - if eutils.DeStop() && eutils.IsStopWord(item) { + if deStop && eutils.IsStopWord(item) { chain = append(chain, "+") continue } @@ -2921,60 +2979,60 @@ func SetFieldQualifiers(clauses []string, rlxd bool) []string { // SEARCH TERM LISTS FOR PHRASES OR NORMALIZED TERMS, OR MATCH BY PATTERN -func ProcessSearch(base, phrase string, xact, rlxd bool) int { +func processSearch(base, phrase string, xact, rlxd bool) int { if phrase == "" { return 0 } if xact { - phrase = PrepareExact(phrase) + phrase = prepareExact(phrase) } else { - phrase = 
PrepareQuery(phrase) + phrase = prepareQuery(phrase) } - phrase = ProcessStopWords(phrase) + phrase = processStopWords(phrase) - clauses := PartitionQuery(phrase) + clauses := partitionQuery(phrase) - clauses = SetFieldQualifiers(clauses, rlxd) + clauses = setFieldQualifiers(clauses, rlxd) - return EvaluateQuery(base, clauses) + return evaluateQuery(base, clauses) } -func ProcessMock(base, phrase string, xact, rlxd bool) int { +func processMock(base, phrase string, xact, rlxd bool) int { if phrase == "" { return 0 } - fmt.Fprintf(os.Stdout, "ProcessSearch:\n\n%s\n\n", phrase) + fmt.Fprintf(os.Stdout, "processSearch:\n\n%s\n\n", phrase) if xact { - phrase = PrepareExact(phrase) + phrase = prepareExact(phrase) - fmt.Fprintf(os.Stdout, "PrepareExact:\n\n%s\n\n", phrase) + fmt.Fprintf(os.Stdout, "prepareExact:\n\n%s\n\n", phrase) } else { - phrase = PrepareQuery(phrase) + phrase = prepareQuery(phrase) - fmt.Fprintf(os.Stdout, "PrepareQuery:\n\n%s\n\n", phrase) + fmt.Fprintf(os.Stdout, "prepareQuery:\n\n%s\n\n", phrase) } - phrase = ProcessStopWords(phrase) + phrase = processStopWords(phrase) - fmt.Fprintf(os.Stdout, "ProcessStopWords:\n\n%s\n\n", phrase) + fmt.Fprintf(os.Stdout, "processStopWords:\n\n%s\n\n", phrase) - clauses := PartitionQuery(phrase) + clauses := partitionQuery(phrase) - fmt.Fprintf(os.Stdout, "PartitionQuery:\n\n") + fmt.Fprintf(os.Stdout, "partitionQuery:\n\n") for _, tkn := range clauses { fmt.Fprintf(os.Stdout, "%s\n", tkn) } fmt.Fprintf(os.Stdout, "\n") - clauses = SetFieldQualifiers(clauses, rlxd) + clauses = setFieldQualifiers(clauses, rlxd) - fmt.Fprintf(os.Stdout, "SetFieldQualifiers:\n\n") + fmt.Fprintf(os.Stdout, "setFieldQualifiers:\n\n") for _, tkn := range clauses { fmt.Fprintf(os.Stdout, "%s\n", tkn) } @@ -2983,19 +3041,19 @@ func ProcessMock(base, phrase string, xact, rlxd bool) int { return 0 } -func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int { +func processCount(base, phrase string, plrl, psns, rlxd bool) int { if 
phrase == "" { return 0 } - phrase = PrepareQuery(phrase) + phrase = prepareQuery(phrase) - phrase = ProcessStopWords(phrase) + phrase = processStopWords(phrase) - clauses := PartitionQuery(phrase) + clauses := partitionQuery(phrase) - clauses = SetFieldQualifiers(clauses, rlxd) + clauses = setFieldQualifiers(clauses, rlxd) if clauses == nil { return 0 @@ -3075,11 +3133,11 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int { term = strings.Replace(term, "_", " ", -1) if psns { - count += PrintTermPositions(base, term, field) + count += printTermPositions(base, term, field) } else if plrl { - count += PrintTermCounts(base, term, field) + count += printTermCounts(base, term, field) } else { - count += PrintTermCount(base, term, field) + count += printTermCount(base, term, field) } } } @@ -3106,7 +3164,7 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int { // processes with single goroutine call defer close(out) so consumer(s) can range over channel // processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out) -func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord { +func createUIDReader(in io.Reader) <-chan eutils.XMLRecord { if in == nil { return nil @@ -3139,7 +3197,7 @@ func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord { file = file[:pos] } - out <- eutils.XMLRecord{idx, "", file, nil} + out <- eutils.XMLRecord{Index: idx, Text: file} } } @@ -3149,7 +3207,7 @@ func CreateUIDReader(in io.Reader) <-chan eutils.XMLRecord { return out } -func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int, inp <-chan eutils.XMLRecord) <-chan string { +func createStashers(stash, parent, indx, sfx string, hash, zipp bool, report int, inp <-chan eutils.XMLRecord) <-chan string { if inp == nil { return nil @@ -3246,7 +3304,7 @@ func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int } var arry [132]rune - trie := MakeArchiveTrie(id, 
arry) + trie := makeArchiveTrie(id, arry) if trie == "" { return "" } @@ -3406,7 +3464,7 @@ func CreateStashers(stash, parent, indx, sfx string, hash, zipp bool, report int return out } -func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { +func createFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -3425,7 +3483,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) < fetchRecord := func(file string, buf bytes.Buffer) string { var arry [132]rune - trie := MakeArchiveTrie(file, arry) + trie := makeArchiveTrie(file, arry) if file == "" || trie == "" { return "" @@ -3499,7 +3557,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) < runtime.Gosched() - out <- eutils.XMLRecord{ext.Index, "", str, nil} + out <- eutils.XMLRecord{Index: ext.Index, Text: str} } } @@ -3520,7 +3578,7 @@ func CreateFetchers(stash, sfx string, zipp bool, inp <-chan eutils.XMLRecord) < return out } -func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { +func createStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -3537,7 +3595,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM getRecord := func(file string, buf bytes.Buffer) []byte { var arry [132]rune - trie := MakeArchiveTrie(file, arry) + trie := makeArchiveTrie(file, arry) if file == "" || trie == "" { return nil @@ -3586,7 +3644,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM runtime.Gosched() - out <- eutils.XMLRecord{ext.Index, "", "", data} + out <- eutils.XMLRecord{Index: ext.Index, Data: data} } } @@ -3607,7 +3665,7 @@ func CreateStreamers(stash string, inp <-chan eutils.XMLRecord) <-chan eutils.XM return out } -func CreateDispensers(inp <-chan eutils.XMLRecord) <-chan []string { +func 
createDispensers(inp <-chan eutils.XMLRecord) <-chan []string { if inp == nil { return nil @@ -3726,7 +3784,7 @@ func CreateDispensers(inp <-chan eutils.XMLRecord) <-chan []string { return out } -func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord { +func createInverters(inp <-chan []string) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -3845,7 +3903,7 @@ func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord { str := printPosting(key, data) - out <- eutils.XMLRecord{0, key, str, nil} + out <- eutils.XMLRecord{Ident: key, Text: str} runtime.Gosched() } @@ -3868,7 +3926,7 @@ func CreateInverters(inp <-chan []string) <-chan eutils.XMLRecord { return out } -func CreateResolver(inp <-chan eutils.XMLRecord) <-chan string { +func createResolver(inp <-chan eutils.XMLRecord) <-chan string { if inp == nil { return nil @@ -3923,6 +3981,7 @@ func CreateResolver(inp <-chan eutils.XMLRecord) <-chan string { return out } +// Plex allows distribution of indexing type Plex struct { Which int Ident string @@ -3931,22 +3990,22 @@ type Plex struct { Sibs []string } -type PlexHeap []Plex +type plexHeap []Plex // methods that satisfy heap.Interface -func (h PlexHeap) Len() int { +func (h plexHeap) Len() int { return len(h) } -func (h PlexHeap) Less(i, j int) bool { +func (h plexHeap) Less(i, j int) bool { return h[i].Ident < h[j].Ident } -func (h PlexHeap) Swap(i, j int) { +func (h plexHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } -func (h *PlexHeap) Push(x interface{}) { +func (h *plexHeap) Push(x interface{}) { *h = append(*h, x.(Plex)) } -func (h *PlexHeap) Pop() interface{} { +func (h *plexHeap) Pop() interface{} { old := *h n := len(old) x := old[n-1] @@ -3954,7 +4013,7 @@ func (h *PlexHeap) Pop() interface{} { return x } -func CreatePresenters(args []string) []<-chan Plex { +func createPresenters(args []string) []<-chan Plex { if args == nil { return nil @@ -4053,7 +4112,7 @@ func CreatePresenters(args []string) []<-chan Plex { return chns } 
-func CreateManifold(inp []<-chan Plex) <-chan Plex { +func createManifold(inp []<-chan Plex) <-chan Plex { if inp == nil { return nil @@ -4072,7 +4131,7 @@ func CreateManifold(inp []<-chan Plex) <-chan Plex { defer close(out) // initialize empty heap - hp := &PlexHeap{} + hp := &plexHeap{} heap.Init(hp) // read first object from all input channels in turn @@ -4146,7 +4205,7 @@ func CreateManifold(inp []<-chan Plex) <-chan Plex { return out } -func CreateFusers(inp <-chan eutils.XMLRecord) <-chan Plex { +func createFusers(inp <-chan eutils.XMLRecord) <-chan Plex { if inp == nil { return nil @@ -4235,7 +4294,7 @@ func CreateFusers(inp <-chan eutils.XMLRecord) <-chan Plex { return out } -func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord { +func createMergers(inp <-chan Plex) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -4368,7 +4427,7 @@ func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord { str := fusePostings(key, data) - out <- eutils.XMLRecord{rec, key, str, nil} + out <- eutils.XMLRecord{Index: rec, Ident: key, Text: str} runtime.Gosched() } @@ -4391,7 +4450,7 @@ func CreateMergers(inp <-chan Plex) <-chan eutils.XMLRecord { return out } -func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan string { +func createSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan string { if inp == nil { return nil @@ -4489,7 +4548,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan for curr := range inp { // use first few characters of identifier - currTag = IdentifierKey(curr.Ident) + currTag = identifierKey(curr.Ident) if currTag == "" { continue } @@ -4531,7 +4590,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan // compare keys from adjacent term lists if prev.Text != "" && prevTag != currTag { - // after IdentifierKey converts space to underscore, + // after identifierKey converts space to underscore, // okay that x_ and x0 will be out 
of alphabetical order // send closing tag @@ -4594,7 +4653,7 @@ func CreateSplitter(merg string, zipp bool, inp <-chan eutils.XMLRecord) <-chan return out } -func CreatePromoters(args []string, prom, field string) <-chan string { +func createPromoters(args []string, prom, field string) <-chan string { if args == nil { return nil @@ -4815,7 +4874,7 @@ func CreatePromoters(args []string, prom, field string) <-chan string { writeFiveFiles := func(key string) { var arry [516]rune - dpath, key := PostingPath(prom, field, key, arry) + dpath, key := postingPath(prom, field, key, arry) if dpath == "" { return } @@ -4859,11 +4918,11 @@ func CreatePromoters(args []string, prom, field string) <-chan string { ok = true // use first few characters of identifier - currTag = IdentifierKey(term) + currTag = identifierKey(term) if prevTag != currTag { - // after IdentifierKey converts space to underscore, + // after identifierKey converts space to underscore, // okay that xxx_ and xxx0 will be out of alphabetical order // directory prefix changed from last posting @@ -4917,7 +4976,7 @@ func CreatePromoters(args []string, prom, field string) <-chan string { return out } -func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { +func createMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -4928,13 +4987,13 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch return inp } - phrs = PrepareQuery(phrs) + phrs = prepareQuery(phrs) - phrs = ProcessStopWords(phrs) + phrs = processStopWords(phrs) - clauses := PartitionQuery(phrs) + clauses := partitionQuery(phrs) - clauses = SetFieldQualifiers(clauses, false) + clauses = setFieldQualifiers(clauses, false) if clauses == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse phrase\n") @@ -5011,13 +5070,13 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch } // skip 
if stop word, breaking word pair chain - if eutils.DeStop() && eutils.IsStopWord(item) { + if deStop && eutils.IsStopWord(item) { chain = append(chain, "+") continue } // apply stemming algorithm - if eutils.DoStem() { + if doStem { isWildCard := strings.HasSuffix(item, "*") if isWildCard { // temporarily remove trailing asterisk @@ -5202,7 +5261,7 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch if text == "" { // should never see empty input data - out <- eutils.XMLRecord{idx, "", text, nil} + out <- eutils.XMLRecord{Index: idx, Text: text} continue } @@ -5212,12 +5271,12 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch if exclude != ok { // send text of record if phrase match succeeded with -require, or failed with -exclude - out <- eutils.XMLRecord{idx, "", text, nil} + out <- eutils.XMLRecord{Index: idx, Text: text} continue } // otherwise send empty text so unshuffler does not have to deal with record index gaps - out <- eutils.XMLRecord{idx, "", "", nil} + out <- eutils.XMLRecord{Index: idx} } } @@ -5238,7 +5297,7 @@ func CreateMatchers(phrs string, exclude bool, inp <-chan eutils.XMLRecord) <-ch return out } -func CreateExternalIndexer(args []string, zipp bool, in io.Reader) int { +func createExternalIndexer(args []string, zipp bool, in io.Reader) int { recordCount := 0 @@ -6193,9 +6252,9 @@ func CreateExternalIndexer(args []string, zipp bool, in io.Reader) int { return 0 } -func CreateExternalArchive(stash string, args []string) <-chan string { +func createExternalArchive(stash string, args []string) <-chan string { - createPresenters := func(args []string) []<-chan Plex { + makePresenters := func(args []string) []<-chan Plex { if args == nil { return nil @@ -6294,7 +6353,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string { return chns } - createManifold := func(inp []<-chan Plex) <-chan Plex { + makeManifold := func(inp []<-chan Plex) <-chan Plex { if inp == nil 
{ return nil @@ -6313,7 +6372,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string { defer close(out) // initialize empty heap - hp := &PlexHeap{} + hp := &plexHeap{} heap.Init(hp) // read first object from all input channels in turn @@ -6385,7 +6444,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string { return out } - createMergers := func(inp <-chan Plex) <-chan eutils.XMLRecord { + makeMergers := func(inp <-chan Plex) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -6518,7 +6577,7 @@ func CreateExternalArchive(stash string, args []string) <-chan string { str := fusePostings(key, data) - out <- eutils.XMLRecord{rec, key, str, nil} + out <- eutils.XMLRecord{Index: rec, Ident: key, Text: str} runtime.Gosched() } @@ -6541,10 +6600,10 @@ func CreateExternalArchive(stash string, args []string) <-chan string { return out } - chns := createPresenters(args) - mfld := createManifold(chns) - mrgr := createMergers(mfld) - stsq := CreateStashers(stash, "IdxDocument", "IdxDocument/IdxUid", ".e2x", false, true, 50000, mrgr) + chns := makePresenters(args) + mfld := makeManifold(chns) + mrgr := makeMergers(mfld) + stsq := createStashers(stash, "IdxDocument", "IdxDocument/IdxUid", ".e2x", false, true, 50000, mrgr) if chns == nil || mfld == nil || mrgr == nil || stsq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create extra index stasher\n") @@ -6580,8 +6639,8 @@ func main() { doMixed := false deAccent := false doASCII := false - doStem := false - deStop := false + doStem = false + deStop = true // CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS @@ -6880,7 +6939,7 @@ func main() { case "-stems", "-stem": doStem = true case "-stops", "-stop": - deStop = true + deStop = false case "-unicode": // DoUnicode = true @@ -6962,7 +7021,7 @@ func main() { case "stems", "stem": doStem = true case "stops", "stop": - deStop = true + deStop = false case "none", "default": default: if flgs != "" { @@ -7005,7 +7064,7 @@ func main() { 
eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) - eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup, doStem, deStop, false) + eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup) // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) if stts && len(args) < 1 { @@ -7184,7 +7243,7 @@ func main() { if len(args) > 0 { switch args[0] { case "-bioconcepts", "-generif", "-generifs": - recordCount = CreateExternalIndexer(args, zipp, in) + recordCount = createExternalIndexer(args, zipp, in) debug.FreeOSMemory() @@ -7194,7 +7253,7 @@ func main() { return case "-theme", "-themes", "-dpath", "-dpaths", "-thesis": - recordCount = CreateExternalIndexer(args, zipp, in) + recordCount = createExternalIndexer(args, zipp, in) debug.FreeOSMemory() @@ -7229,7 +7288,7 @@ func main() { // remaining arguments are *.e2x files // e.g., rchive -timer -distribute archive_directory *.e2x args = args[1:] - stsq := CreateExternalArchive(path, args) + stsq := createExternalArchive(path, args) if stsq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create extra index stasher\n") @@ -7294,9 +7353,9 @@ func main() { } } - chns := CreatePresenters(args) - mfld := CreateManifold(chns) - mrgr := CreateMergers(mfld) + chns := createPresenters(args) + mfld := createManifold(chns) + mrgr := createMergers(mfld) unsq := eutils.CreateXMLUnshuffler(mrgr) if chns == nil || mfld == nil || mrgr == nil || unsq == nil { @@ -7424,11 +7483,11 @@ func main() { } } - chns := CreatePresenters(args) - mfld := CreateManifold(chns) - mrgr := CreateMergers(mfld) + chns := createPresenters(args) + mfld := createManifold(chns) + mrgr := createMergers(mfld) unsq := eutils.CreateXMLUnshuffler(mrgr) - sptr := CreateSplitter(merg, zipp, unsq) + sptr := createSplitter(merg, zipp, unsq) if chns == nil || mfld == nil || mrgr == nil || unsq == nil || sptr == nil { fmt.Fprintf(os.Stderr, 
"\nERROR: Unable to create inverted index merger\n") @@ -7515,7 +7574,7 @@ func main() { if prom != "" && fild != "" { - prmq := CreatePromoters(args, prom, fild) + prmq := createPromoters(args, prom, fild) if prmq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create new postings file generator\n") @@ -7564,7 +7623,7 @@ func main() { txt := scanr.Text() // deStop should match value used in building the indices - recordCount += ProcessSearch(base, txt, true, false) + recordCount += processSearch(base, txt, true, false) } debug.FreeOSMemory() @@ -7580,9 +7639,9 @@ func main() { // deStop should match value used in building the indices if mock { - recordCount = ProcessMock(base, phrs, xact, rlxd) + recordCount = processMock(base, phrs, xact, rlxd) } else { - recordCount = ProcessSearch(base, phrs, xact, rlxd) + recordCount = processSearch(base, phrs, xact, rlxd) } debug.FreeOSMemory() @@ -7597,7 +7656,7 @@ func main() { if base != "" && trms != "" { // deStop should match value used in building the indices - recordCount = ProcessCount(base, trms, plrl, psns, rlxd) + recordCount = processCount(base, trms, plrl, psns, rlxd) debug.FreeOSMemory() @@ -7698,7 +7757,7 @@ func main() { file := scanr.Text() var arry [132]rune - trie := MakeArchiveTrie(file, arry) + trie := makeArchiveTrie(file, arry) if trie == "" || file == "" { continue } @@ -7739,7 +7798,7 @@ func main() { } var arry [132]rune - trie := MakeArchiveTrie(file, arry) + trie := makeArchiveTrie(file, arry) if file == "" || trie == "" { continue @@ -7805,7 +7864,7 @@ func main() { } var arry [132]rune - trie := MakeArchiveTrie(file, arry) + trie := makeArchiveTrie(file, arry) if file == "" || trie == "" { continue @@ -7910,8 +7969,8 @@ func main() { // -fetch without -index retrieves XML files in trie-based directory structure if ftch != "" && indx == "" { - uidq := CreateUIDReader(in) - strq := CreateFetchers(ftch, ".xml", zipp, uidq) + uidq := createUIDReader(in) + strq := createFetchers(ftch, ".xml", 
zipp, uidq) unsq := eutils.CreateXMLUnshuffler(strq) if uidq == nil || strq == nil || unsq == nil { @@ -7980,8 +8039,8 @@ func main() { // -stream without -index retrieves compressed XML files in trie-based directory structure if strm != "" && indx == "" { - uidq := CreateUIDReader(in) - strq := CreateStreamers(strm, uidq) + uidq := createUIDReader(in) + strq := createStreamers(strm, uidq) unsq := eutils.CreateXMLUnshuffler(strq) if uidq == nil || strq == nil || unsq == nil { @@ -8019,8 +8078,8 @@ func main() { // -summon retrieves link files in trie-based directory structure if smmn != "" && indx == "" { - uidq := CreateUIDReader(in) - strq := CreateFetchers(smmn, ".e2x", zipp, uidq) + uidq := createUIDReader(in) + strq := createFetchers(smmn, ".e2x", zipp, uidq) unsq := eutils.CreateXMLUnshuffler(strq) if uidq == nil || strq == nil || unsq == nil { @@ -8127,9 +8186,9 @@ func main() { } colq := eutils.CreateXMLProducer("IdxDocument", "", rdr) - dspq := CreateDispensers(colq) - invq := CreateInverters(dspq) - rslq := CreateResolver(invq) + dspq := createDispensers(colq) + invq := createInverters(dspq) + rslq := createResolver(invq) if colq == nil || dspq == nil || invq == nil || rslq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create inverter\n") @@ -8251,8 +8310,8 @@ func main() { } chns := eutils.CreateXMLProducer("InvDocument", "", rdr) - fusr := CreateFusers(chns) - mrgr := CreateMergers(fusr) + fusr := createFusers(chns) + mrgr := createMergers(fusr) unsq := eutils.CreateXMLUnshuffler(mrgr) if chns == nil || fusr == nil || mrgr == nil || unsq == nil { @@ -8393,7 +8452,7 @@ func main() { } xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) - mchq := CreateMatchers(phrs, exclude, xmlq) + mchq := createMatchers(phrs, exclude, xmlq) unsq := eutils.CreateXMLUnshuffler(mchq) if xmlq == nil || mchq == nil || unsq == nil { @@ -8471,7 +8530,7 @@ func main() { id = id[:idlen-2] } - ReportEncodedMarkup(dmgdType, id, str) + reportEncodedMarkup(dmgdType, id, 
str) }) if timr { @@ -8517,7 +8576,7 @@ func main() { } var arry [132]rune - trie := MakeArchiveTrie(id, arry) + trie := makeArchiveTrie(id, arry) if id == "" || trie == "" { return @@ -8636,7 +8695,7 @@ func main() { if stsh != "" && indx != "" { xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) - stsq := CreateStashers(stsh, parent, indx, ".xml", hshv, zipp, 1000, xmlq) + stsq := createStashers(stsh, parent, indx, ".xml", hshv, zipp, 1000, xmlq) if xmlq == nil || stsq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create stash generator\n") diff --git a/cmd/transmute.go b/cmd/transmute.go index 603e498..85fd2e5 100644 --- a/cmd/transmute.go +++ b/cmd/transmute.go @@ -31,7 +31,6 @@ package main import ( - "bufio" "encoding/base64" "eutils" "fmt" @@ -47,7 +46,6 @@ import ( "strings" "sync" "unicode" - "unicode/utf8" ) // TRANSMUTE HELP MESSAGE TEXT @@ -337,8 +335,8 @@ Mismatch Detection (RefSeq Proteins with 3 Residue Differences from RefSeq Genom // XML FORMATTING FUNCTIONS -// CreateFormatters does concurrent reformatting, using flush-left to remove leading spaces -func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { +// createFormatters does concurrent reformatting, using flush-left to remove leading spaces +func createFormatters(parent string, format string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -368,7 +366,7 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord) if text == "" { // should never see empty input data - out <- eutils.XMLRecord{idx, "", text, nil} + out <- eutils.XMLRecord{Index: idx, Text: text} continue } @@ -378,7 +376,7 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord) str := eutils.ChanToString(frm) // send even if empty to get all record counts for reordering - out <- eutils.XMLRecord{idx, "", str, nil} + out <- eutils.XMLRecord{Index: idx, Text: str} } } @@ -399,8 
+397,8 @@ func CreateFormatters(parent string, format string, inp <-chan eutils.XMLRecord) return out } -// ProcessFormat reformats XML for ease of reading -func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) { +// processFormat reformats XML for ease of reading +func processFormat(rdr <-chan eutils.XMLBlock, args []string) { if rdr == nil || args == nil { return @@ -437,12 +435,21 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) { switch args[0] { case "-xml": - xml = eutils.GetStringArg(args, "-xml argument") - args = args[2:] + args = args[1:] + // -xml argument must be followed by value to use in xml line + if len(args) < 1 || strings.HasPrefix(args[0], "-") { + fmt.Fprintf(os.Stderr, "\nERROR: -xml argument is missing\n") + os.Exit(1) + } + xml = args[0] + args = args[1:] case "-doctype": - doctype = eutils.GetStringArg(args, "-doctype argument") - args = args[2:] - + args = args[1:] + if len(args) > 0 { + // if -doctype argument followed by value, use instead of DOCTYPE line + doctype = args[0] + args = args[1:] + } /* // allow setting of unicode, script, and mathml flags within -format case "-unicode": @@ -487,7 +494,7 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) { } } - tknq := eutils.CreateTokenizer("", "", rdr) + tknq := eutils.CreateTokenizer(rdr) frgs := eutils.FormatArgs{ Format: format, XML: xml, Doctype: doctype, @@ -499,14 +506,14 @@ func ProcessFormat(rdr <-chan eutils.XMLBlock, args []string) { eutils.ChanToStdout(frm) } -// ProcessTokens shows individual tokens in stream (undocumented) -func ProcessTokens(rdr <-chan eutils.XMLBlock) { +// processTokens shows individual tokens in stream (undocumented) +func processTokens(rdr <-chan eutils.XMLBlock) { if rdr == nil { return } - tknq := eutils.CreateTokenizer("", "", rdr) + tknq := eutils.CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create debug tokenizer\n") @@ -650,14 +657,14 @@ func ProcessTokens(rdr <-chan 
eutils.XMLBlock) { } } -// ProcessOutline displays outline of XML structure -func ProcessOutline(rdr <-chan eutils.XMLBlock) { +// processOutline displays outline of XML structure +func processOutline(rdr <-chan eutils.XMLBlock) { if rdr == nil { return } - tknq := eutils.CreateTokenizer("", "", rdr) + tknq := eutils.CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create outline tokenizer\n") @@ -725,14 +732,14 @@ func ProcessOutline(rdr <-chan eutils.XMLBlock) { } } -// ProcessSynopsis displays paths to XML elements -func ProcessSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) { +// processSynopsis displays paths to XML elements +func processSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) { if rdr == nil { return } - tknq := eutils.CreateTokenizer("", "", rdr) + tknq := eutils.CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create synopsis tokenizer\n") @@ -840,14 +847,14 @@ func ProcessSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) { } } -// ProcessFilter modifies XML content, comments, or CDATA -func ProcessFilter(rdr <-chan eutils.XMLBlock, args []string) { +// processFilter modifies XML content, comments, or CDATA +func processFilter(rdr <-chan eutils.XMLBlock, args []string) { if rdr == nil || args == nil { return } - tknq := eutils.CreateTokenizer("", "", rdr) + tknq := eutils.CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create filter tokenizer\n") @@ -1155,7 +1162,7 @@ func ProcessFilter(rdr <-chan eutils.XMLBlock, args []string) { // STRING CONVERTERS -func EncodeURL(inp io.Reader) { +func encodeURL(inp io.Reader) { if inp == nil { return @@ -1173,7 +1180,7 @@ func EncodeURL(inp io.Reader) { } } -func DecodeURL(inp io.Reader) { +func decodeURL(inp io.Reader) { if inp == nil { return @@ -1191,7 +1198,7 @@ func DecodeURL(inp io.Reader) { } } -func EncodeB64(inp io.Reader) { +func encodeB64(inp io.Reader) { if inp == 
nil { return @@ -1207,7 +1214,7 @@ func EncodeB64(inp io.Reader) { } } -func DecodeB64(inp io.Reader) { +func decodeB64(inp io.Reader) { if inp == nil { return @@ -1224,7 +1231,7 @@ func DecodeB64(inp io.Reader) { } } -func DecodeHGVS(inp io.Reader) { +func decodeHGVS(inp io.Reader) { if inp == nil { return @@ -1243,8 +1250,8 @@ func DecodeHGVS(inp io.Reader) { // COLUMN ALIGNMENT FORMATTER -// ProcessAlign aligns a tab-delimited table by individual column widths -func ProcessAlign(inp io.Reader, args []string) { +// processAlign aligns a tab-delimited table by individual column widths +func processAlign(inp io.Reader, args []string) { // tab-delimited-table to padded-by-spaces alignment inspired by // Steve Kinzler's align script - see http://kinzler.com/me/align/ @@ -1253,13 +1260,9 @@ func ProcessAlign(inp io.Reader, args []string) { return } - spcs := " " - - mrg := "" - pad := " " - - lettrs := make(map[int]rune) - lst := 'l' + mrg := 0 + pdg := 0 + aln := "" // skip past command name args = args[1:] @@ -1268,19 +1271,13 @@ func ProcessAlign(inp io.Reader, args []string) { switch args[0] { case "-g": - val := eutils.GetNumericArg(args, "-g spacing between columns", 0, 1, 30) - pad = spcs[0:val] + pdg = eutils.GetNumericArg(args, "-g spacing between columns", 0, 1, 30) args = args[2:] case "-h": - val := eutils.GetNumericArg(args, "-i indent before columns", 0, 1, 30) - mrg = spcs[0:val] + mrg = eutils.GetNumericArg(args, "-i indent before columns", 0, 1, 30) args = args[2:] case "-a": - val := eutils.GetStringArg(args, "-a column alignment code string") - for i, ch := range val { - lettrs[i] = ch - lst = ch - } + aln = eutils.GetStringArg(args, "-a column alignment code string") args = args[2:] default: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -align command\n") @@ -1288,232 +1285,21 @@ func ProcessAlign(inp io.Reader, args []string) { } } - var arry []string - - width := make(map[int]int) - whole := make(map[int]int) - fract := 
make(map[int]int) - - scanr := bufio.NewScanner(inp) - - row := 0 - numCols := 0 - - // allows leading plus or minus, digits interspersed with optional commas, decimal point, and digits - isNumeric := func(str string) bool { + algn := eutils.AlignColumns(inp, mrg, pdg, aln) - has_num := false - has_period := false - - for i, ch := range str { - switch ch { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - has_num = true - case '+', '-': - if i > 0 { - return false - } - case '.': - has_period = true - case ',': - if has_period { - return false - } - default: - return false - } - } - - return has_num - } - - processLine := func(line string) string { - - var flds []string - - cols := strings.Split(line, "\t") - if numCols == 0 { - numCols = len(cols) - } else if numCols != len(cols) { - fmt.Fprintf(os.Stderr, "ERROR: Mismatched number of columns in row ") - fmt.Fprintf(os.Stderr, strconv.Itoa(row)) - fmt.Fprintf(os.Stderr, ": actual ") - fmt.Fprintf(os.Stderr, strconv.Itoa(len(cols))) - fmt.Fprintf(os.Stderr, ", expected ") - fmt.Fprintf(os.Stderr, strconv.Itoa(numCols)) - fmt.Fprintf(os.Stderr, "\n") - // os.Exit(1) - } - - for i, str := range cols { - - str = eutils.CompressRunsOfSpaces(str) - str = strings.TrimSpace(str) - - flds = append(flds, str) - - // determine maximum length in each column - ln := utf8.RuneCountInString(str) - if ln > width[i] { - width[i] = ln - } - - code, ok := lettrs[i] - if !ok { - code = lst - } - - switch code { - case 'n', 'N', 'z', 'Z': - if isNumeric(str) { - // determine maximum length of decimal number parts - wh, fr := eutils.SplitInTwoLeft(str, ".") - if fr != "" { - fr = "." 
+ fr - } - - lf := utf8.RuneCountInString(wh) - if lf > whole[i] { - whole[i] = lf - } - rt := utf8.RuneCountInString(fr) - if rt > fract[i] { - fract[i] = rt - } - ln = whole[i] + fract[i] - if ln > width[i] { - width[i] = ln - } - } - } - } - - return strings.Join(flds, "\t") - } - - for i := 0; i < numCols; i++ { - - code, ok := lettrs[i] - if !ok { - code = lst - } - - switch code { - case 'n', 'N', 'z', 'Z': - // adjust maximum widths with aligned decimal points - ln := whole[i] + fract[i] - if ln > width[i] { - width[i] = ln - } - } - } - - // clean up spaces, calculate column widths - for scanr.Scan() { - - row++ - line := scanr.Text() - if line == "" { - continue - } - - line = processLine(line) - arry = append(arry, line) + if algn == nil { + fmt.Fprintf(os.Stderr, "\nERROR: Unable to create alignment function\n") + os.Exit(1) } - var buffer strings.Builder - - for _, line := range arry { - - buffer.Reset() + eutils.ChanToStdout(algn) - cols := strings.Split(line, "\t") - - btwn := mrg - for i, str := range cols { - - buffer.WriteString(btwn) - - code, ok := lettrs[i] - if !ok { - code = lst - } - - ln := utf8.RuneCountInString(str) - mx := width[i] - diff := mx - ln - lft := 0 - rgt := 0 - lft_pad := " " - rgt_pad := " " - - if diff > 0 { - switch code { - case 'l': - rgt = diff - case 'c': - lft = diff / 2 - rgt = diff - lft - case 'r': - lft = diff - case 'n', 'N', 'z', 'Z': - lft = diff - if isNumeric(str) { - switch code { - case 'N': - rgt_pad = "0" - case 'z': - lft_pad = "0" - case 'Z': - lft_pad = "0" - rgt_pad = "0" - } - sn := whole[i] - rc := fract[i] - wh, fr := eutils.SplitInTwoLeft(str, ".") - if fract[i] > 0 { - if fr == "" { - fr = "." - } else { - fr = "." 
+ fr - } - lf := utf8.RuneCountInString(wh) - lft = sn - lf - rt := utf8.RuneCountInString(fr) - rgt = rc - rt - str = wh + fr - } - } - default: - rgt = diff - } - } - - for lft > 0 { - lft-- - buffer.WriteString(lft_pad) - } - - buffer.WriteString(str) - btwn = pad - - for rgt > 0 { - rgt-- - buffer.WriteString(rgt_pad) - } - } - - txt := buffer.String() - txt = strings.TrimRight(txt, " ") - - os.Stdout.WriteString(txt) - os.Stdout.WriteString("\n") - } + return } // SEQUENCE EDITING -func SequenceRemove(inp io.Reader, args []string) { +func sequenceRemove(inp io.Reader, args []string) { if inp == nil { return @@ -1552,7 +1338,7 @@ func SequenceRemove(inp io.Reader, args []string) { } } -func SequenceRetain(inp io.Reader, args []string) { +func sequenceRetain(inp io.Reader, args []string) { if inp == nil { return @@ -1589,7 +1375,7 @@ func SequenceRetain(inp io.Reader, args []string) { } } -func SequenceReplace(inp io.Reader, args []string) { +func sequenceReplace(inp io.Reader, args []string) { if inp == nil { return @@ -1636,7 +1422,7 @@ func SequenceReplace(inp io.Reader, args []string) { } } -func SequenceExtract(inp io.Reader, args []string) { +func sequenceExtract(inp io.Reader, args []string) { if inp == nil { return @@ -1665,8 +1451,8 @@ func SequenceExtract(inp io.Reader, args []string) { // REVERSE SEQUENCE -// SeqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order -func SeqFlip(inp io.Reader) { +// seqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order +func seqFlip(inp io.Reader) { if inp == nil { return @@ -1684,7 +1470,7 @@ func SeqFlip(inp io.Reader) { // REVERSE COMPLEMENT -func NucRevComp(inp io.Reader) { +func nucRevComp(inp io.Reader) { if inp == nil { return @@ -1702,7 +1488,7 @@ func NucRevComp(inp io.Reader) { // FASTA DIFFERENCES -func PrintFastaPairs(frst, scnd string) { +func printFastaPairs(frst, scnd string) { frst = strings.ToLower(frst) scnd = 
strings.ToLower(scnd) @@ -1782,7 +1568,7 @@ func PrintFastaPairs(frst, scnd string) { } } -func FastaDiff(inp io.Reader, args []string) { +func fastaDiff(inp io.Reader, args []string) { if inp == nil { return @@ -1807,18 +1593,18 @@ func FastaDiff(inp io.Reader, args []string) { } // sequences are assumed to be aligned, this code highlight mismatches - PrintFastaPairs(frstFasta, scndFasta) + printFastaPairs(frstFasta, scndFasta) } // PROTEIN WEIGHT -func ProtWeight(inp io.Reader, args []string) { +func protWeight(inp io.Reader, args []string) { if inp == nil { return } - trim_leading_met := true + trimLeadingMet := true // skip past command name args = args[1:] @@ -1827,7 +1613,7 @@ func ProtWeight(inp io.Reader, args []string) { switch args[0] { case "-met": - trim_leading_met = false + trimLeadingMet = false args = args[1:] default: fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -molwt command\n") @@ -1837,7 +1623,7 @@ func ProtWeight(inp io.Reader, args []string) { str := eutils.ReadAllIntoSequence(inp) - str = eutils.ProteinWeight(str, trim_leading_met) + str = eutils.ProteinWeight(str, trimLeadingMet) os.Stdout.WriteString(str) if !strings.HasSuffix(str, "\n") { @@ -1845,8 +1631,8 @@ func ProtWeight(inp io.Reader, args []string) { } } -// CdRegionToProtein reads all of stdin as sequence data -func CdRegionToProtein(inp io.Reader, args []string) { +// cdRegionToProtein reads all of stdin as sequence data +func cdRegionToProtein(inp io.Reader, args []string) { if inp == nil { return @@ -1854,11 +1640,11 @@ func CdRegionToProtein(inp io.Reader, args []string) { genCode := 1 frame := 0 - include_stop := false - do_every_codon := false - remove_trailing_X := false - is_5prime_complete := true - is_3prime_complete := true + includeStop := false + doEveryCodon := false + removeTrailingX := false + is5primeComplete := true + is3primeComplete := true repeat := 1 @@ -1875,19 +1661,19 @@ func CdRegionToProtein(inp io.Reader, args []string) { frame = 
eutils.GetNumericArg(args, "offset into coding sequence", 0, 1, 30) args = args[2:] case "-stop", "-stops": - include_stop = true + includeStop = true args = args[1:] case "-every", "-all": - do_every_codon = true + doEveryCodon = true args = args[1:] case "-trim", "-trailing": - remove_trailing_X = true + removeTrailingX = true args = args[1:] case "-part5", "-partial5", "-lt5": - is_5prime_complete = false + is5primeComplete = false args = args[1:] case "-part3", "-partial3", "-gt3": - is_3prime_complete = false + is3primeComplete = false args = args[1:] case "-repeat": repeat = eutils.GetNumericArg(args, "number of repetitions for testing", 1, 1, 100) @@ -1903,7 +1689,7 @@ func CdRegionToProtein(inp io.Reader, args []string) { for i := 0; i < repeat; i++ { // repeat multiple times for performance testing (undocumented) - str := eutils.TranslateCdRegion(txt, genCode, frame, include_stop, do_every_codon, remove_trailing_X, is_5prime_complete, is_3prime_complete) + str := eutils.TranslateCdRegion(txt, genCode, frame, includeStop, doEveryCodon, removeTrailingX, is5primeComplete, is3primeComplete) os.Stdout.WriteString(str) if !strings.HasSuffix(str, "\n") { @@ -1938,8 +1724,6 @@ func main() { doMixed := false deAccent := false doASCII := false - doStem := false - deStop := false /* doUnicode := false @@ -2048,9 +1832,9 @@ func main() { // previously visible processing flags (undocumented) case "-stems", "-stem": - doStem = true + // ignore case "-stops", "-stop": - deStop = true + // ignore // allow setting of unicode, script, and mathml flags (undocumented) case "-unicode": @@ -2115,9 +1899,9 @@ func main() { case "mixed": doMixed = true case "stems", "stem": - doStem = true + // ignore case "stops", "stop": - deStop = true + // ignore case "none", "default": default: if flgs != "" { @@ -2160,7 +1944,7 @@ func main() { eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) - eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, 
doCompress, doCleanup, doStem, deStop, false) + eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup) // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) if stts && len(args) < 1 { @@ -2279,6 +2063,9 @@ func main() { return nxt, true } + // The several converter functions that follow must be called + // before CreateXMLStreamer starts draining stdin + // JSON TO XML CONVERTER if args[0] == "-j2x" || args[0] == "-json2xml" { @@ -2441,7 +2228,6 @@ func main() { // READ TAB-DELIMITED FILE AND WRAP IN XML FIELDS - // must be called before CreateXMLStreamer starts draining stdin doTable := func(delim string) { // skip past command name @@ -2567,7 +2353,6 @@ func main() { // READ GENBANK FLATFILE AND TRANSLATE TO INSDSEQ XML - // must be called before CreateXMLStreamer starts draining stdin if len(args) > 0 && args[0] == "-g2x" { gbk := eutils.GenBankConverter(in) @@ -2627,35 +2412,35 @@ func main() { switch args[0] { case "-encodeURL": - EncodeURL(in) + encodeURL(in) case "-decodeURL": - DecodeURL(in) + decodeURL(in) case "-encode64", "-encodeB64", "-encodeBase64": - EncodeB64(in) + encodeB64(in) case "-decode64", "-decodeB64", "-decodeBase64": - DecodeB64(in) + decodeB64(in) case "-hgvs": - DecodeHGVS(in) + decodeHGVS(in) case "-align": - ProcessAlign(in, args) + processAlign(in, args) case "-remove": - SequenceRemove(in, args) + sequenceRemove(in, args) case "-retain": - SequenceRetain(in, args) + sequenceRetain(in, args) case "-replace": - SequenceReplace(in, args) + sequenceReplace(in, args) case "-extract": - SequenceExtract(in, args) + sequenceExtract(in, args) case "-revcomp": - NucRevComp(in) + nucRevComp(in) case "-reverse": - SeqFlip(in) + seqFlip(in) case "-molwt": - ProtWeight(in, args) + protWeight(in, args) case "-cds2prot": - CdRegionToProtein(in, args) + cdRegionToProtein(in, args) case "-diff": - FastaDiff(in, args) + fastaDiff(in, args) default: // if not any of the conversion 
commands, keep going inSwitch = false @@ -2701,9 +2486,9 @@ func main() { switch args[0] { case "-format": - ProcessFormat(rdr, args) + processFormat(rdr, args) case "-filter": - ProcessFilter(rdr, args) + processFilter(rdr, args) case "-normalize", "-normal": if len(args) < 2 { fmt.Fprintf(os.Stderr, "\nERROR: No database supplied to -normalize\n") @@ -2713,7 +2498,7 @@ func main() { nrm := eutils.NormalizeXML(rdr, db) eutils.ChanToStdout(nrm) case "-outline": - ProcessOutline(rdr) + processOutline(rdr) case "-contour": leaf = true fallthrough @@ -2726,9 +2511,9 @@ func main() { delim = "/" } } - ProcessSynopsis(rdr, leaf, delim) + processSynopsis(rdr, leaf, delim) case "-tokens": - ProcessTokens(rdr) + processTokens(rdr) default: // if not any of the formatting commands, keep going inSwitch = false @@ -2893,7 +2678,7 @@ func main() { } xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) - fchq := CreateFormatters(topPattern, format, xmlq) + fchq := createFormatters(topPattern, format, xmlq) unsq := eutils.CreateXMLUnshuffler(fchq) if xmlq == nil || fchq == nil || unsq == nil { diff --git a/cmd/xtract.go b/cmd/xtract.go index 45e6cf9..b06bcbe 100644 --- a/cmd/xtract.go +++ b/cmd/xtract.go @@ -774,10 +774,19 @@ Remove Suffix %% ${FILE%%.*} -> example ` +// GLOBAL VARIABLES + +var ( + doStem bool + deStop bool +) + // TYPED CONSTANTS +// LevelType is the integer type for exploration arguments type LevelType int +// LevelType keys for exploration arguments const ( _ LevelType = iota UNIT @@ -791,8 +800,10 @@ const ( PATTERN ) +// IndentType is the integer type for XML formatting type IndentType int +// IndentType keys for XML formatting const ( SINGULARITY IndentType = iota COMPACT @@ -802,8 +813,10 @@ const ( WRAPPED ) +// OpType is the integer type for operations type OpType int +// OpType keys for operations const ( UNSET OpType = iota ELEMENT @@ -920,8 +933,10 @@ const ( UNRECOGNIZED ) +// ArgumentType is the integer type for argument classification type 
ArgumentType int +// ArgumentType keys for argument classification const ( _ ArgumentType = iota EXPLORATION @@ -930,8 +945,10 @@ const ( CUSTOMIZATION ) +// RangeType is the integer type for element range choices type RangeType int +// RangeType keys for element range choices const ( NORANGE RangeType = iota STRINGRANGE @@ -939,8 +956,10 @@ const ( INTEGERRANGE ) +// SeqEndType is used for -ucsc-based decisions type SeqEndType int +// SeqEndType keys for -ucsc-based decisions const ( _ SeqEndType = iota ISSTART @@ -948,6 +967,7 @@ const ( ISPOS ) +// SequenceType is used to record XML tag and position for -ucsc-based type SequenceType struct { Based int Which SeqEndType @@ -1776,6 +1796,7 @@ var ncbi4naToIupac = map[int]string{ // DATA OBJECTS +// Step contains parameters for executing a single command step type Step struct { Type OpType Value string @@ -1792,12 +1813,14 @@ type Step struct { Wild bool } +// Operation breaks commands into sequential steps type Operation struct { Type OpType Value string Stages []*Step } +// Block contains nested instructions for executing commands type Block struct { Visit string Parent string @@ -1814,6 +1837,7 @@ type Block struct { Subtasks []*Block } +// Limiter is used for collecting specific nodes (e.g., first and last) type Limiter struct { Obj *eutils.XMLNode Idx int @@ -1822,18 +1846,58 @@ type Limiter struct { // UTILITIES -func ParseFlag(str string) OpType { +func hasSpaceOrHyphen(str string) bool { + + for _, ch := range str { + if ch == ' ' || ch == '-' { + return true + } + } + + return false +} + +func isAllCapsOrDigits(str string) bool { + + for _, ch := range str { + if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) { + return false + } + } + + return true +} + +// sortStringByWords sorts the individual words in a string +func sortStringByWords(str string) string { + + str = eutils.RemoveCommaOrSemicolon(str) + + // check for multiple words + if hasSpaceOrHyphen(str) { + flds := strings.Fields(str) + 
sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] }) + str = strings.Join(flds, " ") + str = strings.Replace(str, "-", " ", -1) + str = eutils.CompressRunsOfSpaces(str) + str = strings.TrimRight(str, ".?:") + } + + return str +} + +func parseFlag(str string) OpType { op, ok := opTypeIs[str] if ok { return op } - if len(str) > 1 && str[0] == '-' && eutils.IsAllCapsOrDigits(str[1:]) { + if len(str) > 1 && str[0] == '-' && isAllCapsOrDigits(str[1:]) { return VARIABLE } - if len(str) > 2 && strings.HasPrefix(str, "--") && eutils.IsAllCapsOrDigits(str[2:]) { + if len(str) > 2 && strings.HasPrefix(str, "--") && isAllCapsOrDigits(str[2:]) { return ACCUMULATOR } @@ -1844,7 +1908,7 @@ func ParseFlag(str string) OpType { return UNSET } -func ParseMarkup(str, cmd string) int { +func parseMarkup(str, cmd string) int { switch str { case "fuse", "fused": @@ -1907,8 +1971,8 @@ func DebugBlock(blk *Block, depth int) { // PARSE COMMAND-LINE ARGUMENTS -// ParseArguments parses nested exploration instruction from command-line arguments -func ParseArguments(cmdargs []string, pttrn string) *Block { +// parseArguments parses nested exploration instruction from command-line arguments +func parseArguments(cmdargs []string, pttrn string) *Block { // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line // (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions) @@ -2272,7 +2336,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { if len(str) > 1 { switch str[0] { case '&': - if eutils.IsAllCapsOrDigits(str[1:]) { + if isAllCapsOrDigits(str[1:]) { status = VARIABLE str = str[1:] } else if strings.Contains(str, ":") { @@ -2373,7 +2437,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { switch status { case UNSET: - status = ParseFlag(str) + status = parseFlag(str) case POSITION: if cmds.Position != "" { fmt.Fprintf(os.Stderr, 
"\nERROR: -position '%s' conflicts with existing '%s'\n", str, cmds.Position) @@ -2424,7 +2488,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { // first character may be backslash protecting dash (undocumented) str = str[1:] } - str = eutils.SortStringByWords(str) + str = sortStringByWords(str) tsk := &Step{Type: status, Value: str} op.Stages = append(op.Stages, tsk) op = nil @@ -2560,7 +2624,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { // parse next argument nextStatus := func(str string) OpType { - status := ParseFlag(str) + status := parseFlag(str) switch status { case VARIABLE: @@ -2629,7 +2693,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { if len(item) > 1 { switch item[0] { case '&': - if eutils.IsAllCapsOrDigits(item[1:]) { + if isAllCapsOrDigits(item[1:]) { status = VARIABLE item = item[1:] } else { @@ -2901,7 +2965,7 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { } } - // ParseArguments + // parseArguments head := &Block{} @@ -2955,140 +3019,8 @@ func ParseArguments(cmdargs []string, pttrn string) *Block { return head } -// ExploreElements returns matching element values to callback -func ExploreElements(curr *eutils.XMLNode, mask, prnt, match, attrib string, wildcard, unescape bool, level int, proc func(string, int)) { - - if curr == nil || proc == nil { - return - } - - // **/Object performs deep exploration of recursive data (*/Object also supported) - deep := false - if prnt == "**" || prnt == "*" { - prnt = "" - deep = true - } - - var exploreChildren func(curr *eutils.XMLNode, acc func(string)) - - exploreChildren = func(curr *eutils.XMLNode, acc func(string)) { - - if curr.Contents != "" { - acc(curr.Contents) - } - for chld := curr.Children; chld != nil; chld = chld.Next { - if chld.Name != "" { - acc("<" + chld.Name + ">") - } - exploreChildren(chld, acc) - if chld.Name != "" { - acc("</" + chld.Name + ">") - } - } - } - - // exploreElements recursive definition - var 
exploreElements func(curr *eutils.XMLNode, skip string, lev int) - - exploreElements = func(curr *eutils.XMLNode, skip string, lev int) { - - if !deep && curr.Name == skip { - // do not explore within recursive object - return - } - - if curr.Name == match || - // parent/* matches any subfield - (match == "*" && prnt != "") || - // wildcard (internal colon) matches any namespace prefix - (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) || - (match == "" && attrib != "") { - - if prnt == "" || - curr.Parent == prnt || - (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) { - - if attrib != "" { - if curr.Attributes != "" && curr.Attribs == nil { - // parse attributes on-the-fly if queried - curr.Attribs = eutils.ParseAttributes(curr.Attributes) - } - for i := 0; i < len(curr.Attribs)-1; i += 2 { - // attributes now parsed into array as [ tag, value, tag, value, tag, value, ... ] - if curr.Attribs[i] == attrib || - (wildcard && strings.HasPrefix(attrib, ":") && strings.HasSuffix(curr.Attribs[i], attrib)) { - proc(curr.Attribs[i+1], level) - return - } - } - - } else if curr.Contents != "" { - - str := curr.Contents[:] - - if unescape && eutils.HasAmpOrNotASCII(str) { - // processing of <, >, &, ", and ' characters is now delayed until element contents is requested - str = html.UnescapeString(str) - } - - proc(str, level) - return - - } else if curr.Children != nil { - - if eutils.DoMixed() { - // match with mixed contents - send all child strings - var buffr strings.Builder - exploreChildren(curr, func(str string) { - if str != "" { - buffr.WriteString(str) - } - }) - str := buffr.String() - - // clean up reconstructed mixed content - str = eutils.DoTrimFlankingHTML(str) - if eutils.HasBadSpace(str) { - str = eutils.CleanupBadSpaces(str) - } - if eutils.HasAdjacentSpaces(str) { - str = eutils.CompressRunsOfSpaces(str) - } - if eutils.NeedsTightening(str) { - str = eutils.TightenParentheses(str) - } - if 
unescape && eutils.HasAmpOrNotASCII(str) { - str = html.UnescapeString(str) - } - - proc(str, level) - return - } - - // for XML container object, send empty string to callback to increment count - proc("", level) - // and continue exploring - - } else if curr.Attributes != "" { - - // for self-closing object, indicate presence by sending empty string to callback - proc("", level) - return - } - } - } - - for chld := curr.Children; chld != nil; chld = chld.Next { - // inner exploration is subject to recursive object exclusion - exploreElements(chld, mask, lev+1) - } - } - - exploreElements(curr, "", level) -} - -// PrintSubtree supports compression styles selected by -element "*" through "****" -func PrintSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) { +// printSubtree supports compression styles selected by -element "*" through "****" +func printSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) { if node == nil || proc == nil { return @@ -3258,8 +3190,8 @@ var ( replx map[string]*regexp.Regexp ) -// ProcessClause handles comma-separated -element arguments -func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) { +// processClause handles comma-separated -element arguments +func processClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) { if curr == nil || stages == nil { return "", false @@ -3298,9 +3230,9 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p wildcard := stage.Wild unescape := (stat != INDICES) - // exploreElements is a wrapper for ExploreElements, obtaining most 
arguments as closures + // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures exploreElements := func(proc func(string, int)) { - ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) + eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) } // sendSlice applies optional [min:max] range restriction and sends result to accumulator @@ -3579,7 +3511,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p case ORDER: exploreElements(func(str string, lvl int) { if str != "" { - str = eutils.SortStringByWords(str) + str = sortStringByWords(str) sendSlice(str) } }) @@ -3729,7 +3661,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p var buffer strings.Builder - PrintSubtree(curr, style, printAttrs, + printSubtree(curr, style, printAttrs, func(str string) { if str != "" { buffer.WriteString(str) @@ -4223,7 +4155,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p str = eutils.RepairTableMarkup(str, eutils.SPACE) str = eutils.RepairScriptMarkup(str, eutils.SPACE) str = eutils.RepairMathMLMarkup(str, eutils.SPACE) - // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in ExploreElements + // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in eutils.ExploreElements str = eutils.RemoveEmbeddedMarkup(str) } @@ -4301,7 +4233,7 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p } // optional stop word removal - if eutils.DeStop() && eutils.IsStopWord(item) { + if deStop && eutils.IsStopWord(item) { continue } @@ -4404,12 +4336,12 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p }) for _, item := range words { item = strings.ToLower(item) - if eutils.DeStop() { + if deStop { if eutils.IsStopWord(item) { continue } } - if eutils.DoStem() { + if doStem { 
item = porter2.Stem(item) item = strings.TrimSpace(item) } @@ -4448,13 +4380,13 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p continue } item = strings.ToLower(item) - if eutils.DeStop() { + if deStop { if eutils.IsStopWord(item) { past = "" continue } } - if eutils.DoStem() { + if doStem { item = porter2.Stem(item) item = strings.TrimSpace(item) } @@ -4485,12 +4417,12 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p } for _, item := range words { item = strings.ToLower(item) - if eutils.DeStop() { + if deStop { if eutils.IsStopWord(item) { continue } } - if eutils.DoStem() { + if doStem { item = porter2.Stem(item) item = strings.TrimSpace(item) } @@ -4680,8 +4612,8 @@ func ProcessClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, p return txt, true } -// ProcessInstructions performs extraction commands on a subset of XML -func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { +// processInstructions performs extraction commands on a subset of XML +func processInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { if accum == nil { return tab, ret @@ -4731,7 +4663,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, ACCENTED, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED, REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS: - txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, 
histogram) + txt, ok := processClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram) if ok { plg = "" lst = elg @@ -4744,7 +4676,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, } } case HISTOGRAM: - txt, ok := ProcessClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram) + txt, ok := processClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram) if ok { accum(txt) } @@ -4878,7 +4810,7 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, // -if "&VARIABLE" will fail if initialized with empty string "" delete(variables, varname) } else { - txt, ok := ProcessClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram) + txt, ok := processClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram) if ok { plg = "" lst = elg @@ -4910,8 +4842,8 @@ func ProcessInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, // CONDITIONAL EXECUTION USES -if AND -unless STATEMENT, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS -// ConditionsAreSatisfied tests a set of conditions to determine if extraction should proceed -func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool { +// conditionsAreSatisfied tests a set of conditions to determine if extraction should proceed +func conditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool { if curr == nil { return false @@ -4955,9 +4887,9 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask found := false number := 
"" - // exploreElements is a wrapper for ExploreElements, obtaining most arguments as closures + // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures exploreElements := func(proc func(string, int)) { - ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) + eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) } // test string or numeric constraints @@ -5014,7 +4946,7 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask return true } case RESEMBLES: - if eutils.SortStringByWords(str) == strings.ToLower(val) { + if sortStringByWords(str) == strings.ToLower(val) { return true } default: @@ -5027,13 +4959,13 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask switch ch { case '#': count := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { count++ }) val = strconv.Itoa(count) case '%': length := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { if stn != "" { length += len(stn) } @@ -5041,12 +4973,12 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask val = strconv.Itoa(length) case '^': depth := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 
depth = lvl }) val = strconv.Itoa(depth) default: - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { if stn != "" { val = stn } @@ -5075,13 +5007,13 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask switch ch { case '#': count := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { count++ }) val = strconv.Itoa(count) case '%': length := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { if stn != "" { length += len(stn) } @@ -5089,12 +5021,12 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask val = strconv.Itoa(length) case '^': depth := 0 - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { depth = lvl }) val = strconv.Itoa(depth) default: - ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { + eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { if stn != "" 
{ _, errz := strconv.Atoi(stn) if errz == nil { @@ -5391,8 +5323,8 @@ func ConditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask // RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE -// ProcessCommands visits XML nodes, performs conditional tests, and executes data extraction instructions -func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { +// processCommands visits XML nodes, performs conditional tests, and executes data extraction instructions +func processCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { if accum == nil { return tab, ret @@ -5424,23 +5356,23 @@ func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, processNode := func(node *eutils.XMLNode, idx, lvl int) { // apply -if or -unless tests - if ConditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) { + if conditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) { // execute data extraction commands if len(cmds.Commands) > 0 { - tab, ret = ProcessInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum) + tab, ret = processInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum) } // process sub commands on child node for _, sub := range cmds.Subtasks { - tab, ret = ProcessCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum) + tab, ret = processCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum) } } else { // execute commands after -else statement if len(cmds.Failure) > 0 { - tab, ret = ProcessInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, 
transform, histogram, accum) + tab, ret = processInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum) } } } @@ -5677,8 +5609,8 @@ func ProcessCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, // PROCESS ONE XML COMPONENT RECORD -// ProcessQuery perform data extraction driven by command-line arguments -func ProcessQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string { +// processQuery perform data extraction driven by command-line arguments +func processQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string { if text == "" || cmds == nil { return "" @@ -5706,7 +5638,7 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s if cmds.Position == "select" { - if ConditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) { + if conditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) { ok = true buffer.WriteString(text) ret = "\n" @@ -5715,7 +5647,7 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s } else { // start processing at top of command tree and top of XML subregion selected by -pattern - _, ret = ProcessCommands(cmds, pat, "", "", index, 1, variables, transform, histogram, + _, ret = processCommands(cmds, pat, "", "", index, 1, variables, transform, histogram, func(str string) { if str != "" { ok = true @@ -5752,8 +5684,8 @@ func ProcessQuery(text, parent string, index int, hd, tl string, transform map[s // e.g., xtract -insd complete mat_peptide "%peptide" product peptide -// ProcessINSD generates extraction commands for GenBank/RefSeq records in INSDSet format -func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string { +// processINSD generates extraction commands for GenBank/RefSeq records in INSDSet format +func processINSD(args 
[]string, isPipe, addDash, doIndex bool) []string { // legal GenBank / GenPept / RefSeq features @@ -6457,8 +6389,8 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string { // BIOTHINGS EXTRACTION COMMAND GENERATOR -// ProcessBiopath generates extraction commands for BioThings resources (undocumented) -func ProcessBiopath(args []string, isPipe bool) []string { +// processBiopath generates extraction commands for BioThings resources (undocumented) +func processBiopath(args []string, isPipe bool) []string { // nquire -get "http://myvariant.info/v1/variant/chr6:g.26093141G>A" \ // -fields clinvar.rcv.conditions.identifiers \ @@ -6515,8 +6447,8 @@ func ProcessBiopath(args []string, isPipe bool) []string { // HYDRA CITATION MATCHER COMMAND GENERATOR -// ProcessHydra generates extraction commands for NCBI's in-house citation matcher (undocumented) -func ProcessHydra(isPipe bool) []string { +// processHydra generates extraction commands for NCBI's in-house citation matcher (undocumented) +func processHydra(isPipe bool) []string { var acc []string @@ -6533,8 +6465,8 @@ func ProcessHydra(isPipe bool) []string { // ENTREZ2INDEX COMMAND GENERATOR -// ProcessE2Index generates extraction commands to create input for Entrez2Index -func ProcessE2Index(args []string, tform string, isPipe bool) []string { +// processE2Index generates extraction commands to create input for Entrez2Index +func processE2Index(args []string, tform string, isPipe bool) []string { var acc []string @@ -6548,7 +6480,29 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string { patrn := args[0] args = args[1:] - if eutils.IsAllNumeric(patrn) { + isAllNumeric := func(str string) bool { + + for _, ch := range str { + if !unicode.IsDigit(ch) && + ch != '.' 
&& + ch != '+' && + ch != '-' && + ch != '*' && + ch != '/' && + ch != ',' && + ch != '$' && + ch != '#' && + ch != '%' && + ch != '(' && + ch != ')' { + return false + } + } + + return true + } + + if isAllNumeric(patrn) { year = patrn patrn = args[0] args = args[1:] @@ -6558,10 +6512,10 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string { args = args[1:] if !isPipe { - if !eutils.DeStop() { + if !deStop { acc = append(acc, "-stops") } - if eutils.DoStem() { + if doStem { acc = append(acc, "-stems") } } @@ -6635,7 +6589,7 @@ func ProcessE2Index(args []string, tform string, isPipe bool) []string { // processes with single goroutine call defer close(out) so consumer(s) can range over channel // processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out) -func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { +func createConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if inp == nil { return nil @@ -6661,14 +6615,14 @@ func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]st if text == "" { // should never see empty input data - out <- eutils.XMLRecord{idx, "", text, nil} + out <- eutils.XMLRecord{Index: idx, Text: text} continue } - str := ProcessQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds) + str := processQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds) // send even if empty to get all record counts for reordering - out <- eutils.XMLRecord{idx, "", str, nil} + out <- eutils.XMLRecord{Index: idx, Text: str} } } @@ -6689,7 +6643,7 @@ func CreateConsumers(cmds *Block, parent, hd, tl string, transform map[string]st return out } -func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) 
<-chan eutils.XMLRecord { +func createSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { if parent == "" || indx == "" || order == nil || inp == nil { return nil @@ -6718,7 +6672,7 @@ func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan euti eutils.FindIdentifiers(text[:], parent, find, func(id string) { - id = eutils.SortStringByWords(id) + id = sortStringByWords(id) _, ok := order[id] if ok { found = true @@ -6727,12 +6681,12 @@ func CreateSelectors(parent, indx string, order map[string]bool, inp <-chan euti if !found { // identifier field not found or not in identifier list, send empty placeholder for unshuffler - out <- eutils.XMLRecord{ext.Index, "", "", nil} + out <- eutils.XMLRecord{Index: ext.Index} continue } // send selected record - out <- eutils.XMLRecord{ext.Index, "", text, nil} + out <- eutils.XMLRecord{Index: ext.Index, Text: text} } } @@ -6781,8 +6735,8 @@ func main() { doMixed := false deAccent := false doASCII := false - doStem := false - deStop := false + doStem = false + deStop = true /* doUnicode := false @@ -6892,7 +6846,7 @@ func main() { case "-stems", "-stem": doStem = true case "-stops", "-stop": - deStop = true + deStop = false // allow setting of unicode, script, and mathml flags (undocumented) case "-unicode": @@ -6951,7 +6905,7 @@ func main() { case "stems", "stem": doStem = true case "stops", "stop": - deStop = true + deStop = false case "none", "default": default: if flgs != "" { @@ -6961,9 +6915,9 @@ func main() { } /* - UnicodeFix = ParseMarkup(unicodePolicy, "-unicode") - ScriptFix = ParseMarkup(scriptPolicy, "-script") - MathMLFix = ParseMarkup(mathmlPolicy, "-mathml") + UnicodeFix = parseMarkup(unicodePolicy, "-unicode") + ScriptFix = parseMarkup(scriptPolicy, "-script") + MathMLFix = parseMarkup(mathmlPolicy, "-mathml") if UnicodeFix != NOMARKUP { doUnicode = true @@ -6994,7 +6948,7 @@ func main() { eutils.SetTunings(numProcs, numServe, 
serverRatio, chanDepth, farmSize, heapSize, goGc) - eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup, doStem, deStop, false) + eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup) // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) if stts && len(args) < 1 { @@ -7180,7 +7134,7 @@ func main() { args = args[1:] - insd := ProcessINSD(args, isPipe || usingFile, addDash, doIndex) + insd := processINSD(args, isPipe || usingFile, addDash, doIndex) if !isPipe && !usingFile { // no piped input, so write output instructions @@ -7201,7 +7155,7 @@ func main() { // -hydra filters HydraResponse output by relevance score (undocumented) if args[0] == "-hydra" { - hydra := ProcessHydra(isPipe || usingFile) + hydra := processHydra(isPipe || usingFile) if !isPipe && !usingFile { // no piped input, so write output instructions @@ -7224,7 +7178,7 @@ func main() { args = args[1:] - biopath := ProcessBiopath(args, isPipe || usingFile) + biopath := processBiopath(args, isPipe || usingFile) if !isPipe && !usingFile { // no piped input, so write output instructions @@ -7281,7 +7235,7 @@ func main() { } } - res := ProcessE2Index(args, tform, isPipe || usingFile) + res := processE2Index(args, tform, isPipe || usingFile) if !isPipe && !usingFile { // no piped input, so write output instructions @@ -7414,7 +7368,7 @@ func main() { if args[0] == "-token" { - eutils.StreamTokens("", "", rdr, + eutils.StreamTokens(rdr, func(tkn eutils.XMLToken) { recordCount++ byteCount += len(tkn.Name) + len(tkn.Attr) @@ -7609,7 +7563,7 @@ func main() { line := scanr.Text() id, _ := eutils.SplitInTwoLeft(line, "\t") - id = eutils.SortStringByWords(id) + id = sortStringByWords(id) // add identifier to map order[id] = true @@ -7618,7 +7572,7 @@ func main() { fl.Close() xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) - fchq := CreateSelectors(topPattern, indx, order, xmlq) + fchq := 
createSelectors(topPattern, indx, order, xmlq) unsq := eutils.CreateXMLUnshuffler(fchq) if xmlq == nil || fchq == nil || unsq == nil { @@ -7993,7 +7947,7 @@ func main() { // PARSE AND VALIDATE EXTRACTION ARGUMENTS // parse nested exploration instruction from command-line arguments - cmds := ParseArguments(args, topPattern) + cmds := parseArguments(args, topPattern) if cmds == nil { fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n") os.Exit(1) @@ -8016,7 +7970,7 @@ func main() { func(str string) { rec++ beginTime := time.Now() - ProcessQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds) + processQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds) endTime := time.Now() duration := endTime.Sub(beginTime) micro := int(float64(duration.Nanoseconds()) / 1e3) @@ -8066,7 +8020,7 @@ func main() { } xmlq := eutils.CreateXMLProducer(topPattern, star, trdr) - tblq := CreateConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) + tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) if xmlq == nil || tblq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n") @@ -8181,7 +8135,7 @@ func main() { cmds.Position = "" // process single selected record - res := ProcessQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds) + res := processQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds) if res != "" { fmt.Printf("%s", res) @@ -8196,7 +8150,7 @@ func main() { xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) // launch consumer goroutines to parse and explore partitioned XML objects - tblq := CreateConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) + tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) // launch unshuffler goroutine to restore order of results unsq := eutils.CreateXMLUnshuffler(tblq) @@ -8312,9 +8266,9 @@ func main() { for curr := range unsq { if beg == nil { - beg = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil} + 
beg = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} } else { - end = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil} + end = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} } recordCount++ @@ -8340,7 +8294,7 @@ func main() { first = false } else { prev = next - next = &eutils.XMLRecord{curr.Index, curr.Ident, curr.Text, nil} + next = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} } if prev != nil { diff --git a/download-ncbi-data b/download-ncbi-data index 798fc50..992df6d 100755 --- a/download-ncbi-data +++ b/download-ncbi-data @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + cmd="$1" shift diff --git a/download-pubmed b/download-pubmed index 4bf60cf..22b6f65 100755 --- a/download-pubmed +++ b/download-pubmed @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + download() { dir="$1" nquire -lst ftp.ncbi.nlm.nih.gov "pubmed" "$dir" | diff --git a/download-sequence b/download-sequence index 56f3a17..0e7039f 100755 --- a/download-sequence +++ b/download-sequence @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + filt="" while [ "$#" -gt 0 ] do @@ -1,159 +0,0 @@ -#!/bin/sh - -# =========================================================================== -# -# PUBLIC DOMAIN NOTICE -# National Center for Biotechnology Information (NCBI) -# -# This software/database is a "United States Government Work" under the -# terms of the United States Copyright Act. 
It was written as part of -# the author's official duties as a United States Government employee and -# thus cannot be copyrighted. This software/database is freely available -# to the public for use. The National Library of Medicine and the U.S. -# Government do not place any restriction on its use or reproduction. -# We would, however, appreciate having the NCBI and the author cited in -# any work or product based on this material. -# -# Although all reasonable efforts have been taken to ensure the accuracy -# and reliability of the software and data, the NLM and the U.S. -# Government do not and cannot warrant the performance or results that -# may be obtained by using this software or data. The NLM and the U.S. -# Government disclaim all warranties, express or implied, including -# warranties of performance, merchantability or fitness for any particular -# purpose. -# -# =========================================================================== -# -# File Name: eblast -# -# Author: Jonathan Kans -# -# Version Creation Date: 03/05/2021 -# -# ========================================================================== - -# read sequence from stdin -seq=$( cat ) - -# remove FASTA definition line, all whitespace including newlines -seq=$( - echo "$seq" | - grep -v '>' | - tr -d " \t\n\r" -) - -# send BLAST request -blst=$( - nquire -url https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \ - -CMD Put -PROGRAM blastp -DATABASE nr -QUERY "$seq" -) - -# extract request id from result -rid=$( - echo "$blst" | - grep "^ RID" | - sed -e 's/^ RID = //g' | - tr -d " \t\n\r" -) - -if [ -z "$rid" ] -then - echo "ERROR Unable to create RID" >&2 - exit 1 -fi - -# ASCII terminal character color variables -RD='\033[0;31m' -BL='\033[0;34m' -NC='\033[0m' - -# echo RID (to stderr) -echo "${RD}RID:${BL} $rid${NC}" >&2 - -goOn=true -count=0 - -# polling loop -while [ "$goOn" = true ] -do - - # check for result every 60 seconds to avoid server overuse - for i in $(seq 1 4) - do - 
count=$((count + 1)) - sleep 15 - # but print progress indicator every 15 seconds (to stderr) - printf "${RD}.${NC}" >&2 - done - - # send polling request - poll=$( - nquire -get https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \ - -CMD Get -FORMAT_OBJECT SearchInfo -RID "$rid" - ) - - # obtain the polling status - stts=$( - echo "$poll" | - grep " Status=" | - sed -e 's/ Status=//g' | - tr -d " \t\n\r" - ) - - case "$stts" in - WAITING ) - if [ "$count" -gt 40 ] - then - echo "ERROR Search $rid timed out" >&2 - exit 1 - fi - # continue - ;; - FAILED ) - echo "ERROR Search $rid failed" >&2 - exit 4 - ;; - UNKNOWN ) - echo "ERROR Search $rid expired" >&2 - exit 3 - ;; - READY ) - hits=$( - echo "$poll" | - grep "ThereAreHits=yes" | - sed -e 's/ThereAreHits=//g' | - tr -d " \t\n\r" - ) - # end row of progress dots with newline (to stderr) - printf "\n" >&2 - if [ "$hits" = "yes" ] - then - # set flag to exit loop - goOn=false - else - echo "ERROR No hits found for $rid" >&2 - exit 2 - fi - ;; - * ) - echo "ERROR Unknown status: $stts" >&2 - exit 5 - ;; - esac -done - -sleep 10 - -#fetch result -res=$( - nquire -get https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi \ - -CMD Get -FORMAT_TYPE XML -RID "$rid" | - transmute -format indent -doctype "-" -) - -# sort accessions by score, remove score column -echo "$res" | -xtract -pattern Hit -element Hsp_score Hit_accession | -sort -nr | -cut -f 2 - @@ -47,12 +47,12 @@ esac PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -47,12 +47,12 @@ esac PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -47,12 +47,12 @@ esac 
PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -114,6 +114,16 @@ fi PrintHelp() { echo "einfo $version" + echo "" + sfx="" + if [ "$external" = true ] + then + sfx=" - external" + elif [ "$internal" = true ] + then + sfx=" - internal" + fi + echo "$( uname -s ) - $( uname -m )${sfx}" cat << "EOF" Database Selection @@ -47,12 +47,12 @@ esac PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -47,12 +47,12 @@ esac PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + do_help() { cat <<EOF @@ -47,12 +47,12 @@ esac PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -68,7 +68,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift @@ -1418,17 +1418,13 @@ fi # although only 'or' and 'not' actually cause misinterpretation of: # -db biosample -query "package metagenome or environmental version 1 0 [PROP]" +# changed to replace all internal spaces with underscore, except leaving one +# space before field bracket, and added assembly to list of databases given +# this special processing of FILT, PROP, and ORGN controlled vocabularies + 
ProtectWithUnderscores() { - item="$1" - case "$item" in - *" and "* | *" or "* | *" not "* ) - echo "$item" | sed -e "s/ and /_and_/g; s/ or /_or_/g; s/ not /_not_/g" - ;; - * ) - echo "$item" - ;; - esac + echo "$1" | sed -e 's/ \[/\[/g; s/ /_/g; s/\[/ \[/g; s/\[/ \[/g; s/_ \[/ \[/g' } ProcessEntrezQuery() { @@ -1443,16 +1439,17 @@ ProcessEntrezQuery() { while read item do item=$( echo "$item" | sed -e 's/^ *//g; s/ *$//g; s/ */ /g' ) - case "$item" in + opt=$( echo "$item" | tr '[:upper:]' '[:lower:]' ) + case "$opt" in "" ) ;; - *"[FILT]" | *"[Filter]" | *"[filter]" ) + *"[filt]" | *"[filter]" ) ProtectWithUnderscores "$item" ;; - *"[PROP]" | *"[Properties]" | *"[properties]" ) + *"[prop]" | *"[properties]" ) ProtectWithUnderscores "$item" ;; - *"[ORGN]" | *"[Organism]" | *"[organism]" ) + *"[orgn]" | *"[organism]" ) ProtectWithUnderscores "$item" ;; * ) @@ -1463,7 +1460,7 @@ ProcessEntrezQuery() { } case "$dbase" in - nuc* | prot* | gene | genome | popset | taxonomy | clinvar | cdd | sra | ipg | bio* ) + nuc* | prot* | gene | genome | popset | taxonomy | assembly | clinvar | cdd | sra | ipg | bio* ) case "$query" in *\|* ) # skip if query contains an embedded vertical bar, reserved for splitting in ProcessEntrezQuery @@ -91,12 +91,12 @@ fi PERL="" -internal=no +internal=false while [ "$#" -ne 0 ] do case "$1" in -internal ) - internal=yes + internal=true shift ;; -newmode ) @@ -112,7 +112,7 @@ do ;; esac done -if [ "$internal" = yes ] +if [ "$internal" = true ] then set _ -internal "$@" shift diff --git a/eutils/align.go b/eutils/align.go new file mode 100644 index 0000000..9e20a9a --- /dev/null +++ b/eutils/align.go @@ -0,0 +1,310 @@ +// =========================================================================== +// +// PUBLIC DOMAIN NOTICE +// National Center for Biotechnology Information (NCBI) +// +// This software/database is a "United States Government Work" under the +// terms of the United States Copyright Act. 
It was written as part of +// the author's official duties as a United States Government employee and +// thus cannot be copyrighted. This software/database is freely available +// to the public for use. The National Library of Medicine and the U.S. +// Government do not place any restriction on its use or reproduction. +// We would, however, appreciate having the NCBI and the author cited in +// any work or product based on this material. +// +// Although all reasonable efforts have been taken to ensure the accuracy +// and reliability of the software and data, the NLM and the U.S. +// Government do not and cannot warrant the performance or results that +// may be obtained by using this software or data. The NLM and the U.S. +// Government disclaim all warranties, express or implied, including +// warranties of performance, merchantability or fitness for any particular +// purpose. +// +// =========================================================================== +// +// File Name: align.go +// +// Author: Jonathan Kans +// +// ========================================================================== + +package eutils + +import ( + "bufio" + "fmt" + "io" + "os" + "strconv" + "strings" + "unicode/utf8" +) + +// AlignColumns aligns a tab-delimited table by individual column widths +func AlignColumns(inp io.Reader, margin, padding int, align string) <-chan string { + + if inp == nil { + return nil + } + + out := make(chan string, chanDepth) + if out == nil { + fmt.Fprintf(os.Stderr, "Unable to create alignment channel\n") + os.Exit(1) + } + + spcs := " " + + mrg := "" + pad := " " + + lettrs := make(map[int]rune) + lst := 'l' + + if margin > 0 && margin < 30 { + mrg = spcs[0:margin] + } + + if padding > 0 && padding < 30 { + pad = spcs[0:padding] + } + + for i, ch := range align { + lettrs[i] = ch + lst = ch + } + + alignTable := func(inp io.Reader, out chan<- string) { + + // close channel when all chunks have been sent + defer close(out) + + var arry []string + + 
width := make(map[int]int) + whole := make(map[int]int) + fract := make(map[int]int) + + scanr := bufio.NewScanner(inp) + + row := 0 + numCols := 0 + + // allows leading plus or minus, digits interspersed with optional commas, decimal point, and digits + isNumeric := func(str string) bool { + + hasNum := false + hasPeriod := false + + for i, ch := range str { + switch ch { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + hasNum = true + case '+', '-': + if i > 0 { + return false + } + case '.': + hasPeriod = true + case ',': + if hasPeriod { + return false + } + default: + return false + } + } + + return hasNum + } + + processLine := func(line string) string { + + var flds []string + + cols := strings.Split(line, "\t") + if numCols == 0 { + numCols = len(cols) + } else if numCols != len(cols) { + fmt.Fprintf(os.Stderr, "ERROR: Mismatched number of columns in row ") + fmt.Fprintf(os.Stderr, strconv.Itoa(row)) + fmt.Fprintf(os.Stderr, ": actual ") + fmt.Fprintf(os.Stderr, strconv.Itoa(len(cols))) + fmt.Fprintf(os.Stderr, ", expected ") + fmt.Fprintf(os.Stderr, strconv.Itoa(numCols)) + fmt.Fprintf(os.Stderr, "\n") + // os.Exit(1) + } + + for i, str := range cols { + + str = CompressRunsOfSpaces(str) + str = strings.TrimSpace(str) + + flds = append(flds, str) + + // determine maximum length in each column + ln := utf8.RuneCountInString(str) + if ln > width[i] { + width[i] = ln + } + + code, ok := lettrs[i] + if !ok { + code = lst + } + + switch code { + case 'n', 'N', 'z', 'Z': + if isNumeric(str) { + // determine maximum length of decimal number parts + wh, fr := SplitInTwoLeft(str, ".") + if fr != "" { + fr = "." 
+ fr + } + + lf := utf8.RuneCountInString(wh) + if lf > whole[i] { + whole[i] = lf + } + rt := utf8.RuneCountInString(fr) + if rt > fract[i] { + fract[i] = rt + } + ln = whole[i] + fract[i] + if ln > width[i] { + width[i] = ln + } + } + } + } + + return strings.Join(flds, "\t") + } + + for i := 0; i < numCols; i++ { + + code, ok := lettrs[i] + if !ok { + code = lst + } + + switch code { + case 'n', 'N', 'z', 'Z': + // adjust maximum widths with aligned decimal points + ln := whole[i] + fract[i] + if ln > width[i] { + width[i] = ln + } + } + } + + // clean up spaces, calculate column widths + for scanr.Scan() { + + row++ + line := scanr.Text() + if line == "" { + continue + } + + line = processLine(line) + arry = append(arry, line) + } + + var buffer strings.Builder + + for _, line := range arry { + + buffer.Reset() + + cols := strings.Split(line, "\t") + + btwn := mrg + for i, str := range cols { + + buffer.WriteString(btwn) + + code, ok := lettrs[i] + if !ok { + code = lst + } + + ln := utf8.RuneCountInString(str) + mx := width[i] + diff := mx - ln + lft := 0 + rgt := 0 + lftPad := " " + rgtPad := " " + + if diff > 0 { + switch code { + case 'l': + rgt = diff + case 'c': + lft = diff / 2 + rgt = diff - lft + case 'r': + lft = diff + case 'n', 'N', 'z', 'Z': + lft = diff + if isNumeric(str) { + switch code { + case 'N': + rgtPad = "0" + case 'z': + lftPad = "0" + case 'Z': + lftPad = "0" + rgtPad = "0" + } + sn := whole[i] + rc := fract[i] + wh, fr := SplitInTwoLeft(str, ".") + if fract[i] > 0 { + if fr == "" { + fr = "." + } else { + fr = "." 
+ fr + } + lf := utf8.RuneCountInString(wh) + lft = sn - lf + rt := utf8.RuneCountInString(fr) + rgt = rc - rt + str = wh + fr + } + } + default: + rgt = diff + } + } + + for lft > 0 { + lft-- + buffer.WriteString(lftPad) + } + + buffer.WriteString(str) + btwn = pad + + for rgt > 0 { + rgt-- + buffer.WriteString(rgtPad) + } + } + + txt := buffer.String() + txt = strings.TrimRight(txt, " ") + "\n" + + if txt != "" { + out <- txt + } + } + } + + // launch single alignment goroutine + go alignTable(inp, out) + + return out +} diff --git a/eutils/chan.go b/eutils/chan.go new file mode 100644 index 0000000..3b6d2c1 --- /dev/null +++ b/eutils/chan.go @@ -0,0 +1,113 @@ +// =========================================================================== +// +// PUBLIC DOMAIN NOTICE +// National Center for Biotechnology Information (NCBI) +// +// This software/database is a "United States Government Work" under the +// terms of the United States Copyright Act. It was written as part of +// the author's official duties as a United States Government employee and +// thus cannot be copyrighted. This software/database is freely available +// to the public for use. The National Library of Medicine and the U.S. +// Government do not place any restriction on its use or reproduction. +// We would, however, appreciate having the NCBI and the author cited in +// any work or product based on this material. +// +// Although all reasonable efforts have been taken to ensure the accuracy +// and reliability of the software and data, the NLM and the U.S. +// Government do not and cannot warrant the performance or results that +// may be obtained by using this software or data. The NLM and the U.S. +// Government disclaim all warranties, express or implied, including +// warranties of performance, merchantability or fitness for any particular +// purpose. 
+// +// =========================================================================== +// +// File Name: chan.go +// +// Author: Jonathan Kans +// +// ========================================================================== + +package eutils + +import ( + "io" + "os" + "strings" +) + +// stringChanReader connects a string output channel to an io.Reader interface +type stringChanReader struct { + c <-chan string + s string +} + +func (r *stringChanReader) Read(b []byte) (n int, err error) { + + if r.s != "" { + n = copy(b, []byte(r.s)) + r.s = r.s[n:] + return + } + + for str := range r.c { + r.s = str + n = copy(b, []byte(r.s)) + r.s = r.s[n:] + return + } + + return 0, io.EOF +} + +// ChanToReader converts a string channel to an io.Reader +func ChanToReader(inp <-chan string) io.Reader { + + if inp == nil { + return nil + } + + return &stringChanReader{c: inp} +} + +// ChanToStdout sends a string channel to stdout +func ChanToStdout(inp <-chan string) { + + if inp == nil { + return + } + + last := "" + + for str := range inp { + last = str + os.Stdout.WriteString(str) + } + + if !strings.HasSuffix(last, "\n") { + os.Stdout.WriteString("\n") + } +} + +// ChanToString converts a string channel to a string +func ChanToString(inp <-chan string) string { + + if inp == nil { + return "" + } + + var buffer strings.Builder + + last := "" + + for str := range inp { + last = str + buffer.WriteString(str) + } + + if !strings.HasSuffix(last, "\n") { + buffer.WriteString("\n") + } + + return buffer.String() +} diff --git a/eutils/format.go b/eutils/format.go index 6dcb6fd..010cd1a 100644 --- a/eutils/format.go +++ b/eutils/format.go @@ -368,7 +368,7 @@ func xmlFormatter(rcrd, prnt string, inp <-chan XMLToken, offset int, doXML bool return txt } - doCleanup := func(tkn XMLToken, nxtTag int, nxtName, nxtAttr string) { + cleanToken := func(tkn XMLToken, nxtTag int, nxtName, nxtAttr string) { if skip > 0 { skip-- @@ -541,7 +541,7 @@ func xmlFormatter(rcrd, prnt string, inp <-chan 
XMLToken, offset int, doXML bool } if primed { - doCleanup(prev, tkn.Tag, tkn.Name, tkn.Attr) + cleanToken(prev, tkn.Tag, tkn.Name, tkn.Attr) } prev = XMLToken{tkn.Tag, tkn.Cont, tkn.Name, tkn.Attr, tkn.Index, tkn.Line} diff --git a/eutils/misc.go b/eutils/misc.go index fe06c09..61b073a 100644 --- a/eutils/misc.go +++ b/eutils/misc.go @@ -35,8 +35,8 @@ import ( "golang.org/x/text/runes" "golang.org/x/text/transform" "golang.org/x/text/unicode/norm" + "html" "os" - "sort" "strconv" "strings" "sync" @@ -768,6 +768,56 @@ func CleanupContents(str string, ascii, amper, mixed bool) string { return str } +// CleanupQuery performs optional operations on XML query strings +func CleanupQuery(str string, exactMatch, removeBrackets bool) string { + + if exactMatch { + str = html.EscapeString(str) + } + + // cleanup string + if IsNotASCII(str) { + str = DoAccentTransform(str) + if HasUnicodeMarkup(str) { + str = RepairUnicodeMarkup(str, SPACE) + } + } + + if exactMatch { + str = strings.ToLower(str) + } + + if HasBadSpace(str) { + str = CleanupBadSpaces(str) + } + + if removeBrackets { + if HasAngleBracket(str) { + str = RepairEncodedMarkup(str) + str = RepairScriptMarkup(str, SPACE) + str = RepairMathMLMarkup(str, SPACE) + // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in ExploreElements + str = RemoveEmbeddedMarkup(str) + } + } + + if HasAmpOrNotASCII(str) { + str = html.UnescapeString(str) + } + + if IsNotASCII(str) { + if HasGreek(str) { + str = SpellGreek(str) + str = CompressRunsOfSpaces(str) + } + if !exactMatch { + str = UnicodeToASCII(str) + } + } + + return str +} + // CompressRunsOfSpaces turns runs of spaces into a single space func CompressRunsOfSpaces(str string) string { @@ -1012,47 +1062,6 @@ func FixSpecialCases(str string) string { return str } -// FixThemeCases expands Global Network of Biomedical Relationships theme abbreviations -func FixThemeCases(str string) string { - - if !strings.Contains(str, "[thme]") && 
!strings.Contains(str, "[conv]") { - return str - } - - var arry []string - - terms := strings.Fields(str) - - for _, item := range terms { - - switch item { - case "a+": - arry = append(arry, "ap") - case "e+": - arry = append(arry, "ep") - case "ec+": - arry = append(arry, "ecp") - case "eg+": - arry = append(arry, "egp") - case "v+": - arry = append(arry, "vp") - case "a-": - arry = append(arry, "am") - case "e-": - arry = append(arry, "em") - case "ec-": - arry = append(arry, "ecm") - default: - arry = append(arry, item) - } - } - - // reconstruct string from transformed words - str = strings.Join(arry, " ") - - return str -} - // FlattenMathML removes embedded MathML structure func FlattenMathML(str string, policy int) string { @@ -1285,30 +1294,6 @@ func HasHyphenOrApostrophe(str string) bool { return false } -// HasPlusOrMinus reports on plus or minus symbols -func HasPlusOrMinus(str string) bool { - - for _, ch := range str { - if ch == '-' || ch == '+' { - return true - } - } - - return false -} - -// HasSpaceOrHyphen reports if multiple words exist in string -func HasSpaceOrHyphen(str string) bool { - - for _, ch := range str { - if ch == ' ' || ch == '-' { - return true - } - } - - return false -} - // HasUnicodeMarkup checks for Unicode superscript or subscript characters func HasUnicodeMarkup(str string) bool { @@ -1389,18 +1374,6 @@ func HTMLRepair(str string) (string, bool) { return res, ok } -// IsAllCapsOrDigits matches upper-case letters or digits -func IsAllCapsOrDigits(str string) bool { - - for _, ch := range str { - if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) { - return false - } - } - - return true -} - // IsAllDigits matches only digits func IsAllDigits(str string) bool { @@ -1425,29 +1398,6 @@ func IsAllDigitsOrPeriod(str string) bool { return true } -// IsAllNumeric accepts digits and arithmetic operator symbols -func IsAllNumeric(str string) bool { - - for _, ch := range str { - if !unicode.IsDigit(ch) && - ch != '.' 
&& - ch != '+' && - ch != '-' && - ch != '*' && - ch != '/' && - ch != ',' && - ch != '$' && - ch != '#' && - ch != '%' && - ch != '(' && - ch != ')' { - return false - } - } - - return true -} - // IsNotASCII returns true for any character greater than 7-bits func IsNotASCII(str string) bool { @@ -2170,57 +2120,6 @@ func RepairUnicodeMarkup(str string, policy int) string { return buffer.String() } -// ReverseComplement returns the reverse complement of a sequence -func ReverseComplement(seq string) string { - - runes := []rune(seq) - // reverse sequence letters - middle base in odd-length sequence is not touched - for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { - runes[i], runes[j] = runes[j], runes[i] - } - found := false - // now complement every base, also handling uracil, leaving case intact - for i, ch := range runes { - runes[i], found = revComp[ch] - if !found { - runes[i] = 'X' - } - } - seq = string(runes) - - return seq -} - -// SequenceReverse reverses a sequence, but does not complement the bases -func SequenceReverse(seq string) string { - - runes := []rune(seq) - // reverse sequence letters - middle base in odd-length sequence is not touched - for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { - runes[i], runes[j] = runes[j], runes[i] - } - seq = string(runes) - - return seq -} - -// SortStringByWords sorts the individual words in a string -func SortStringByWords(str string) string { - - str = RemoveCommaOrSemicolon(str) - - if HasSpaceOrHyphen(str) { - flds := strings.Fields(str) - sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] }) - str = strings.Join(flds, " ") - str = strings.Replace(str, "-", " ", -1) - str = CompressRunsOfSpaces(str) - str = strings.TrimRight(str, ".?:") - } - - return str -} - // SpellGreek spells Greek letters (e..g, alpha, beta) for easier searching func SpellGreek(str string) string { diff --git a/eutils/normal.go b/eutils/normal.go index de06363..f6a4505 100644 --- a/eutils/normal.go +++ 
b/eutils/normal.go @@ -50,7 +50,7 @@ func NormalizeXML(rdr <-chan XMLBlock, db string) <-chan string { os.Exit(1) } - tknq := CreateTokenizer("", "", rdr) + tknq := CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create normalize tokenizer\n") diff --git a/eutils/parse.go b/eutils/parse.go index 23c2403..407d0a2 100644 --- a/eutils/parse.go +++ b/eutils/parse.go @@ -32,6 +32,7 @@ package eutils import ( "fmt" + "html" "os" "strings" ) @@ -102,7 +103,7 @@ type XMLToken struct { Line int } -// ParseAttributes produces tag/value pairs, only run on request +// ParseAttributes produces tag/value pairs, only run on request. func ParseAttributes(attrb string) []string { if attrb == "" { @@ -195,9 +196,9 @@ func ParseAttributes(attrb string) []string { return arry } -// parseXML calls XML parser on a partitioned string or on an XML reader, optimized for -// maximum processing speed, sending tokens for CDATA and COMMENT sections, and optionally -// tracking line numbers +// parseXML calls XML parser on a partitioned string or on an XMLBlock channel of trimmed strings. +// It is optimized for maximum processing speed, sends tokens for CDATA and COMMENT sections (for +// unpacking by NormalizeXML), and optionally tracks line numbers (for ValidateXML). 
func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), find *XMLFind, ids func(string)) (*XMLNode, string) { if record == "" && (inp == nil || tokens == nil) { @@ -216,6 +217,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), which := NOTAG skipTo := "" + // updateLineCount is used to keep track of the correct line count for XML validation updateLineCount := func(max int) { // count lines for i := lag; i < max; i++ { @@ -226,7 +228,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), lag = Idx } - // calculate for warning messages, do not update lineNum or lag variables + // currentLineCount calculates correct line for warning messages, does not update lineNum or lag variables currentLineCount := func(max int) int { line := lineNum for i := lag; i < max; i++ { @@ -237,7 +239,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), return line } - // get next XML token + // nextToken returns the type and content fields for the next XML token nextToken := func(idx int) (int, int, string, string, int) { if record == "" { @@ -669,7 +671,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), farmMax := farmSize farmItems := make([]XMLNode, farmMax) - // allocate multiple nodes in a large array for memory management efficiency + // nextNode allocates multiple nodes in a large array for memory management efficiency nextNode := func(strt, attr, prnt string) *XMLNode { // if farm array slots used up, allocate new array @@ -699,13 +701,13 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), // parseSpecial recursive definition var parseSpecial func(string, string, string) (*XMLNode, bool) - // parse XML tags into tree structure for searching, no contentMods flags set + // parseSpecial parses XML tags into tree structure for searching, no contentMods flags set parseSpecial = func(strt, attr, 
prnt string) (*XMLNode, bool) { var obj *XMLNode ok := true - // obtain next node from farm + // nextNode obtains next node from farm node := nextNode(strt, attr, prnt) if node == nil { return nil, false @@ -785,7 +787,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), // parseLevel recursive definition var parseLevel func(string, string, string) (*XMLNode, bool) - // parse XML tags into tree structure for searching, some contentMods flags set + // parseLevel parses XML tags into tree structure for searching, some contentMods flags set parseLevel = func(strt, attr, prnt string) (*XMLNode, bool) { var obj *XMLNode @@ -902,7 +904,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), // parseIndex recursive definition var parseIndex func(string, string, string) string - // parse XML tags looking for trie index element + // parseIndex parses XML tags looking for trie index element parseIndex = func(strt, attr, prnt string) string { versn := "" @@ -1098,7 +1100,7 @@ func parseXML(record, parent string, inp <-chan XMLBlock, tokens func(XMLToken), return top, "" } -// ParseRecord is the main public access to parseXML +// ParseRecord is the main public access to parseXML. func ParseRecord(text, parent string) *XMLNode { pat, _ := parseXML(text, parent, nil, nil, nil, nil) @@ -1106,7 +1108,7 @@ func ParseRecord(text, parent string) *XMLNode { return pat } -// FindIdentifier returns a single identifier +// FindIdentifier returns a single identifier. func FindIdentifier(text, parent string, find *XMLFind) string { _, id := parseXML(text, parent, nil, nil, find, nil) @@ -1114,19 +1116,19 @@ func FindIdentifier(text, parent string, find *XMLFind) string { return id } -// FindIdentifiers returns a set of identifiers through a callback +// FindIdentifiers returns a set of identifiers through a callback. 
func FindIdentifiers(text, parent string, find *XMLFind, ids func(string)) { parseXML(text, parent, nil, nil, find, ids) } -// StreamTokens streams tokens through a callback -func StreamTokens(text, parent string, inp <-chan XMLBlock, streamer func(tkn XMLToken)) { +// StreamTokens streams tokens from a reader through a callback. +func StreamTokens(inp <-chan XMLBlock, streamer func(tkn XMLToken)) { - parseXML(text, parent, inp, streamer, nil, nil) + parseXML("", "", inp, streamer, nil, nil) } -// StreamValues streams token values through a callback +// StreamValues streams token values from a parsed record through a callback. func StreamValues(text, parent string, stream func(string, string, string)) { elementName := "" @@ -1148,8 +1150,8 @@ func StreamValues(text, parent string, stream func(string, string, string)) { parseXML(text, parent, nil, streamer, nil, nil) } -// CreateTokenizer streams tokens through a channel -func CreateTokenizer(text, parent string, inp <-chan XMLBlock) <-chan XMLToken { +// CreateTokenizer streams tokens through a channel. +func CreateTokenizer(inp <-chan XMLBlock) <-chan XMLToken { if inp == nil { return nil @@ -1162,17 +1164,153 @@ func CreateTokenizer(text, parent string, inp <-chan XMLBlock) <-chan XMLToken { } // xmlTokenizer sends XML tokens through channel - xmlTokenizer := func(text, parent string, inp <-chan XMLBlock, out chan<- XMLToken) { + xmlTokenizer := func(inp <-chan XMLBlock, out chan<- XMLToken) { // close channel when all records have been processed defer close(out) // parse XML and send tokens through channel - parseXML(text, parent, inp, func(tkn XMLToken) { out <- tkn }, nil, nil) + parseXML("", "", inp, func(tkn XMLToken) { out <- tkn }, nil, nil) } // launch single tokenizer goroutine - go xmlTokenizer(text, parent, inp, out) + go xmlTokenizer(inp, out) return out } + +// ExploreElements returns matching element values to callback. 
+func ExploreElements(curr *XMLNode, mask, prnt, match, attrib string, wildcard, unescape bool, level int, proc func(string, int)) { + + if curr == nil || proc == nil { + return + } + + // **/Object performs deep exploration of recursive data (*/Object also supported) + deep := false + if prnt == "**" || prnt == "*" { + prnt = "" + deep = true + } + + // exploreChildren recursive definition + var exploreChildren func(curr *XMLNode, acc func(string)) + + // exploreChildren handles mixed-content chains of embedded tags + exploreChildren = func(curr *XMLNode, acc func(string)) { + + if curr.Contents != "" { + acc(curr.Contents) + } + for chld := curr.Children; chld != nil; chld = chld.Next { + if chld.Name != "" { + acc("<" + chld.Name + ">") + } + exploreChildren(chld, acc) + if chld.Name != "" { + acc("</" + chld.Name + ">") + } + } + } + + // exploreElements recursive definition + var exploreElements func(curr *XMLNode, skip string, lev int) + + // exploreElements visits nodes looking for matches to requested object + exploreElements = func(curr *XMLNode, skip string, lev int) { + + if !deep && curr.Name == skip { + // do not explore within recursive object + return + } + + if curr.Name == match || + // parent/* matches any subfield + (match == "*" && prnt != "") || + // wildcard (internal colon) matches any namespace prefix + (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) || + (match == "" && attrib != "") { + + if prnt == "" || + curr.Parent == prnt || + (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) { + + if attrib != "" { + if curr.Attributes != "" && curr.Attribs == nil { + // parse attributes on-the-fly if queried + curr.Attribs = ParseAttributes(curr.Attributes) + } + for i := 0; i < len(curr.Attribs)-1; i += 2 { + // attributes now parsed into array as [ tag, value, tag, value, tag, value, ... 
] + if curr.Attribs[i] == attrib || + (wildcard && strings.HasPrefix(attrib, ":") && strings.HasSuffix(curr.Attribs[i], attrib)) { + proc(curr.Attribs[i+1], level) + return + } + } + + } else if curr.Contents != "" { + + str := curr.Contents[:] + + if unescape && HasAmpOrNotASCII(str) { + // processing of <, >, &, ", and ' characters is now delayed until element contents is requested + str = html.UnescapeString(str) + } + + proc(str, level) + return + + } else if curr.Children != nil { + + if doMixed { + // match with mixed contents - send all child strings + var buffr strings.Builder + exploreChildren(curr, func(str string) { + if str != "" { + buffr.WriteString(str) + } + }) + str := buffr.String() + + // clean up reconstructed mixed content + str = DoTrimFlankingHTML(str) + if HasBadSpace(str) { + str = CleanupBadSpaces(str) + } + if HasAdjacentSpaces(str) { + str = CompressRunsOfSpaces(str) + } + if NeedsTightening(str) { + str = TightenParentheses(str) + } + if unescape && HasAmpOrNotASCII(str) { + str = html.UnescapeString(str) + } + + proc(str, level) + return + } + + // for XML container object, send empty string to callback to increment count + proc("", level) + // and continue exploring + + } else if curr.Attributes != "" { + + // for self-closing object, indicate presence by sending empty string to callback + proc("", level) + return + } + } + } + + for chld := curr.Children; chld != nil; chld = chld.Next { + // inner exploration is subject to recursive object exclusion + exploreElements(chld, mask, lev+1) + } + } + + // start recursive exploration from current scope + exploreElements(curr, "", level) +} diff --git a/eutils/spdi.go b/eutils/spdi.go index 40b7177..177e17d 100644 --- a/eutils/spdi.go +++ b/eutils/spdi.go @@ -320,3 +320,37 @@ func SequenceExtract(seq, featLoc string) string { return buffer.String() } + +// ReverseComplement returns the reverse complement of a sequence +func ReverseComplement(seq string) string { + + runes := []rune(seq) + 
// reverse sequence letters - middle base in odd-length sequence is not touched + for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { + runes[i], runes[j] = runes[j], runes[i] + } + found := false + // now complement every base, also handling uracil, leaving case intact + for i, ch := range runes { + runes[i], found = revComp[ch] + if !found { + runes[i] = 'X' + } + } + seq = string(runes) + + return seq +} + +// SequenceReverse reverses a sequence, but does not complement the bases +func SequenceReverse(seq string) string { + + runes := []rune(seq) + // reverse sequence letters - middle base in odd-length sequence is not touched + for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { + runes[i], runes[j] = runes[j], runes[i] + } + seq = string(runes) + + return seq +} diff --git a/eutils/split.go b/eutils/split.go index dcb940e..a01c866 100644 --- a/eutils/split.go +++ b/eutils/split.go @@ -34,20 +34,25 @@ import ( "strings" ) -// PartitionPattern splits XML input from <pattern> to </pattern> and sends individual records to a callback +// PartitionPattern splits XML input from <pattern> to </pattern> and sends +// individual records to a callback. Requiring the input to be an XMLBlock +// channel of trimmed strings, generated by CreateXMLStreamer, simplifies the +// code by eliminating the need to check for an incomplete object tag at the end. func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) { if pat == "" || inp == nil || proc == nil { return } + // Scanner stores the precomputed Boyer-Moore-Horspool pattern matching table. + // By experiment, this was slightly (but reproducibly) faster than the Boyer-Moore-Sunday variant. type Scanner struct { Pattern string PatLength int CharSkip [256]int } - // initialize <pattern> to </pattern> scanner + // newScanner initializes <pattern> to </pattern> scanner. 
newScanner := func(pattern string) *Scanner { if pattern == "" { @@ -74,7 +79,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) return scr } - // check surroundings of match candidate + // isAnElement checks surroundings of match candidate. isAnElement := func(text string, lf, rt, mx int) bool { if (lf >= 0 && text[lf] == '<') || (lf > 0 && text[lf] == '/' && text[lf-1] == '<') { @@ -86,7 +91,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) return false } - // modified Boyer-Moore-Horspool search function + // findNextMatch is a modified Boyer-Moore-Horspool search function for maximum partitioning speed. findNextMatch := func(scr *Scanner, text string, offset int) (int, int, int) { if scr == nil || text == "" { @@ -132,8 +137,10 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) return -1, -1, -1 } + // PatternType is the integer type for XML tag classification type PatternType int + // PatternType keys for XML parsing const ( NOPATTERN PatternType = iota STARTPATTERN @@ -141,7 +148,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) STOPPATTERN ) - // find next element with pattern name + // nextPattern finds next element with pattern name. nextPattern := func(scr *Scanner, text string, pos int) (PatternType, int, int, int) { if scr == nil || text == "" { @@ -168,8 +175,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) } } - // -pattern Object construct - + // doNormal handles -pattern Object construct, keeping track of nesting level. 
doNormal := func() { // current depth of -pattern objects @@ -239,9 +245,9 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) } } - // -pattern Parent/* construct now works with catenated files, but not if components - // are recursive or self-closing objects, process those through -format first - + // doStar handles -pattern Parent/* construct for heterogeneous objects. It now works + // with concatenated files, but not if components are recursive or self-closing objects. + // Process the latter through transmute -format -self first. doStar := func() { // current depth of -pattern objects @@ -302,7 +308,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) } if tag[0] == '/' { if strings.HasPrefix(tag[1:], pat) { - //should be </pattern> at end, want to continue if catenated files + //should be </pattern> at end, want to continue if concatenated files return "/" } return "" @@ -335,7 +341,7 @@ func PartitionPattern(pat, star string, inp <-chan XMLBlock, proc func(string)) return } - // check for catenated parent set files + // check for concatenated parent set files if tag[0] == '/' { scr = newScanner(pat) if scr == nil { diff --git a/eutils/utils.go b/eutils/utils.go index 7ec3638..a2f8cb6 100644 --- a/eutils/utils.go +++ b/eutils/utils.go @@ -34,12 +34,10 @@ import ( "fmt" "github.com/klauspost/cpuid" "github.com/pbnjay/memory" - "io" "os" "runtime" "runtime/debug" "strconv" - "strings" "time" ) @@ -83,8 +81,6 @@ var ( doASCII bool doCompress bool doCleanup bool - doStem bool - deStop bool ) // additional options @@ -189,7 +185,7 @@ func SetTunings(nmProcs, nmServe, svRatio, chnDepth, frmSize, hepSize, gogc int) } // SetOptions sets processing options -func SetOptions(strict, mixed, accent, ascii, compress, cleanup, stems, stops, count bool) { +func SetOptions(strict, mixed, accent, ascii, compress, cleanup bool) { doStrict = strict doMixed = mixed @@ -200,10 +196,7 @@ func SetOptions(strict, mixed, 
accent, ascii, compress, cleanup, stems, stops, c doCompress = compress doCleanup = cleanup - doStem = stems - deStop = !stops - - countLines = count + countLines = false // set dependent flags countLines = doMixed @@ -223,122 +216,16 @@ func NumServe() int { return numServe } -// DoStrict returns the -strict value -func DoStrict() bool { - - return doStrict -} - -// DoMixed returns the -mixed value -func DoMixed() bool { - - return doMixed -} - -// DeAccent returns the -accent value -func DeAccent() bool { - - return deAccent -} - -// DoASCII returns the -ascii value -func DoASCII() bool { - - return doASCII -} - -// DoCompress returns the -compress value -func DoCompress() bool { - - return doCompress -} - -// DeStop returns the -stops value -func DeStop() bool { - - return deStop -} - -// DoStem returns the -stems value -func DoStem() bool { - - return doStem -} - -// stringChanReader connect a string output channel to an io.Reader interface -type stringChanReader struct { - c <-chan string - s string -} - -func (r *stringChanReader) Read(b []byte) (n int, err error) { - - if r.s != "" { - n = copy(b, []byte(r.s)) - r.s = r.s[n:] - return - } - - for str := range r.c { - r.s = str - n = copy(b, []byte(r.s)) - r.s = r.s[n:] - return - } - - return 0, io.EOF -} - -// ChanToIoReader converts a string channel to an ioReader -func ChanToIoReader(inp <-chan string) io.Reader { +// GetTunings returns performance parameter values +func GetTunings() (nmProcs, nmServe, svRatio, chnDepth, frmSize, hepSize, gogc int) { - if inp == nil { - return nil - } - - return &stringChanReader{c: inp, s: ""} + return numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc } -// ChanToStdout sends a string channel to stdout -func ChanToStdout(inp <-chan string) { - - if inp == nil { - return - } - - last := "" - - for str := range inp { - last = str - os.Stdout.WriteString(str) - } - - if !strings.HasSuffix(last, "\n") { - os.Stdout.WriteString("\n") - } -} - -// 
ChanToString converts a string channel to a string -func ChanToString(inp <-chan string) string { - - if inp == nil { - return "" - } - - var buffer strings.Builder - - last := "" - - for str := range inp { - last = str - buffer.WriteString(str) - } - - if !strings.HasSuffix(last, "\n") { - buffer.WriteString("\n") - } +// GetOptions returns processing option values +func GetOptions() (strict, mixed, accent, ascii, compress, cleanup bool) { - return buffer.String() + return doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup } // GetNumericArg returns an integer argument, reporting an error if no remaining arguments @@ -556,7 +443,7 @@ func init() { inAsnBits['\''] = false // initialize reading and cleaning options with default values - SetOptions(false, false, false, false, false, false, false, false, false) + SetOptions(false, false, false, false, false, false) // initialize performance tuning variables with default values SetTunings(0, 0, 0, 0, 0, 0, 0) diff --git a/eutils/valid.go b/eutils/valid.go index 0c30530..9bddf45 100644 --- a/eutils/valid.go +++ b/eutils/valid.go @@ -45,7 +45,7 @@ func ValidateXML(rdr <-chan XMLBlock, fnd string, html bool) int { countLines = true - tknq := CreateTokenizer("", "", rdr) + tknq := CreateTokenizer(rdr) if tknq == nil { fmt.Fprintf(os.Stderr, "\nERROR: Unable to create validator tokenizer\n") diff --git a/eutils/xml.go b/eutils/xml.go index aed00f3..6a4be7c 100644 --- a/eutils/xml.go +++ b/eutils/xml.go @@ -38,10 +38,14 @@ import ( "os" ) -// XMLBlock is a string with a leading left angle bracket and trailing right angle bracket +// XMLBlock is a string that begins with a left angle bracket and is trimmed back to +// end with a right angle bracket. The excluded characters are saved and prepended +// to the next buffer. Providing complete object tags simplifies subsequent parsing. 
type XMLBlock string -// CreateXMLStreamer reads XML input file into a channel of trimmed blocks +// CreateXMLStreamer reads XML input into a channel of trimmed strings that are +// then split by PartitionPattern into individual records (which can be processed +// concurrently), or parsed directly into a channel of tokens by CreateTokenizer. func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { if in == nil { @@ -54,15 +58,17 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { os.Exit(1) } - // xmlReader sends XML blocks through channel + // xmlReader sends trimmed XML blocks through the output channel. xmlReader := func(in io.Reader, out chan<- XMLBlock) { // close channel when all blocks have been processed defer close(out) - // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin - // increasing size of buffer when input is from a file does not improve program performance - // additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read + // 65536 appears to be the maximum number of characters presented to io.Reader + // when input is piped from stdin. Increasing the buffer size when input is from + // a file does not improve program performance. An additional 16384 bytes are + // reserved for copying the previous remainder to the beginning of the buffer + // before the next read. const XMLBUFSIZE = 65536 + 16384 buffer := make([]byte, XMLBUFSIZE) @@ -71,10 +77,15 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { delta := 0 isClosed := false + // htmlBehind is used in strict mode to trim back further when a lower-case tag + // is encountered. This may be a formatting decoration, such as <i> or </i> for + // italics. Processing HTML, which may have embedded mixed content, requires use + // of mixed mode. 
htmlBehind := func(bufr []byte, pos, txtlen int) bool { for pos >= 0 { if bufr[pos] == '<' { + // detect lower-case markup tags, or DispFormula in PubMed return HTMLAhead(string(bufr), pos, txtlen) != 0 } pos-- @@ -83,7 +94,10 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return false } - // read one buffer, trim at last > and retain remainder for next call, signal if no > character + // nextBuffer reads one buffer, trims back to the right-most > character, and + // retains the remainder for prepending in the next call. It also signals if + // there was no > character, resulting in subsequent calls to nextBuffer to + // continue reading a large content string. nextBuffer := func() ([]byte, bool, bool) { if isClosed { @@ -94,30 +108,34 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { m := copy(buffer, remainder) remainder = "" if m > 16384 { - // previous remainder is larger than reserved section, write and signal need to continue reading + // previous remainder is larger than reserved section, + // write and signal the need to continue reading. return buffer[:m], true, false } // read next block, append behind copied remainder from previous read n, err := in.Read(buffer[m:]) - // with data piped through stdin, read function may not always return the same number of bytes each time + // with data piped through stdin, read function may not always return the + // same number of bytes each time if err != nil { if err != io.EOF { - // real error + // real error. fmt.Fprintf(os.Stderr, "\nERROR: %s\n", err.Error()) - // Ignore bytes - non-conforming implementations of io.Reader may returned mangled data on non-EOF errors + // ignore bytes - non-conforming implementations of io.Reader may + // return mangled data on non-EOF errors isClosed = true return nil, false, true } - // end of file + // end of file. 
isClosed = true if n == 0 { - // if EOF and no more data, do not send final remainder (not terminated by right angle bracket that is used as a sentinel) + // if EOF and no more data, do not send final remainder (not terminated + // by right angle bracket that is used as a sentinel) return nil, false, true } } if n < 0 { - // Reality check - non-conforming implementations of io.Reader may return -1 + // reality check - non-conforming implementations of io.Reader may return -1 fmt.Fprintf(os.Stderr, "\nERROR: io.Reader returned negative count %d\n", n) // treat as n == 0 in order to update file offset and avoid losing previous remainder n = 0 @@ -130,13 +148,14 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { // slice of actual characters read bufr := buffer[:n+m] - // look for last > character - // safe to back up on UTF-8 rune array when looking for 7-bit ASCII character + // Look for last > character. It is safe to back up on UTF-8 rune array when looking + // for a 7-bit ASCII character. pos := -1 for pos = len(bufr) - 1; pos >= 0; pos-- { if bufr[pos] == '>' { if doStrict { - // optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags, and MathML + // optionally skip backwards past embedded i, b, u, sub, and sup + // HTML open, close, and empty tags, and MathML instructions if htmlBehind(bufr, pos, len(bufr)) { continue } @@ -157,8 +176,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return bufr[:], true, false } - // nextBlock reads buffer, concatenates if necessary to place long element content into a single string - // all result strings end in > character that is used as a sentinel in subsequent code + // nextBlock reads buffer, concatenates if necessary to place long element content + // into a single string. All result strings end in > character that is used as a + // sentinel in subsequent code. 
nextBlock := func() string { // read next buffer @@ -169,8 +189,8 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return "" } - // if buffer does not end with > character if cont { + // current line does not end with > character var buff bytes.Buffer // keep reading long content blocks @@ -202,7 +222,7 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { // trimming spaces here would throw off line tracking - // optionally compress/cleanup tags/attributes and contents (undocumented) + // optionally compress/cleanup tags/attributes and contents if doCleanup { if HasBadSpace(str) { str = CleanupBadSpaces(str) @@ -227,7 +247,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return out } -// XMLRecord structure wraps a numbered XML record sent down a channel +// XMLRecord wraps a numbered XML record or the results of data extraction on +// that record. The Index field stores the record's original position in the +// input stream. The Data field is used for binary compressed PubmedArticle XML. type XMLRecord struct { Index int Ident string @@ -235,30 +257,9 @@ type XMLRecord struct { Data []byte } -type xmlRecordHeap []XMLRecord - -// methods that satisfy heap.Interface -func (h xmlRecordHeap) Len() int { - return len(h) -} -func (h xmlRecordHeap) Less(i, j int) bool { - return h[i].Index < h[j].Index -} -func (h xmlRecordHeap) Swap(i, j int) { - h[i], h[j] = h[j], h[i] -} -func (h *xmlRecordHeap) Push(x interface{}) { - *h = append(*h, x.(XMLRecord)) -} -func (h *xmlRecordHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -// CreateXMLProducer partitions an XML set and sends it down a channel +// CreateXMLProducer partitions an XML set and sends records down a channel. +// After processing asynchronously in multiple concurrent go routines, the +// original order can be restored by passage through the XMLUnshuffler. 
func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { if rdr == nil { @@ -271,7 +272,7 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { os.Exit(1) } - // xmlProducer sends partitioned XML strings through channel + // xmlProducer sends partitioned XML strings through channel. xmlProducer := func(pat, star string, rdr <-chan XMLBlock, out chan<- XMLRecord) { // close channel when all records have been processed @@ -293,7 +294,32 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { return out } -// CreateXMLUnshuffler uses heap to restore output of multiple consumers to original record order +// xmlRecordHeap collects asynchronous processing results for presentation in the original order. +type xmlRecordHeap []XMLRecord + +// methods that satisfy heap.Interface +func (h xmlRecordHeap) Len() int { + return len(h) +} +func (h xmlRecordHeap) Less(i, j int) bool { + return h[i].Index < h[j].Index +} +func (h xmlRecordHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} +func (h *xmlRecordHeap) Push(x interface{}) { + *h = append(*h, x.(XMLRecord)) +} +func (h *xmlRecordHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// CreateXMLUnshuffler passes the output of multiple concurrent processors to +// a heap, which releases results in the same order as the original records. func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { if inp == nil { @@ -306,7 +332,7 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { os.Exit(1) } - // xmlUnshuffler restores original order with heap + // xmlUnshuffler restores original order with heap. 
xmlUnshuffler := func(inp <-chan XMLRecord, out chan<- XMLRecord) { // close channel when all records have been processed @@ -326,7 +352,8 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { // push result onto heap heap.Push(hp, ext) - // read several values before checking to see if next record to print has been processed + // Read several values before checking to see if next record to print has been processed. + // The default heapSize value has been tuned by experiment for maximum performance. if delay < heapSize { delay++ continue @@ -356,11 +383,11 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { next++ } - // keep checking heap to see if next result is already available + // continue to check heap to see if next result is already available } } - // send remainder of heap to output + // flush remainder of heap to output for hp.Len() > 0 { curr := heap.Pop(hp).(XMLRecord) diff --git a/exclude-uid-lists b/exclude-uid-lists index 3d21831..0c2792b 100755 --- a/exclude-uid-lists +++ b/exclude-uid-lists @@ -1,5 +1,8 @@ #!/bin/bash -norc +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + # Usage: exclude-uid-lists FILE1 FILE2 comm -23 <(sort -f "$1") <(sort -f "$2") | sort -n diff --git a/expand-current b/expand-current index 1d00fc2..2a7e1fa 100755 --- a/expand-current +++ b/expand-current @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + while [ $# -gt 0 ] do case "$1" in diff --git a/fetch-pubmed b/fetch-pubmed index 4984cb0..c4346d2 100755 --- a/fetch-pubmed +++ b/fetch-pubmed @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + doall=false 
dofresh=false flag="none" diff --git a/filter-stop-words b/filter-stop-words index 51a3fa2..82ed462 100755 --- a/filter-stop-words +++ b/filter-stop-words @@ -1,5 +1,8 @@ #!/bin/bash -norc +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + stop_words="#a#about#above#abs#accordingly#across#after#afterwards#again#\ against#all#almost#alone#along#already#also#although#always#am#among#\ amongst#an#analyze#and#another#any#anyhow#anyone#anything#anywhere#\ diff --git a/filter-table b/filter-table new file mode 100755 index 0000000..8be2240 --- /dev/null +++ b/filter-table @@ -0,0 +1,8 @@ +#!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + +# MUST be called with single quotes, e.g.: +# filter-table '10 <= $2 && $2 <= 30' +awk -F '\t' -v 'OFS=\t' "( $* ) {print}" @@ -1,3 +1,6 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + transmute -g2x "$@" diff --git a/hlp-xtract.txt b/hlp-xtract.txt index a259e9f..df9944d 100644 --- a/hlp-xtract.txt +++ b/hlp-xtract.txt @@ -119,6 +119,14 @@ Citation Lookup efilter -days 365 | efetch -format abstract +Stopwords and Stemming + + pm=$( efetch -db pubmed -id 2005826 -format xml ) + echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle + DOI Extraction esearch -db pubmed -query "Rowley JD [AUTH]" | @@ -347,7 +355,8 @@ Genome Range -min ChrStart,ChrStop -element "&NAME" "&DESC" | sort -k 1,1n | cut -f 2- | grep -v 
pseudogene | grep -v uncharacterized | - between-two-genes ASMT IL3RA + between-two-genes ASMT IL3RA | + align-columns -g 4 IL3RA interleukin 3 receptor subunit alpha SLC25A6 solute carrier family 25 member 6 @@ -361,7 +370,7 @@ Genome Range Centromere Position nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 | - grep acen | cut -f 1,2,6,7 | grep "^X\t" + grep acen | cut -f 1,2,6,7 | grep "^X" X p 58100001 61000000 X q 61000001 63800000 diff --git a/index-extras b/index-extras index adbe5cb..5359443 100755 --- a/index-extras +++ b/index-extras @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + while [ $# -gt 0 ] do case "$1" in diff --git a/index-pubmed b/index-pubmed index ce6e7cf..003b33d 100755 --- a/index-pubmed +++ b/index-pubmed @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + startat=0 while [ $# -gt 0 ] diff --git a/intersect-uid-lists b/intersect-uid-lists index 23cc2f0..61281b5 100755 --- a/intersect-uid-lists +++ b/intersect-uid-lists @@ -1,5 +1,8 @@ #!/bin/bash -norc +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + # Usage: intersect-uid-lists FILE1 FILE2 comm -12 <(sort -f "$1") <(sort -f "$2") | sort -n diff --git a/join-into-groups-of b/join-into-groups-of index 22bb6c4..7832a13 100755 --- a/join-into-groups-of +++ b/join-into-groups-of @@ -1,3 +1,7 @@ #!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + xargs -n "$@" echo | sed 's/ /,/g' @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect 
scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for archive files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for indexed files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for inverted files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for merged files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path to archive files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for postings files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path for archive files" @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + if [ "$#" -eq 0 ] then echo "Must supply path to archive files" diff --git a/print-columns 
b/print-columns new file mode 100755 index 0000000..42aeb6f --- /dev/null +++ b/print-columns @@ -0,0 +1,8 @@ +#!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + +# MUST be called with single quotes, e.g.: +# print-columns '$1, $2+1, $3, $4-1, $5' +awk -F '\t' -v 'OFS=\t' "{print $*}" diff --git a/reorder-columns b/reorder-columns index eaa0b9f..f8fc2dd 100755 --- a/reorder-columns +++ b/reorder-columns @@ -1,12 +1,15 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + # Usage: reorder-columns COLUMN NUMBERS... cmd="" com="$" for col in "$@" do - cmd=`echo "$cmd$com$col"` + cmd=$( echo "$cmd$com$col" ) com=", $" done awk -F '\t' -v 'OFS=\t' "{print $cmd}" diff --git a/run-ncbi-converter b/run-ncbi-converter index c8a0f20..222e8c7 100755 --- a/run-ncbi-converter +++ b/run-ncbi-converter @@ -1,4 +1,8 @@ #!/usr/bin/env perl + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + use warnings; use strict; diff --git a/setup-deps.pl b/setup-deps.pl index f2ac7d2..081a406 100755 --- a/setup-deps.pl +++ b/setup-deps.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + use warnings; use strict; use CPAN::MyConfig; @@ -1,5 +1,8 @@ #!/bin/bash -norc +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + PERL=perl case "`uname -s`" in Darwin ) diff --git a/skip-if-file-exists b/skip-if-file-exists index 0cb450e..5759fb8 100755 --- a/skip-if-file-exists +++ 
b/skip-if-file-exists @@ -1,4 +1,8 @@ #!/bin/bash -norc + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + while IFS=$'\t' read fl do if [ ! -f "$fl" ] diff --git a/sort-table b/sort-table new file mode 100755 index 0000000..b632ef4 --- /dev/null +++ b/sort-table @@ -0,0 +1,6 @@ +#!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + +sort -t "$(printf '\t')" "$@" diff --git a/sort-uniq-count b/sort-uniq-count index 1fcdd8e..69fc193 100755 --- a/sort-uniq-count +++ b/sort-uniq-count @@ -1,13 +1,18 @@ #!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + flags="f" if [ -n "$*" ] then - flags=`echo " $*" | sed 's/[^bfinrs]//g'` + flags=$( echo " $*" | sed 's/[^bfinrs]//g' ) if [ -z "$flags" ] then flags="s" fi fi +grep '.' | sort "-$flags" | uniq -i -c | awk '{ n=$1; sub(/[ \t]*[0-9]+[ \t]/, ""); print n "\t" $0 }' diff --git a/sort-uniq-count-rank b/sort-uniq-count-rank index 7aa67c5..b363daf 100755 --- a/sort-uniq-count-rank +++ b/sort-uniq-count-rank @@ -1,13 +1,18 @@ #!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + flags="f" if [ -n "$*" ] then - flags=`echo " $*" | sed 's/[^bfinrs]//g'` + flags=$( echo " $*" | sed 's/[^bfinrs]//g' ) if [ -z "$flags" ] then flags="s" fi fi +grep '.' 
| sort "-$flags" | uniq -i -c | awk '{ n=$1; sub(/[ \t]*[0-9]+[ \t]/, ""); print n "\t" $0 }' | diff --git a/stream-pubmed b/stream-pubmed index 0419ee3..b3d9e60 100755 --- a/stream-pubmed +++ b/stream-pubmed @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + flag="none" while [ $# -gt 0 ] diff --git a/test-edirect b/test-edirect index 3b1c5fa..0af9687 100755 --- a/test-edirect +++ b/test-edirect @@ -198,6 +198,14 @@ PrintTimeAndTitle "Citation Lookup" efilter -days 365 | efetch -format abstract +PrintTimeAndTitle "Stopwords and Stemming" + + pm=$( efetch -db pubmed -id 2005826 -format xml ) + echo "$pm" | xtract -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stops -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stems -pattern PubmedArticle -sep " " -words ArticleTitle + echo "$pm" | xtract -stops -stems -pattern PubmedArticle -sep " " -words ArticleTitle + PrintTimeAndTitle "DOI Extraction" esearch -db pubmed -query "Rowley JD [AUTH]" | @@ -313,12 +321,13 @@ PrintTimeAndTitle "Genome Range" -min ChrStart,ChrStop -element "&NAME" "&DESC" | sort -k 1,1n | cut -f 2- | grep -v pseudogene | grep -v uncharacterized | - between-two-genes ASMT IL3RA + between-two-genes ASMT IL3RA | + align-columns -g 4 PrintTimeAndTitle "Centromere Position" nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 | - grep acen | cut -f 1,2,6,7 | grep "^X\t" + grep acen | cut -f 1,2,6,7 | grep "^X" PrintTimeAndTitle "Gene Regions" @@ -442,6 +451,15 @@ PrintTimeAndTitle "Structural Similarity" -if PdbClass -equals Hydrolase \ -element PdbAcc PdbDescr +PrintTimeAndTitle "Underscore Protection" + + esearch -db biosample -query "package metagenome or environmental version 1 0 [PROP]" | + xtract -pattern ENTREZ_DIRECT -element Count + + esearch -db assembly -query "algae [ORGN] AND 
complete genome [FILT]" | + efilter -query "refseq has annotation [PROP] NOT anomalous [FILT]" | + xtract -pattern ENTREZ_DIRECT -element Count + PrintTimeAndTitle "Amino Acid Substitutions" esearch -db gene -query "OPN1MW [PREF] AND human [ORGN]" | diff --git a/test-pubmed-index b/test-pubmed-index index 554cc04..26675c6 100755 --- a/test-pubmed-index +++ b/test-pubmed-index @@ -1,5 +1,8 @@ #!/bin/bash +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + seconds_start=$(date "+%s") for i in {1..100} do diff --git a/theme-aliases b/theme-aliases index a903480..f35ca0f 100755 --- a/theme-aliases +++ b/theme-aliases @@ -1,5 +1,8 @@ #!/bin/bash +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + alias ChemCodeToName='phrase-search -convert chem Code Name' alias ChemCodeToTerm='phrase-search -convert chem Code Term' alias ChemCodeToTree='phrase-search -convert chem Code Tree' diff --git a/tst-elink.txt b/tst-elink.txt index aeff820..498dd81 100644 --- a/tst-elink.txt +++ b/tst-elink.txt @@ -1,3 +1,4 @@ +assembly nuccore 9513491 cdd pubmed 274590 gds pubmed 1336 gds taxonomy 1336 diff --git a/word-at-a-time b/word-at-a-time index 53ed73c..b72a2b1 100755 --- a/word-at-a-time +++ b/word-at-a-time @@ -1,4 +1,8 @@ #!/bin/bash -norc + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + sed 's/[^a-zA-Z0-9]/ /g; s/^ *//' | tr 'A-Z' 'a-z' | fmt -w 1 @@ -1,5 +1,8 @@ #!/bin/sh +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + xtract -pattern INSDSeq -pfx ">Feature " \ -first INSDSeqid,INSDSeq_accession-version \ -group INSDFeature -FKEY 
INSDFeature_key \ @@ -1,4 +1,8 @@ #!/bin/sh + +# Public domain notice for all NCBI EDirect scripts is located at: +# https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE + # For Mac, please obtain command-line-enabled Plot2x from http://apps.micw.org/apps/plot2/downloads.php # For Unix or PC/Cygwin, please obtain gnuplot from http://gnuplot.sourceforge.net/download.html plot2x= |