diff options
Diffstat (limited to 'rchive.go')
-rw-r--r-- | rchive.go | 195 |
1 files changed, 180 insertions, 15 deletions
@@ -62,7 +62,7 @@ import ( // RCHIVE VERSION AND HELP MESSAGE TEXT -const rchiveVersion = "10.3" +const rchiveVersion = "10.4" const rchiveHelp = ` Processing Flags @@ -175,8 +175,9 @@ Large-Scale Record Retrieval XML Data Transformation + seconds_start=$(date "+%s") esearch -db pubmed -query "PNAS [JOUR]" -pub abstract | - efetch -format uid | fetch-pubmed | + efetch -format uid | stream-pubmed | gunzip -c | xtract -stops -wrp Set,Rec -pattern PubmedArticle \ -wrp "Year" -year "PubDate/*" \ -wrp "Abst" -words Abstract/AbstractText | @@ -192,6 +193,9 @@ XML Data Transformation tee /dev/tty | xy-plot verbosity.png rm countsByYear.xml + seconds_end=$(date "+%s") + seconds=$((seconds_end - seconds_start)) + echo "$seconds seconds" Medical Subject Heading Code Viewer @@ -383,6 +387,7 @@ MeSH Codes A15 – hemic and immune systems A16 – embryonic structures A17 – integumentary system + B01 – animals B02 – algae B03 – bacteria @@ -391,6 +396,7 @@ MeSH Codes B06 – plants B07 – archaea B08 – mesomycetozoea + C01 – bacterial infections and mycoses C02 – virus diseases C03 – parasitic diseases @@ -414,6 +420,7 @@ MeSH Codes C21 – disorders of environmental origin C22 – animal diseases C23 – pathological conditions, signs and symptoms + D01 – inorganic chemicals D02 – organic chemicals D03 – heterocyclic compounds @@ -432,6 +439,7 @@ MeSH Codes D25 – biomedical and dental materials D26 – pharmaceutical preparations D27 – chemical actions and uses + E01 – diagnosis E02 – therapeutics E03 – anesthesia and analgesia @@ -439,10 +447,12 @@ MeSH Codes E05 – investigative techniques E06 – dentistry E07 – equipment and supplies + F01 – behavior and behavior mechanisms F02 – psychological phenomena and processes F03 – mental disorders F04 – behavioral disciplines and activities + G01 – biological sciences G02 – health occupations G03 – environment and public health @@ -457,24 +467,33 @@ MeSH Codes G12 – chemical and pharmacologic phenomena G13 – genetic phenomena G14 – genetic structures + H01 – natural sciences + I01 – social sciences I02 – education I03 – human activities + J01 – technology, industry, and agriculture J02 – food and beverages + K01 – humanities + L01 – information science + M01 – persons + N01 – population characteristics N02 – health care facilities, manpower, and services N03 – health care economics and organizations N04 – health services administration N05 – health care quality, access, and evaluation + V01 – publication components (publication type) V02 – publication formats (publication type) V03 – study characteristics (publication type) V04 – support of research + Z01 – geographic locations MeSH Subheadings @@ -1159,25 +1178,95 @@ func ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16 return data } +func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master { + + out := make(chan []Master, ChanDepth) + if out == nil { + fmt.Fprintf(os.Stderr, "\nERROR: Unable to create master index channel\n") + os.Exit(1) + } + + // masterIndexFuture asynchronously gets the master file and sends results through channel + masterIndexFuture := func(dpath, key, field string, out chan<- []Master) { + + data := ReadMasterIndex(dpath, key, field) + + out <- data + + close(out) + } + + // launch single future goroutine + go masterIndexFuture(dpath, key, field, out) + + return out +} + +func ReadTermListFuture(dpath, key, field string) <-chan []byte { + + out := make(chan []byte, ChanDepth) + if out == nil { + fmt.Fprintf(os.Stderr, "\nERROR: Unable to create term list channel\n") + os.Exit(1) + } + + // termListFuture asynchronously gets posting IDs and sends results through channel + termListFuture := func(dpath, key, field string, out chan<- []byte) { + + data := ReadTermList(dpath, key, field) + + out <- data + + close(out) + } + + // launch single future goroutine + go termListFuture(dpath, key, field, out) + + return out +} + func GetPostingIDs(prom, term, field string) []int32 { - var arry [516]rune + var ( + arry [516]rune + indx []Master + trms []byte + ) + dpath, key := PostingPath(prom, term, arry) if dpath == "" { return nil } - indx := ReadMasterIndex(dpath, key, field) + if UseFutures { + + // schedule asynchronous fetching + mi := ReadMasterIndexFuture(dpath, key, field) + + tl := ReadTermListFuture(dpath, key, field) + + // fetch master index and term list + indx = <-mi + + trms = <-tl + + } else { + + indx = ReadMasterIndex(dpath, key, field) + + trms = ReadTermList(dpath, key, field) + } + if indx == nil || len(indx) < 1 { return nil } - trms := ReadTermList(dpath, key, field) if trms == nil || len(trms) < 1 { return nil } - strs := make([]string, len(indx)-1) + strs := make([]string, len(indx)) if strs == nil || len(strs) < 1 { return nil } @@ -1287,23 +1376,45 @@ func GetPostingIDs(prom, term, field string) []int32 { func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) { - var arry [516]rune + var ( + arry [516]rune + indx []Master + trms []byte + ) + dpath, key := PostingPath(prom, term, arry) if dpath == "" { return nil, nil } - indx := ReadMasterIndex(dpath, key, field) + if UseFutures { + + // schedule asynchronous fetching + mi := ReadMasterIndexFuture(dpath, key, field) + + tl := ReadTermListFuture(dpath, key, field) + + // fetch master index and term list + indx = <-mi + + trms = <-tl + + } else { + + indx = ReadMasterIndex(dpath, key, field) + + trms = ReadTermList(dpath, key, field) + } + if indx == nil || len(indx) < 1 { return nil, nil } - trms := ReadTermList(dpath, key, field) if trms == nil || len(trms) < 1 { return nil, nil } - strs := make([]string, len(indx)-1) + strs := make([]string, len(indx)) if strs == nil || len(strs) < 1 { return nil, nil } @@ -1366,7 +1477,7 @@ func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) { // read relevant postings list section data := ReadPostingData(dpath, key, field, offset, size) - if data == nil { + if data == nil || len(data) < 1 { return nil, nil } @@ -1477,7 +1588,7 @@ func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) { } // make array of int16 arrays, populate for each UID - arrs := make([][]int16, ulen-1) + arrs := make([][]int16, ulen) if arrs == nil || len(arrs) < 1 { return nil, nil } @@ -1535,7 +1646,7 @@ func PrintTermCounts(base, term, field string) int { return 0 } - strs := make([]string, len(indx)-1) + strs := make([]string, len(indx)) if strs == nil { return 0 } @@ -3035,6 +3146,8 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int { checkTermCounts(item) } + runtime.Gosched() + return count } @@ -3551,7 +3664,7 @@ func CreateDispensers(nvrt string, inp <-chan Extract) <-chan []string { } // xmlDispenser prepares UID, term, and position strings for inversion - xmlDispenser := func(wg *sync.WaitGroup, inp <-chan Extract) { + xmlDispenser := func(wg *sync.WaitGroup, inp <-chan Extract, out chan<- []string) { defer wg.Done() @@ -3610,7 +3723,7 @@ func CreateDispensers(nvrt string, inp <-chan Extract) <-chan []string { // launch multiple dispenser goroutines for i := 0; i < NumServe; i++ { wg.Add(1) - go xmlDispenser(&wg, inp) + go xmlDispenser(&wg, inp, out) } // launch separate anonymous goroutine to wait until all dispensers are done @@ -5726,6 +5839,32 @@ func main() { // -merge combines inverted files, distributes by prefix if merg != "" && fild != "" { + // environment variable can override garbage collector (undocumented) + gcEnv := os.Getenv("EDIRECT_MERGE_GOGC") + if gcEnv != "" { + val, err := strconv.Atoi(gcEnv) + if err == nil { + if val >= 50 && val <= 1000 { + debug.SetGCPercent(val) + } else { + debug.SetGCPercent(100) + } + } + } + + // environment variable can override number of servers (undocumented) + svEnv := os.Getenv("EDIRECT_MERGE_SERV") + if svEnv != "" { + val, err := strconv.Atoi(svEnv) + if err == nil { + if val >= 1 && val <= 128 { + NumServe = val + } else { + NumServe = 1 + } + } + } + chns := CreatePresenters(args) mfld := CreateManifold(chns) mrgr := CreateMergers(fild, mfld) @@ -6282,6 +6421,32 @@ func main() { // -invert NORM reads IdxDocumentSet XML and creates an inverted index if nvrt != "" { + // environment variable can override garbage collector (undocumented) + gcEnv := os.Getenv("EDIRECT_INVERT_GOGC") + if gcEnv != "" { + val, err := strconv.Atoi(gcEnv) + if err == nil { + if val >= 50 && val <= 1000 { + debug.SetGCPercent(val) + } else { + debug.SetGCPercent(100) + } + } + } + + // environment variable can override number of servers (undocumented) + svEnv := os.Getenv("EDIRECT_INVERT_SERV") + if svEnv != "" { + val, err := strconv.Atoi(svEnv) + if err == nil { + if val >= 1 && val <= 128 { + NumServe = val + } else { + NumServe = 1 + } + } + } + colq := CreateProducer("IdxDocument", "", rdr) dspq := CreateDispensers(nvrt, colq) invq := CreateInverters(nvrt, dspq) |