summary refs log tree commit diff
path: root/rchive.go
diff options
context:
space:
mode:
author Aaron M. Ucko <ucko@debian.org> 2018-11-18 21:39:43 -0500
committer Aaron M. Ucko <ucko@debian.org> 2018-11-18 21:39:43 -0500
commit 73b810c583735a0b2b89afeac101209767df681c (patch)
tree 630f40cf4b5e7f886f6fda2aa60a3bf3c50db65c /rchive.go
parent a04cc894215a8c30302e175fdf5bcf5417ebe61a (diff)
New upstream version 10.4.20181106+ds
Diffstat (limited to 'rchive.go')
-rw-r--r-- rchive.go 195
1 files changed, 180 insertions, 15 deletions
diff --git a/rchive.go b/rchive.go
index 7f5757d..69495d7 100644
--- a/rchive.go
+++ b/rchive.go
@@ -62,7 +62,7 @@ import (
// RCHIVE VERSION AND HELP MESSAGE TEXT
-const rchiveVersion = "10.3"
+const rchiveVersion = "10.4"
const rchiveHelp = `
Processing Flags
@@ -175,8 +175,9 @@ Large-Scale Record Retrieval
XML Data Transformation
+ seconds_start=$(date "+%s")
esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
- efetch -format uid | fetch-pubmed |
+ efetch -format uid | stream-pubmed | gunzip -c |
xtract -stops -wrp Set,Rec -pattern PubmedArticle \
-wrp "Year" -year "PubDate/*" \
-wrp "Abst" -words Abstract/AbstractText |
@@ -192,6 +193,9 @@ XML Data Transformation
tee /dev/tty |
xy-plot verbosity.png
rm countsByYear.xml
+ seconds_end=$(date "+%s")
+ seconds=$((seconds_end - seconds_start))
+ echo "$seconds seconds"
Medical Subject Heading Code Viewer
@@ -383,6 +387,7 @@ MeSH Codes
A15 – hemic and immune systems
A16 – embryonic structures
A17 – integumentary system
+
B01 – animals
B02 – algae
B03 – bacteria
@@ -391,6 +396,7 @@ MeSH Codes
B06 – plants
B07 – archaea
B08 – mesomycetozoea
+
C01 – bacterial infections and mycoses
C02 – virus diseases
C03 – parasitic diseases
@@ -414,6 +420,7 @@ MeSH Codes
C21 – disorders of environmental origin
C22 – animal diseases
C23 – pathological conditions, signs and symptoms
+
D01 – inorganic chemicals
D02 – organic chemicals
D03 – heterocyclic compounds
@@ -432,6 +439,7 @@ MeSH Codes
D25 – biomedical and dental materials
D26 – pharmaceutical preparations
D27 – chemical actions and uses
+
E01 – diagnosis
E02 – therapeutics
E03 – anesthesia and analgesia
@@ -439,10 +447,12 @@ MeSH Codes
E05 – investigative techniques
E06 – dentistry
E07 – equipment and supplies
+
F01 – behavior and behavior mechanisms
F02 – psychological phenomena and processes
F03 – mental disorders
F04 – behavioral disciplines and activities
+
G01 – biological sciences
G02 – health occupations
G03 – environment and public health
@@ -457,24 +467,33 @@ MeSH Codes
G12 – chemical and pharmacologic phenomena
G13 – genetic phenomena
G14 – genetic structures
+
H01 – natural sciences
+
I01 – social sciences
I02 – education
I03 – human activities
+
J01 – technology, industry, and agriculture
J02 – food and beverages
+
K01 – humanities
+
L01 – information science
+
M01 – persons
+
N01 – population characteristics
N02 – health care facilities, manpower, and services
N03 – health care economics and organizations
N04 – health services administration
N05 – health care quality, access, and evaluation
+
V01 – publication components (publication type)
V02 – publication formats (publication type)
V03 – study characteristics (publication type)
V04 – support of research
+
Z01 – geographic locations
MeSH Subheadings
@@ -1159,25 +1178,95 @@ func ReadOffsetData(dpath, key, field string, offset int32, size int32) []int16
return data
}
+func ReadMasterIndexFuture(dpath, key, field string) <-chan []Master {
+
+ out := make(chan []Master, ChanDepth)
+ if out == nil {
+ fmt.Fprintf(os.Stderr, "\nERROR: Unable to create master index channel\n")
+ os.Exit(1)
+ }
+
+ // masterIndexFuture asynchronously gets the master file and sends results through channel
+ masterIndexFuture := func(dpath, key, field string, out chan<- []Master) {
+
+ data := ReadMasterIndex(dpath, key, field)
+
+ out <- data
+
+ close(out)
+ }
+
+ // launch single future goroutine
+ go masterIndexFuture(dpath, key, field, out)
+
+ return out
+}
+
+func ReadTermListFuture(dpath, key, field string) <-chan []byte {
+
+ out := make(chan []byte, ChanDepth)
+ if out == nil {
+ fmt.Fprintf(os.Stderr, "\nERROR: Unable to create term list channel\n")
+ os.Exit(1)
+ }
+
+ // termListFuture asynchronously gets posting IDs and sends results through channel
+ termListFuture := func(dpath, key, field string, out chan<- []byte) {
+
+ data := ReadTermList(dpath, key, field)
+
+ out <- data
+
+ close(out)
+ }
+
+ // launch single future goroutine
+ go termListFuture(dpath, key, field, out)
+
+ return out
+}
+
func GetPostingIDs(prom, term, field string) []int32 {
- var arry [516]rune
+ var (
+ arry [516]rune
+ indx []Master
+ trms []byte
+ )
+
dpath, key := PostingPath(prom, term, arry)
if dpath == "" {
return nil
}
- indx := ReadMasterIndex(dpath, key, field)
+ if UseFutures {
+
+ // schedule asynchronous fetching
+ mi := ReadMasterIndexFuture(dpath, key, field)
+
+ tl := ReadTermListFuture(dpath, key, field)
+
+ // fetch master index and term list
+ indx = <-mi
+
+ trms = <-tl
+
+ } else {
+
+ indx = ReadMasterIndex(dpath, key, field)
+
+ trms = ReadTermList(dpath, key, field)
+ }
+
if indx == nil || len(indx) < 1 {
return nil
}
- trms := ReadTermList(dpath, key, field)
if trms == nil || len(trms) < 1 {
return nil
}
- strs := make([]string, len(indx)-1)
+ strs := make([]string, len(indx))
if strs == nil || len(strs) < 1 {
return nil
}
@@ -1287,23 +1376,45 @@ func GetPostingIDs(prom, term, field string) []int32 {
func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) {
- var arry [516]rune
+ var (
+ arry [516]rune
+ indx []Master
+ trms []byte
+ )
+
dpath, key := PostingPath(prom, term, arry)
if dpath == "" {
return nil, nil
}
- indx := ReadMasterIndex(dpath, key, field)
+ if UseFutures {
+
+ // schedule asynchronous fetching
+ mi := ReadMasterIndexFuture(dpath, key, field)
+
+ tl := ReadTermListFuture(dpath, key, field)
+
+ // fetch master index and term list
+ indx = <-mi
+
+ trms = <-tl
+
+ } else {
+
+ indx = ReadMasterIndex(dpath, key, field)
+
+ trms = ReadTermList(dpath, key, field)
+ }
+
if indx == nil || len(indx) < 1 {
return nil, nil
}
- trms := ReadTermList(dpath, key, field)
if trms == nil || len(trms) < 1 {
return nil, nil
}
- strs := make([]string, len(indx)-1)
+ strs := make([]string, len(indx))
if strs == nil || len(strs) < 1 {
return nil, nil
}
@@ -1366,7 +1477,7 @@ func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) {
// read relevant postings list section
data := ReadPostingData(dpath, key, field, offset, size)
- if data == nil {
+ if data == nil || len(data) < 1 {
return nil, nil
}
@@ -1477,7 +1588,7 @@ func GetPostingIDsEx(prom, term, field string) ([]int32, [][]int16) {
}
// make array of int16 arrays, populate for each UID
- arrs := make([][]int16, ulen-1)
+ arrs := make([][]int16, ulen)
if arrs == nil || len(arrs) < 1 {
return nil, nil
}
@@ -1535,7 +1646,7 @@ func PrintTermCounts(base, term, field string) int {
return 0
}
- strs := make([]string, len(indx)-1)
+ strs := make([]string, len(indx))
if strs == nil {
return 0
}
@@ -3035,6 +3146,8 @@ func ProcessCount(base, phrase string, plrl, psns, rlxd bool) int {
checkTermCounts(item)
}
+ runtime.Gosched()
+
return count
}
@@ -3551,7 +3664,7 @@ func CreateDispensers(nvrt string, inp <-chan Extract) <-chan []string {
}
// xmlDispenser prepares UID, term, and position strings for inversion
- xmlDispenser := func(wg *sync.WaitGroup, inp <-chan Extract) {
+ xmlDispenser := func(wg *sync.WaitGroup, inp <-chan Extract, out chan<- []string) {
defer wg.Done()
@@ -3610,7 +3723,7 @@ func CreateDispensers(nvrt string, inp <-chan Extract) <-chan []string {
// launch multiple dispenser goroutines
for i := 0; i < NumServe; i++ {
wg.Add(1)
- go xmlDispenser(&wg, inp)
+ go xmlDispenser(&wg, inp, out)
}
// launch separate anonymous goroutine to wait until all dispensers are done
@@ -5726,6 +5839,32 @@ func main() {
// -merge combines inverted files, distributes by prefix
if merg != "" && fild != "" {
+ // environment variable can override garbage collector (undocumented)
+ gcEnv := os.Getenv("EDIRECT_MERGE_GOGC")
+ if gcEnv != "" {
+ val, err := strconv.Atoi(gcEnv)
+ if err == nil {
+ if val >= 50 && val <= 1000 {
+ debug.SetGCPercent(val)
+ } else {
+ debug.SetGCPercent(100)
+ }
+ }
+ }
+
+ // environment variable can override number of servers (undocumented)
+ svEnv := os.Getenv("EDIRECT_MERGE_SERV")
+ if svEnv != "" {
+ val, err := strconv.Atoi(svEnv)
+ if err == nil {
+ if val >= 1 && val <= 128 {
+ NumServe = val
+ } else {
+ NumServe = 1
+ }
+ }
+ }
+
chns := CreatePresenters(args)
mfld := CreateManifold(chns)
mrgr := CreateMergers(fild, mfld)
@@ -6282,6 +6421,32 @@ func main() {
// -invert NORM reads IdxDocumentSet XML and creates an inverted index
if nvrt != "" {
+ // environment variable can override garbage collector (undocumented)
+ gcEnv := os.Getenv("EDIRECT_INVERT_GOGC")
+ if gcEnv != "" {
+ val, err := strconv.Atoi(gcEnv)
+ if err == nil {
+ if val >= 50 && val <= 1000 {
+ debug.SetGCPercent(val)
+ } else {
+ debug.SetGCPercent(100)
+ }
+ }
+ }
+
+ // environment variable can override number of servers (undocumented)
+ svEnv := os.Getenv("EDIRECT_INVERT_SERV")
+ if svEnv != "" {
+ val, err := strconv.Atoi(svEnv)
+ if err == nil {
+ if val >= 1 && val <= 128 {
+ NumServe = val
+ } else {
+ NumServe = 1
+ }
+ }
+ }
+
colq := CreateProducer("IdxDocument", "", rdr)
dspq := CreateDispensers(nvrt, colq)
invq := CreateInverters(nvrt, dspq)