diff options
-rw-r--r-- | common.go | 44 | ||||
-rwxr-xr-x | pm-index | 13 | ||||
-rw-r--r-- | rchive.go | 230 | ||||
-rw-r--r-- | xtract.go | 24 |
4 files changed, 248 insertions, 63 deletions
@@ -960,47 +960,6 @@ func CompressRunsOfSpaces(str string) string { return buffer.String() } -func HasAdjacentDoubleQuotes(str string) bool { - - doubleQuote := false - - for _, ch := range str { - if ch == '"' { - // " - if doubleQuote { - return true - } - doubleQuote = true - } else { - doubleQuote = false - } - } - - return false -} - -func CompressRunsOfDoubleQuote(str string) string { - - doubleQuote := false - var buffer strings.Builder - - for _, ch := range str { - if ch == '"' { - // " - if !doubleQuote { - buffer.WriteRune('"') - // " - } - doubleQuote = true - } else { - buffer.WriteRune(ch) - doubleQuote = false - } - } - - return buffer.String() -} - func NeedsTightening(str string) bool { if len(str) < 2 { @@ -2219,9 +2178,6 @@ func CleanupContents(str string, ascii, amper, mixed bool) string { if HasAdjacentSpacesOrNewline(str) { str = CompressRunsOfSpaces(str) } - if HasAdjacentDoubleQuotes(str) { - str = CompressRunsOfDoubleQuote(str) - } } if DoUnicode { if ascii && HasUnicodeMarkup(str) { @@ -19,9 +19,16 @@ do base=${fl%.xml.gz} echo "$base" SECONDS=0 - gunzip -c "$fl" | - xtract -e2index | - gzip -1 > "$target/$base.e2x.gz" + if [ -s "$dir/meshtree.txt" ] + then + gunzip -c "$fl" | + xtract -transform "$dir/meshtree.txt" -e2index | + gzip -1 > "$target/$base.e2x.gz" + else + gunzip -c "$fl" | + xtract -e2index | + gzip -1 > "$target/$base.e2x.gz" + fi echo "$SECONDS seconds" sleep 1 done @@ -291,6 +291,222 @@ Execution Profiling go tool pprof --pdf ./rchive ./cpu.pprof > ./callgraph.pdf ` +const meshCodes = ` +MeSH Categories + + A – Anatomy + B – Organisms + C – Diseases + D – Chemicals and Drugs + E – Analytical, Diagnostic and Therapeutic Techniques and Equipment + F – Psychiatry and Psychology + G – Biological Sciences + H – Physical Sciences + I – Anthropology, Education, Sociology and Social Phenomena + J – Technology and Food and Beverages + K – Humanities + L – Information Science + M – Persons + N – Health Care + V – Publication Characteristics + Z – Geographic Locations + +MeSH Codes + + A01 – body regions + A02 – musculoskeletal system + A03 – digestive system + A04 – respiratory system + A05 – urogenital system + A06 – endocrine system + A07 – cardiovascular system + A08 – nervous system + A09 – sense organs + A10 – tissues + A11 – cells + A12 – fluids and secretions + A13 – animal structures + A14 – stomatognathic system + A15 – hemic and immune systems + A16 – embryonic structures + A17 – integumentary system + B01 – animals + B02 – algae + B03 – bacteria + B04 – viruses + B05 – fungi + B06 – plants + B07 – archaea + B08 – mesomycetozoea + C01 – bacterial infections and mycoses + C02 – virus diseases + C03 – parasitic diseases + C04 – neoplasms + C05 – musculoskeletal diseases + C06 – digestive system diseases + C07 – stomatognathic diseases + C08 – respiratory tract diseases + C09 – otorhinolaryngologic diseases + C10 – nervous system diseases + C11 – eye diseases + C12 – urologic and male genital diseases + C13 – female genital diseases and pregnancy complications + C14 – cardiovascular diseases + C15 – hemic and lymphatic diseases + C16 – congenital, hereditary, and neonatal diseases and abnormalities + C17 – skin and connective tissue diseases + C18 – nutritional and metabolic diseases + C19 – endocrine system diseases + C20 – immune system diseases + C21 – disorders of environmental origin + C22 – animal diseases + C23 – pathological conditions, signs and symptoms + D01 – inorganic chemicals + D02 – organic chemicals + D03 – heterocyclic compounds + D04 – polycyclic compounds + D05 – macromolecular substances + D06 – hormones, hormone substitutes, and hormone antagonists + D07 – none (enzymes and coenzymes) + D08 – enzymes and coenzymes (carbohydrates) + D09 – carbohydrates (lipids) + D10 – lipids (amino acids, peptides, and proteins) + D11 – none (nucleic acids, nucleotides, and nucleosides) + D12 – amino acids, peptides, and proteins (complex mixtures) + D13 – nucleic acids, nucleotides, and nucleosides (biological factors) + D20 – complex mixtures + D23 – biological factors + D25 – biomedical and dental materials + D26 – pharmaceutical preparations + D27 – chemical actions and uses + E01 – diagnosis + E02 – therapeutics + E03 – anesthesia and analgesia + E04 – surgical procedures, operative + E05 – investigative techniques + E06 – dentistry + E07 – equipment and supplies + F01 – behavior and behavior mechanisms + F02 – psychological phenomena and processes + F03 – mental disorders + F04 – behavioral disciplines and activities + G01 – biological sciences + G02 – health occupations + G03 – environment and public health + G04 – biological phenomena, cell phenomena, and immunity + G05 – genetic processes + G06 – biochemical phenomena, metabolism, and nutrition + G07 – physiological processes + G08 – reproductive and urinary physiology + G09 – circulatory and respiratory physiology + G10 – digestive, oral, and skin physiology + G11 – musculoskeletal, neural, and ocular physiology + G12 – chemical and pharmacologic phenomena + G13 – genetic phenomena + G14 – genetic structures + H01 – natural sciences + I01 – social sciences + I02 – education + I03 – human activities + J01 – technology, industry, and agriculture + J02 – food and beverages + K01 – humanities + L01 – information science + M01 – persons + N01 – population characteristics + N02 – health care facilities, manpower, and services + N03 – health care economics and organizations + N04 – health services administration + N05 – health care quality, access, and evaluation + V01 – publication components (publication type) + V02 – publication formats (publication type) + V03 – study characteristics (publication type) + V04 – support of research + Z01 – geographic locations + +MeSH Subheadings + + abnormalities + administration & dosage + adverse effects + agonists + analogs & derivatives + analysis + anatomy & histology + antagonists & inhibitors + biosynthesis + blood + blood supply + cerebrospinal fluid + chemical synthesis + chemically induced + chemistry + classification + complications + congenital + cytology + deficiency + diagnosis + diagnostic imaging + diet therapy + drug effects + drug therapy + economics + education + embryology + enzymology + epidemiology + ethics + ethnology + etiology + genetics + growth & development + history + immunology + injuries + innervation + instrumentation + isolation & purification + legislation & jurisprudence + manpower + metabolism + methods + microbiology + mortality + nursing + organization & administration + parasitology + pathogenicity + pathology + pharmacokinetics + pharmacology + physiology + physiopathology + poisoning + prevention & control + psychology + radiation effects + radiotherapy + rehabilitation + secondary + secretion + standards + statistics & numerical data + supply & distribution + surgery + therapeutic use + therapy + toxicity + transmission + transplantation + trends + ultrastructure + urine + utilization + veterinary + virology +` + // DATA OBJECTS type Master struct { @@ -2922,9 +3138,9 @@ func CreateStashers(stash, parent, indx string, hash, zipp bool, inp <-chan Extr } res += "\n" - out <- res - runtime.Gosched() + + out <- res } } @@ -3037,9 +3253,9 @@ func CreateFetchers(stash string, zipp bool, inp <-chan Extract) <-chan Extract str := fetchRecord(ext.Text, buf) - out <- Extract{ext.Index, "", str, nil} - runtime.Gosched() + + out <- Extract{ext.Index, "", str, nil} } } @@ -3124,9 +3340,9 @@ func CreateStreamers(stash string, inp <-chan Extract) <-chan Extract { data := getRecord(ext.Text, buf) - out <- Extract{ext.Index, "", "", data} - runtime.Gosched() + + out <- Extract{ext.Index, "", "", data} } } @@ -5244,6 +5460,8 @@ func main() { fmt.Printf("rchive %s\n%s\n", rchiveVersion, rchiveExtras) case "-internal": fmt.Printf("rchive %s\n%s\n", rchiveVersion, rchiveInternal) + case "-mesh": + fmt.Printf("%s\n", meshCodes) default: // if not any of the documentation commands, keep going inSwitch = false @@ -5785,7 +5785,7 @@ func ProcessHydra(isPipe bool) []string { // ENTREZ2INDEX COMMAND GENERATOR // ProcessE2Index generates extraction commands to create input for Entrez2Index -func ProcessE2Index(args []string, isPipe bool) []string { +func ProcessE2Index(args []string, tform string, isPipe bool) []string { var acc []string @@ -5838,10 +5838,10 @@ func ProcessE2Index(args []string, isPipe bool) []string { for _, str := range args { acc = append(acc, str) } - /* + if tform != "" { acc = append(acc, "-clr", "-rst", "-tab", "\"\"") acc = append(acc, "-sep", ",", "-meshcode", "MeshHeading/DescriptorName@UI") - */ + } acc = append(acc, "-clr", "-lbl", " </IdxSearchFields>\\n") } else { acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"") @@ -5867,10 +5867,10 @@ func ProcessE2Index(args []string, isPipe bool) []string { ql = fmt.Sprintf("\"%s\"", str) acc = append(acc, ql) } - /* + if tform != "" { acc = append(acc, "-clr", "-rst", "-tab", "\"\"") acc = append(acc, "-sep", "\",\"", "-meshcode", "\"MeshHeading/DescriptorName@UI\"") - */ + } acc = append(acc, "-clr", "-lbl", "\" </IdxSearchFields>\\n\"") } @@ -8003,6 +8003,7 @@ func main() { // NAME OF OUTPUT STRING TRANSFORMATION FILE + tform := "" transform := make(map[string]string) populateTx := func(tf string) { @@ -8027,7 +8028,7 @@ func main() { } if len(args) > 2 && args[0] == "-transform" { - tform := args[1] + tform = args[1] args = args[2:] if tform != "" { populateTx(tform) @@ -8103,14 +8104,14 @@ func main() { // -e2index shortcut for experimental indexing code (documented in rchive.go) if args[0] == "-e2index" { - // e.g., xtract -transform meshtable.txt -e2index PubmedArticle MedlineCitation/PMID ArticleTitle,Abstract/AbstractText + // e.g., xtract -transform meshtree.txt -e2index /* - meshtable.txt was prepared by running: + meshtree.txt was prepared by running: cat desc2018.xml | xtract -pattern DescriptorRecord -element "DescriptorRecord/DescriptorUI" \ - -sep "," -element TreeNumber > meshtable.txt + -sep "," -element TreeNumber > meshtree.txt */ args = args[1:] @@ -8120,11 +8121,14 @@ func main() { args = []string{"PubmedArticle", "MedlineCitation/PMID", "ArticleTitle,Abstract/AbstractText"} } - res := ProcessE2Index(args, isPipe || usingFile) + res := ProcessE2Index(args, tform, isPipe || usingFile) if !isPipe && !usingFile { // no piped input, so write output instructions fmt.Printf("xtract") + if tform != "" { + fmt.Printf(" -transform %s", tform) + } for _, str := range res { fmt.Printf(" %s", str) } |