diff options
Diffstat (limited to 'xtract.go')
-rw-r--r-- | xtract.go | 379 |
1 files changed, 263 insertions, 116 deletions
@@ -93,8 +93,8 @@ Overview Processing Flags - -mixed Allow PubMed mixed content -strict Remove HTML highlight tags + -mixed Allow PubMed mixed content -accent Delete Unicode accents -ascii Unicode to numeric character references @@ -327,17 +327,15 @@ Examples ` const xtractExtras = ` +Processing Flags + + -flags [strict|mixed|none] + Local Record Indexing -stash Base path for individual XML files -index Name of element to use for identifier - -Processing Commands - - -prepare [release|report] Compare daily update to stash - -ignore Ignore contents of object in -prepare comparisons - -missing Print list of missing identifiers - -unique File of UIDs for skipping all but last version + -unique File of UIDs for removing intermediate records Sample File Download @@ -361,11 +359,11 @@ Human Subset Extraction PubMed Download download-pubmed baseline updatefiles - unpack-pubmed + unpack-pubmed mixed PubMed Archive Creation - stash-pubmed /Volumes/myssd/Pubmed + stash-pubmed mixed /Volumes/myssd/Pubmed PubMed Archive Retrieval @@ -374,6 +372,12 @@ PubMed Archive Retrieval ` const xtractAdvanced = ` +Processing Commands + + -prepare [release|report] Compare daily update to stash + -ignore Ignore contents of object in -prepare comparisons + -missing Print list of missing identifiers + Update Candidate Report gzcat medline*.xml.gz | xtract -strict -compress -format flush | @@ -452,14 +456,14 @@ Performance Tuning Script Processor Titration Results - 1 27748 207 - 2 51011 272 - 3 73487 700 - 4 93032 2559 - 5 92596 1549 - 6 89513 1570 - 7 84872 1145 - 8 83829 952 + 1 27622 31 + 2 51799 312 + 3 74853 593 + 4 95867 1337 + 5 97171 4019 + 6 93460 2458 + 7 87467 1030 + 8 82448 2651 Execution Profiling @@ -618,7 +622,7 @@ Gene Regions LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015 DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J. ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783 - VERSION NC_000076.6 GI:372099100 + VERSION NC_000076.6 ... FEATURES Location/Qualifiers source 1..2142 @@ -2276,7 +2280,7 @@ type Tables struct { DeGloss bool DoMixed bool DeAccent bool - DoAscii bool + DoASCII bool } type Node struct { @@ -2487,26 +2491,22 @@ func TrimPunctuation(str string) string { } } - if max > 0 { - if str[0] == '(' && !strings.Contains(str, ")") { - // trim isolated left parentheses - str = str[1:] - max-- - } + if max > 0 && str[0] == '(' && !strings.Contains(str, ")") { + // trim isolated left parentheses + str = str[1:] + max-- } - if max > 1 { - if str[max-1] == ')' && !strings.Contains(str, "(") { - // trim isolated right parentheses - str = str[:max-1] - // max-- - } + if max > 1 && str[max-1] == ')' && !strings.Contains(str, "(") { + // trim isolated right parentheses + str = str[:max-1] + // max-- } return str } -func HtmlAhead(text string, pos int) int { +func HTMLAhead(text string, pos int) int { max := len(text) - pos @@ -2570,7 +2570,7 @@ func HtmlAhead(text string, pos int) int { return 0 } -func HtmlBehind(bufr []byte, pos int) bool { +func HTMLBehind(bufr []byte, pos int) bool { if pos > 1 && bufr[pos-2] == '<' { ch := bufr[pos-1] @@ -2781,7 +2781,7 @@ var ( rpair *strings.Replacer ) -func DoHtmlReplace(str string) string { +func DoHTMLReplace(str string) string { // replacer/repairer not reentrant, protected by mutex rlock.Lock() @@ -2862,7 +2862,7 @@ func DoHtmlReplace(str string) string { return str } -func DoHtmlRepair(str string) string { +func DoHTMLRepair(str string) string { // replacer/repairer not reentrant, protected by mutex rlock.Lock() @@ -2923,7 +2923,7 @@ func DoHtmlRepair(str string) string { return str } -func DoTrimFlankingHtml(str string) string { +func DoTrimFlankingHTML(str string) string { badPrefix := [10]string{ "<i></i>", @@ -3050,7 +3050,7 @@ func DoAccentTransform(str string) string { return str } -func UnicodeToAscii(str string) string { +func UnicodeToASCII(str string) string { var buffer bytes.Buffer @@ -3874,16 +3874,16 @@ type XMLReader struct { Closed bool Docompress bool Docleanup bool - Leavehtml bool + LeaveHTML bool } -func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHtml bool) *XMLReader { +func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHTML bool) *XMLReader { if in == nil { return nil } - rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, Leavehtml: leaveHtml} + rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, LeaveHTML: leaveHTML} // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin // increasing size of buffer when input is from a file does not improve program performance @@ -3940,9 +3940,9 @@ func (rdr *XMLReader) NextBlock() string { pos := -1 for pos = len(bufr) - 1; pos >= 0; pos-- { if bufr[pos] == '>' { - if rdr.Leavehtml { + if rdr.LeaveHTML { // optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags - if HtmlBehind(bufr, pos) { + if HTMLBehind(bufr, pos) { continue } } @@ -4521,7 +4521,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special start := idx - if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) { + if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) { // at start of element idx++ @@ -4723,7 +4723,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special } if ch == '<' && !plainText { // optionally allow HTML text formatting elements and super/subscripts - advance := HtmlAhead(text, idx) + advance := HTMLAhead(text, idx) if advance > 0 { idx += advance ch = text[idx] @@ -4934,7 +4934,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special startLine := 0 // warn if HTML tags are not well-formed - unbalancedHtml := func(text string) bool { + unbalancedHTML := func(text string) bool { var arry []string @@ -5037,7 +5037,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line) } if tbls.DeGloss || tbls.DoMixed { - if unbalancedHtml(name) { + if unbalancedHTML(name) { fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line) } } @@ -5393,7 +5393,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special str = RemoveUnicodeMarkup(str) } if HasAngleBracket(str) { - str = DoHtmlReplace(str) + str = DoHTMLReplace(str) } } if tbls.DoMixed { @@ -5401,18 +5401,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special str = SimulateUnicodeMarkup(str) } if HasAngleBracket(str) { - str = DoHtmlRepair(str) + str = DoHTMLRepair(str) } - str = DoTrimFlankingHtml(str) + str = DoTrimFlankingHTML(str) } if tbls.DeAccent { if IsNotASCII(str) { str = DoAccentTransform(str) } } - if tbls.DoAscii { + if tbls.DoASCII { if IsNotASCII(str) { - str = UnicodeToAscii(str) + str = UnicodeToASCII(str) } } @@ -5530,9 +5530,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special attr = DoAccentTransform(attr) } } - if tbls.DoAscii { + if tbls.DoASCII { if IsNotASCII(attr) { - attr = UnicodeToAscii(attr) + attr = UnicodeToASCII(attr) } } @@ -5773,7 +5773,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special name = RemoveUnicodeMarkup(name) } if HasAngleBracket(name) { - name = DoHtmlReplace(name) + name = DoHTMLReplace(name) } } if tbls.DoMixed { @@ -5781,18 +5781,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special name = SimulateUnicodeMarkup(name) } if HasAngleBracket(name) { - name = DoHtmlRepair(name) + name = DoHTMLRepair(name) } - name = DoTrimFlankingHtml(name) + name = DoTrimFlankingHTML(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } - if tbls.DoAscii { + if tbls.DoASCII { if IsNotASCII(name) { - name = UnicodeToAscii(name) + name = UnicodeToASCII(name) } } if HasFlankingSpace(name) { @@ -7168,7 +7168,7 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st str = RemoveUnicodeMarkup(str) } if HasAngleBracket(str) { - str = DoHtmlReplace(str) + str = DoHTMLReplace(str) } // break terms at spaces, allowing hyphenated words @@ -7993,7 +7993,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act start := idx - if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) { + if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) { // at start of element idx++ @@ -8142,7 +8142,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act } if ch == '<' && !plainText { // optionally allow HTML text formatting elements and super/subscripts - advance := HtmlAhead(text, idx) + advance := HTMLAhead(text, idx) if advance > 0 { idx += advance ch = text[idx] @@ -8218,7 +8218,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act name = RemoveUnicodeMarkup(name) } if HasAngleBracket(name) { - name = DoHtmlReplace(name) + name = DoHTMLReplace(name) } } if tbls.DoMixed { @@ -8226,18 +8226,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act name = SimulateUnicodeMarkup(name) } if HasAngleBracket(name) { - name = DoHtmlReplace(name) + name = DoHTMLReplace(name) } - name = DoTrimFlankingHtml(name) + name = DoTrimFlankingHTML(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } - if tbls.DoAscii { + if tbls.DoASCII { if IsNotASCII(name) { - name = UnicodeToAscii(name) + name = UnicodeToASCII(name) } } node.Contents = name @@ -8546,7 +8546,7 @@ func (h *ExtractHeap) Pop() interface{} { // process with single goroutine calls defer close(out) so consumer(s) can range over channel // process with multiple instances calls defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out) -func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extract { +func CreateProducer(pat, star string, rdr *XMLReader, uidFile string, tbls *Tables) <-chan Extract { if rdr == nil || tbls == nil { return nil @@ -8558,15 +8558,72 @@ func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extra os.Exit(1) } + // create map that counts instances of each UID + order := make(map[string]int) + + checkIDs := false + + if uidFile != "" { + checkIDs = true + + // read file of identifiers to use for filtering + fl, err := os.Open(uidFile) + if err != nil { + fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", uidFile) + os.Exit(1) + } + + scanr := bufio.NewScanner(fl) + + // read lines of identifiers + for scanr.Scan() { + + id := scanr.Text() + + // map records count for given identifier + val := order[id] + val++ + order[id] = val + } + + fl.Close() + } + // xmlProducer sends partitioned XML strings through channel xmlProducer := func(pat, star string, rdr *XMLReader, out chan<- Extract) { // close channel when all records have been processed defer close(out) + parent := "" + if star == "*" { + parent = pat + } + // partition all input by pattern and send XML substring to available consumer through channel PartitionPattern(pat, star, rdr, func(rec int, ofs int64, str string) { + + if checkIDs { + id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX) + if id == "" { + return + } + + val, ok := order[id] + if !ok { + // not in identifier list, skip + return + } + // decrement count in map + val-- + order[id] = val + if val > 0 { + // only write last record with a given identifier + return + } + } + out <- Extract{rec, "", str} }) } @@ -9058,7 +9115,10 @@ func main() { deGloss := false doMixed := false deAccent := false - doAscii := false + doASCII := false + + // -flags sets -strict or -mixed cleanup flags from argument + flgs := "" // read data from file instead of stdin fileName := "" @@ -9076,7 +9136,7 @@ func main() { // element to use as local data index indx := "" - // file of index values for removing duplicates + // file of index values for removing duplicates (read or write, depending upon context) unqe := "" // phrase to find anywhere in XML @@ -9165,10 +9225,10 @@ func main() { fileName = args[1] // skip past first of two arguments args = args[1:] - // file with selected indexes for removing duplicates + // uid file for removing duplicates case "-unique": if len(args) < 2 { - fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file is missing\n") + fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file name is missing\n") os.Exit(1) } unqe = args[1] @@ -9217,7 +9277,15 @@ func main() { case "-accent", "-plain": deAccent = true case "-ascii": - doAscii = true + doASCII = true + case "-flags": + if len(args) < 2 { + fmt.Fprintf(os.Stderr, "\nERROR: Flags argument is missing\n") + os.Exit(1) + } + flgs = args[1] + // skip past first of two arguments + args = args[1:] // debugging flags case "-prepare": cmpr = true @@ -9270,6 +9338,20 @@ func main() { } } + // -flags allows script to set -strict or -mixed from argument + switch flgs { + case "strict": + deGloss = true + case "mixed": + doMixed = true + case "none", "default": + default: + if flgs != "" { + fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flags value '%s'\n", flgs) + os.Exit(1) + } + } + // reality checks on number of processors to use // performance degrades if capacity is above maximum number of partitions per second (context switching?) if numProcs == 0 { @@ -9442,7 +9524,7 @@ func main() { tbls.DeGloss = deGloss tbls.DoMixed = doMixed tbls.DeAccent = deAccent - tbls.DoAscii = doAscii + tbls.DoASCII = doASCII // FILE NAME CAN BE SUPPLIED WITH -input COMMAND @@ -9879,7 +9961,7 @@ func main() { // COMPARE XML UPDATES TO LOCAL DIRECTORY, RETAIN NEW OR SUBSTANTIVELY CHANGED RECORDS - // -prepare plus -stash plus -index plus -pattern compares XML files against stash (undocumented) + // -prepare plus -stash plus -index plus -pattern compares XML files against stash if stsh != "" && indx != "" && cmpr { doReport := false @@ -10022,10 +10104,10 @@ func main() { // SAVE XML COMPONENT RECORDS TO LOCAL DIRECTORY INDEXED BY TRIE ON IDENTIFIER - // -stash plus -index plus -pattern saves XML files in trie-based directory structure + // -stash plus -index [plus -unique] plus -pattern saves XML files in trie-based directory structure if stsh != "" && indx != "" { - xmlq := CreateProducer(topPattern, star, rdr, tbls) + xmlq := CreateProducer(topPattern, star, rdr, unqe, tbls) idnq := CreateExaminers(tbls, parent, xmlq) unsq := CreateUnshuffler(tbls, idnq) unqq := CreateUniquer(tbls, unsq) @@ -10050,40 +10132,83 @@ func main() { return } - // READ FILE OF IDENTIFIERS AND EXTRACT SELECTED RECORDS FROM XML INPUT FILE + // GENERATE UID LIST AND REMOVE LEADING SPACES FROM XML - // -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern with no other extraction arguments - // takes an XML input file and a file of its UIDs and keeps only the last version of each record - if indx != "" && unqe != "" && len(args) == 2 { + // -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern takes an XML input file and + // writes a trimmed version with leading spaces removed, also creating a file of its UIDs + if stsh == "" && indx != "" && unqe != "" { - // read file of identifiers to use for filtering - fl, err := os.Open(unqe) + fl, err := os.Create(unqe) if err != nil { - fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe) + fmt.Fprintf(os.Stderr, "\nERROR: Unable to open uid output file '%s'\n", unqe) os.Exit(1) } - // create map that counts instances of each UID - order := make(map[string]int) + if head != "" { + os.Stdout.WriteString(head) + os.Stdout.WriteString("\n") + } - scanr := bufio.NewScanner(fl) + // write output, efficiently skipping leading spaces on each line + writeFlush := func(text string) { - // read lines of identifiers - for scanr.Scan() { + if text == "" { + return + } - id := scanr.Text() + var buffer bytes.Buffer - // map records count for given identifier - val := order[id] - val++ - order[id] = val - } + max := len(text) + idx := 0 + inBlank := &tbls.InBlank - fl.Close() + for idx < max { - if head != "" { - os.Stdout.WriteString(head) - os.Stdout.WriteString("\n") + // skip past leading blanks and empty lines + for idx < max { + ch := text[idx] + if !inBlank[ch] { + break + } + idx++ + } + + start := idx + + // skip to next newline + for idx < max { + if text[idx] == '\n' { + break + } + idx++ + } + + str := text[start:idx] + + if str == "" { + continue + } + + // skip processing instruction + if strings.HasPrefix(str, "<?") && strings.HasSuffix(str, "?>") { + continue + } + + // trim spaces next to angle bracket + for strings.Contains(str, "> ") { + str = strings.Replace(str, "> ", ">", 1) + } + for strings.Contains(str, " <") { + str = strings.Replace(str, " <", "<", 1) + } + + buffer.WriteString(str[:]) + buffer.WriteString("\n") + } + + rsult := buffer.String() + + os.Stdout.WriteString(rsult) } PartitionPattern(topPattern, star, rdr, @@ -10095,27 +10220,43 @@ func main() { return } - val, ok := order[id] - if !ok { - // not in identifier list, skip - return - } - // decrement count in map - val-- - order[id] = val - if val > 0 { - // only write last record with a given identifier - return - } + fl.WriteString(id) + fl.WriteString("\n") if hd != "" { os.Stdout.WriteString(hd) os.Stdout.WriteString("\n") } - // write selected record - os.Stdout.WriteString(str[:]) - os.Stdout.WriteString("\n") + if tbls.DeGloss { + if HasMarkup(str) { + str = RemoveUnicodeMarkup(str) + } + if HasAngleBracket(str) { + str = DoHTMLReplace(str) + } + } + if tbls.DoMixed { + if HasMarkup(str) { + str = SimulateUnicodeMarkup(str) + } + if HasAngleBracket(str) { + str = DoHTMLRepair(str) + } + str = DoTrimFlankingHTML(str) + } + if tbls.DeAccent { + if IsNotASCII(str) { + str = DoAccentTransform(str) + } + } + if tbls.DoASCII { + if IsNotASCII(str) { + str = UnicodeToASCII(str) + } + } + + writeFlush(str[:]) if tl != "" { os.Stdout.WriteString(tl) @@ -10128,6 +10269,12 @@ func main() { os.Stdout.WriteString("\n") } + err = fl.Sync() + if err != nil { + fmt.Println(err.Error()) + } + fl.Close() + if timr { printDuration("records") } @@ -10333,7 +10480,7 @@ func main() { os.Exit(1) } - xmlq := CreateProducer(topPattern, star, rdr, tbls) + xmlq := CreateProducer(topPattern, star, rdr, "", tbls) tblq := CreateConsumers(cmds, tbls, parent, xmlq) if xmlq == nil || tblq == nil { @@ -10445,7 +10592,7 @@ func main() { // LAUNCH PRODUCER, CONSUMER, AND UNSHUFFLER SERVERS // launch producer goroutine to partition XML by pattern - xmlq := CreateProducer(topPattern, star, rdr, tbls) + xmlq := CreateProducer(topPattern, star, rdr, "", tbls) // launch consumer goroutines to parse and explore partitioned XML objects tblq := CreateConsumers(cmds, tbls, parent, xmlq) |