 xtract.go | 379 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 263 insertions(+), 116 deletions(-)
diff --git a/xtract.go b/xtract.go
index 46c760e..2d91bf7 100644
--- a/xtract.go
+++ b/xtract.go
@@ -93,8 +93,8 @@ Overview
Processing Flags
- -mixed Allow PubMed mixed content
-strict Remove HTML highlight tags
+ -mixed Allow PubMed mixed content
-accent Delete Unicode accents
-ascii Unicode to numeric character references
@@ -327,17 +327,15 @@ Examples
`
const xtractExtras = `
+Processing Flags
+
+ -flags [strict|mixed|none]
+
Local Record Indexing
-stash Base path for individual XML files
-index Name of element to use for identifier
-
-Processing Commands
-
- -prepare [release|report] Compare daily update to stash
- -ignore Ignore contents of object in -prepare comparisons
- -missing Print list of missing identifiers
- -unique File of UIDs for skipping all but last version
+ -unique File of UIDs for removing intermediate records
Sample File Download
@@ -361,11 +359,11 @@ Human Subset Extraction
PubMed Download
download-pubmed baseline updatefiles
- unpack-pubmed
+ unpack-pubmed mixed
PubMed Archive Creation
- stash-pubmed /Volumes/myssd/Pubmed
+ stash-pubmed mixed /Volumes/myssd/Pubmed
PubMed Archive Retrieval
@@ -374,6 +372,12 @@ PubMed Archive Retrieval
`
const xtractAdvanced = `
+Processing Commands
+
+ -prepare [release|report] Compare daily update to stash
+ -ignore Ignore contents of object in -prepare comparisons
+ -missing Print list of missing identifiers
+
Update Candidate Report
gzcat medline*.xml.gz | xtract -strict -compress -format flush |
@@ -452,14 +456,14 @@ Performance Tuning Script
Processor Titration Results
- 1 27748 207
- 2 51011 272
- 3 73487 700
- 4 93032 2559
- 5 92596 1549
- 6 89513 1570
- 7 84872 1145
- 8 83829 952
+ 1 27622 31
+ 2 51799 312
+ 3 74853 593
+ 4 95867 1337
+ 5 97171 4019
+ 6 93460 2458
+ 7 87467 1030
+ 8 82448 2651
Execution Profiling
@@ -618,7 +622,7 @@ Gene Regions
LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015
DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783
- VERSION NC_000076.6 GI:372099100
+ VERSION NC_000076.6
...
FEATURES Location/Qualifiers
source 1..2142
@@ -2276,7 +2280,7 @@ type Tables struct {
DeGloss bool
DoMixed bool
DeAccent bool
- DoAscii bool
+ DoASCII bool
}
type Node struct {
@@ -2487,26 +2491,22 @@ func TrimPunctuation(str string) string {
}
}
- if max > 0 {
- if str[0] == '(' && !strings.Contains(str, ")") {
- // trim isolated left parentheses
- str = str[1:]
- max--
- }
+ if max > 0 && str[0] == '(' && !strings.Contains(str, ")") {
+ // trim isolated left parentheses
+ str = str[1:]
+ max--
}
- if max > 1 {
- if str[max-1] == ')' && !strings.Contains(str, "(") {
- // trim isolated right parentheses
- str = str[:max-1]
- // max--
- }
+ if max > 1 && str[max-1] == ')' && !strings.Contains(str, "(") {
+ // trim isolated right parentheses
+ str = str[:max-1]
+ // max--
}
return str
}
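
Note: the refactor above folds the nested ifs into single combined conditions without changing behavior. A minimal standalone sketch of just the isolated-parenthesis trimming shown in this hunk (trimIsolatedParens is an illustrative name, not the full TrimPunctuation):

    package main

    import (
        "fmt"
        "strings"
    )

    // trimIsolatedParens mirrors the combined-condition form above: a leading '('
    // is dropped only when no ')' appears later, and a trailing ')' only when no
    // '(' appears earlier, so balanced parentheses are left alone.
    func trimIsolatedParens(str string) string {
        max := len(str)
        if max > 0 && str[0] == '(' && !strings.Contains(str, ")") {
            str = str[1:]
            max--
        }
        if max > 1 && str[max-1] == ')' && !strings.Contains(str, "(") {
            str = str[:max-1]
        }
        return str
    }

    func main() {
        fmt.Println(trimIsolatedParens("(orphan")) // orphan
        fmt.Println(trimIsolatedParens("(kept)"))  // (kept)
        fmt.Println(trimIsolatedParens("orphan)")) // orphan
    }
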
-func HtmlAhead(text string, pos int) int {
+func HTMLAhead(text string, pos int) int {
max := len(text) - pos
@@ -2570,7 +2570,7 @@ func HtmlAhead(text string, pos int) int {
return 0
}
-func HtmlBehind(bufr []byte, pos int) bool {
+func HTMLBehind(bufr []byte, pos int) bool {
if pos > 1 && bufr[pos-2] == '<' {
ch := bufr[pos-1]
@@ -2781,7 +2781,7 @@ var (
rpair *strings.Replacer
)
-func DoHtmlReplace(str string) string {
+func DoHTMLReplace(str string) string {
// replacer/repairer not reentrant, protected by mutex
rlock.Lock()
@@ -2862,7 +2862,7 @@ func DoHtmlReplace(str string) string {
return str
}
-func DoHtmlRepair(str string) string {
+func DoHTMLRepair(str string) string {
// replacer/repairer not reentrant, protected by mutex
rlock.Lock()
@@ -2923,7 +2923,7 @@ func DoHtmlRepair(str string) string {
return str
}
-func DoTrimFlankingHtml(str string) string {
+func DoTrimFlankingHTML(str string) string {
badPrefix := [10]string{
"<i></i>",
@@ -3050,7 +3050,7 @@ func DoAccentTransform(str string) string {
return str
}
-func UnicodeToAscii(str string) string {
+func UnicodeToASCII(str string) string {
var buffer bytes.Buffer
@@ -3874,16 +3874,16 @@ type XMLReader struct {
Closed bool
Docompress bool
Docleanup bool
- Leavehtml bool
+ LeaveHTML bool
}
-func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHtml bool) *XMLReader {
+func NewXMLReader(in io.Reader, doCompress, doCleanup, leaveHTML bool) *XMLReader {
if in == nil {
return nil
}
- rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, Leavehtml: leaveHtml}
+ rdr := &XMLReader{Reader: in, Docompress: doCompress, Docleanup: doCleanup, LeaveHTML: leaveHTML}
// 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin
// increasing size of buffer when input is from a file does not improve program performance
@@ -3940,9 +3940,9 @@ func (rdr *XMLReader) NextBlock() string {
pos := -1
for pos = len(bufr) - 1; pos >= 0; pos-- {
if bufr[pos] == '>' {
- if rdr.Leavehtml {
+ if rdr.LeaveHTML {
// optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags
- if HtmlBehind(bufr, pos) {
+ if HTMLBehind(bufr, pos) {
continue
}
}
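
Note: the Leavehtml to LeaveHTML rename above sits in the logic that refuses to end a buffer block at a '>' that merely closes an embedded formatting tag. A hedged sketch of that kind of backward check, for the i, b, u, sub, and sup tags named in the comment (isFormattingTagEnd is illustrative, not the actual HTMLBehind):

    package main

    import "fmt"

    // isFormattingTagEnd reports whether the '>' at pos ends one of the short
    // inline tags that mixed-content mode leaves embedded, so the block splitter
    // keeps scanning backwards for a real element boundary. Sketch only - the
    // real HTMLBehind test is more thorough.
    func isFormattingTagEnd(bufr []byte, pos int) bool {
        tags := []string{"<i", "<b", "<u", "<sub", "<sup",
            "</i", "</b", "</u", "</sub", "</sup"}
        for _, tag := range tags {
            start := pos - len(tag)
            if start >= 0 && string(bufr[start:pos]) == tag {
                return true
            }
        }
        return false
    }

    func main() {
        text := []byte("H<sub>2</sub>O</Title>")
        for pos, ch := range text {
            if ch == '>' {
                fmt.Printf("pos %d embedded: %v\n", pos, isFormattingTagEnd(text, pos))
            }
        }
    }
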
@@ -4521,7 +4521,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
start := idx
- if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) {
+ if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) {
// at start of element
idx++
@@ -4723,7 +4723,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
}
if ch == '<' && !plainText {
// optionally allow HTML text formatting elements and super/subscripts
- advance := HtmlAhead(text, idx)
+ advance := HTMLAhead(text, idx)
if advance > 0 {
idx += advance
ch = text[idx]
@@ -4934,7 +4934,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
startLine := 0
// warn if HTML tags are not well-formed
- unbalancedHtml := func(text string) bool {
+ unbalancedHTML := func(text string) bool {
var arry []string
@@ -5037,7 +5037,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
fmt.Fprintf(os.Stdout, "Contents not expected before </%s>, line %d\n", parent, line)
}
if tbls.DeGloss || tbls.DoMixed {
- if unbalancedHtml(name) {
+ if unbalancedHTML(name) {
fmt.Fprintf(os.Stdout, "Unbalanced mixed-content tags, line %d\n", line)
}
}
@@ -5393,7 +5393,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
str = RemoveUnicodeMarkup(str)
}
if HasAngleBracket(str) {
- str = DoHtmlReplace(str)
+ str = DoHTMLReplace(str)
}
}
if tbls.DoMixed {
@@ -5401,18 +5401,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
str = SimulateUnicodeMarkup(str)
}
if HasAngleBracket(str) {
- str = DoHtmlRepair(str)
+ str = DoHTMLRepair(str)
}
- str = DoTrimFlankingHtml(str)
+ str = DoTrimFlankingHTML(str)
}
if tbls.DeAccent {
if IsNotASCII(str) {
str = DoAccentTransform(str)
}
}
- if tbls.DoAscii {
+ if tbls.DoASCII {
if IsNotASCII(str) {
- str = UnicodeToAscii(str)
+ str = UnicodeToASCII(str)
}
}
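
Note: this same DeGloss / DoMixed / DeAccent / DoASCII cleanup chain recurs for element contents, attributes, and names throughout the diff. A hedged sketch of how the repeated block could be gathered into one helper, using the functions as renamed above (cleanupString is an illustrative name, and the sketch assumes it lives inside xtract.go next to the helpers it calls):

    // cleanupString applies the per-string cleanup chain in one place.
    func cleanupString(str string, tbls *Tables) string {
        if tbls.DeGloss {
            if HasMarkup(str) {
                str = RemoveUnicodeMarkup(str)
            }
            if HasAngleBracket(str) {
                str = DoHTMLReplace(str)
            }
        }
        if tbls.DoMixed {
            if HasMarkup(str) {
                str = SimulateUnicodeMarkup(str)
            }
            if HasAngleBracket(str) {
                str = DoHTMLRepair(str)
            }
            str = DoTrimFlankingHTML(str)
        }
        if tbls.DeAccent && IsNotASCII(str) {
            str = DoAccentTransform(str)
        }
        if tbls.DoASCII && IsNotASCII(str) {
            str = UnicodeToASCII(str)
        }
        return str
    }
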
@@ -5530,9 +5530,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
attr = DoAccentTransform(attr)
}
}
- if tbls.DoAscii {
+ if tbls.DoASCII {
if IsNotASCII(attr) {
- attr = UnicodeToAscii(attr)
+ attr = UnicodeToASCII(attr)
}
}
@@ -5773,7 +5773,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
name = RemoveUnicodeMarkup(name)
}
if HasAngleBracket(name) {
- name = DoHtmlReplace(name)
+ name = DoHTMLReplace(name)
}
}
if tbls.DoMixed {
@@ -5781,18 +5781,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
name = SimulateUnicodeMarkup(name)
}
if HasAngleBracket(name) {
- name = DoHtmlRepair(name)
+ name = DoHTMLRepair(name)
}
- name = DoTrimFlankingHtml(name)
+ name = DoTrimFlankingHTML(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
- if tbls.DoAscii {
+ if tbls.DoASCII {
if IsNotASCII(name) {
- name = UnicodeToAscii(name)
+ name = UnicodeToASCII(name)
}
}
if HasFlankingSpace(name) {
@@ -7168,7 +7168,7 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
str = RemoveUnicodeMarkup(str)
}
if HasAngleBracket(str) {
- str = DoHtmlReplace(str)
+ str = DoHTMLReplace(str)
}
// break terms at spaces, allowing hyphenated words
@@ -7993,7 +7993,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
start := idx
- if ch == '<' && (plainText || HtmlAhead(text, idx) == 0) {
+ if ch == '<' && (plainText || HTMLAhead(text, idx) == 0) {
// at start of element
idx++
@@ -8142,7 +8142,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
}
if ch == '<' && !plainText {
// optionally allow HTML text formatting elements and super/subscripts
- advance := HtmlAhead(text, idx)
+ advance := HTMLAhead(text, idx)
if advance > 0 {
idx += advance
ch = text[idx]
@@ -8218,7 +8218,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
name = RemoveUnicodeMarkup(name)
}
if HasAngleBracket(name) {
- name = DoHtmlReplace(name)
+ name = DoHTMLReplace(name)
}
}
if tbls.DoMixed {
@@ -8226,18 +8226,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
name = SimulateUnicodeMarkup(name)
}
if HasAngleBracket(name) {
- name = DoHtmlReplace(name)
+ name = DoHTMLReplace(name)
}
- name = DoTrimFlankingHtml(name)
+ name = DoTrimFlankingHTML(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
- if tbls.DoAscii {
+ if tbls.DoASCII {
if IsNotASCII(name) {
- name = UnicodeToAscii(name)
+ name = UnicodeToASCII(name)
}
}
node.Contents = name
@@ -8546,7 +8546,7 @@ func (h *ExtractHeap) Pop() interface{} {
// process with single goroutine calls defer close(out) so consumer(s) can range over channel
// process with multiple instances calls defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out)
-func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extract {
+func CreateProducer(pat, star string, rdr *XMLReader, uidFile string, tbls *Tables) <-chan Extract {
if rdr == nil || tbls == nil {
return nil
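
Note: the two comment lines above describe the usual Go fan-out close discipline - a lone producer closes its channel directly, while multiple workers each call wg.Done() and a separate goroutine waits before closing the shared output. A minimal self-contained sketch of that pattern (generic names, not the xtract types):

    package main

    import (
        "fmt"
        "sync"
    )

    func main() {
        src := make(chan int)
        out := make(chan int)

        // single producer: closing src lets the consumers' range loops terminate
        go func() {
            defer close(src)
            for i := 0; i < 5; i++ {
                src <- i
            }
        }()

        // multiple consumers: each calls wg.Done(), and a separate goroutine
        // waits for all of them before closing the shared output channel
        var wg sync.WaitGroup
        for w := 0; w < 3; w++ {
            wg.Add(1)
            go func() {
                defer wg.Done()
                for n := range src {
                    out <- n * n
                }
            }()
        }
        go func() {
            wg.Wait()
            close(out)
        }()

        for sq := range out {
            fmt.Println(sq)
        }
    }
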
@@ -8558,15 +8558,72 @@ func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extra
os.Exit(1)
}
+ // create map that counts instances of each UID
+ order := make(map[string]int)
+
+ checkIDs := false
+
+ if uidFile != "" {
+ checkIDs = true
+
+ // read file of identifiers to use for filtering
+ fl, err := os.Open(uidFile)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", uidFile)
+ os.Exit(1)
+ }
+
+ scanr := bufio.NewScanner(fl)
+
+ // read lines of identifiers
+ for scanr.Scan() {
+
+ id := scanr.Text()
+
+ // map records count for given identifier
+ val := order[id]
+ val++
+ order[id] = val
+ }
+
+ fl.Close()
+ }
+
// xmlProducer sends partitioned XML strings through channel
xmlProducer := func(pat, star string, rdr *XMLReader, out chan<- Extract) {
// close channel when all records have been processed
defer close(out)
+ parent := ""
+ if star == "*" {
+ parent = pat
+ }
+
// partition all input by pattern and send XML substring to available consumer through channel
PartitionPattern(pat, star, rdr,
func(rec int, ofs int64, str string) {
+
+ if checkIDs {
+ id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
+ if id == "" {
+ return
+ }
+
+ val, ok := order[id]
+ if !ok {
+ // not in identifier list, skip
+ return
+ }
+ // decrement count in map
+ val--
+ order[id] = val
+ if val > 0 {
+ // only write last record with a given identifier
+ return
+ }
+ }
+
out <- Extract{rec, "", str}
})
}
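
Note: the new uidFile branch keeps only the final occurrence of each identifier by preloading a count per UID and emitting a record only when its count reaches zero. A small standalone sketch of the same counting idea (sample data, not the xtract record flow):

    package main

    import "fmt"

    func main() {
        // counts of each identifier, as if read from the -unique file
        order := make(map[string]int)
        uids := []string{"101", "102", "101", "103", "101"}
        for _, id := range uids {
            order[id]++
        }

        // stream the records in order, decrement on each sighting, and
        // keep only the last record seen for a given identifier
        for rec, id := range uids {
            order[id]--
            if order[id] > 0 {
                continue
            }
            fmt.Printf("keeping record %d for UID %s\n", rec, id)
        }
    }
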
@@ -9058,7 +9115,10 @@ func main() {
deGloss := false
doMixed := false
deAccent := false
- doAscii := false
+ doASCII := false
+
+ // -flags sets -strict or -mixed cleanup flags from argument
+ flgs := ""
// read data from file instead of stdin
fileName := ""
@@ -9076,7 +9136,7 @@ func main() {
// element to use as local data index
indx := ""
- // file of index values for removing duplicates
+ // file of index values for removing duplicates (read or write, depending upon context)
unqe := ""
// phrase to find anywhere in XML
@@ -9165,10 +9225,10 @@ func main() {
fileName = args[1]
// skip past first of two arguments
args = args[1:]
- // file with selected indexes for removing duplicates
+ // uid file for removing duplicates
case "-unique":
if len(args) < 2 {
- fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file is missing\n")
+ fmt.Fprintf(os.Stderr, "\nERROR: Unique identifier file name is missing\n")
os.Exit(1)
}
unqe = args[1]
@@ -9217,7 +9277,15 @@ func main() {
case "-accent", "-plain":
deAccent = true
case "-ascii":
- doAscii = true
+ doASCII = true
+ case "-flags":
+ if len(args) < 2 {
+ fmt.Fprintf(os.Stderr, "\nERROR: Flags argument is missing\n")
+ os.Exit(1)
+ }
+ flgs = args[1]
+ // skip past first of two arguments
+ args = args[1:]
// debugging flags
case "-prepare":
cmpr = true
@@ -9270,6 +9338,20 @@ func main() {
}
}
+ // -flags allows script to set -strict or -mixed from argument
+ switch flgs {
+ case "strict":
+ deGloss = true
+ case "mixed":
+ doMixed = true
+ case "none", "default":
+ default:
+ if flgs != "" {
+ fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flags value '%s'\n", flgs)
+ os.Exit(1)
+ }
+ }
+
// reality checks on number of processors to use
// performance degrades if capacity is above maximum number of partitions per second (context switching?)
if numProcs == 0 {
@@ -9442,7 +9524,7 @@ func main() {
tbls.DeGloss = deGloss
tbls.DoMixed = doMixed
tbls.DeAccent = deAccent
- tbls.DoAscii = doAscii
+ tbls.DoASCII = doASCII
// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
@@ -9879,7 +9961,7 @@ func main() {
// COMPARE XML UPDATES TO LOCAL DIRECTORY, RETAIN NEW OR SUBSTANTIVELY CHANGED RECORDS
- // -prepare plus -stash plus -index plus -pattern compares XML files against stash (undocumented)
+ // -prepare plus -stash plus -index plus -pattern compares XML files against stash
if stsh != "" && indx != "" && cmpr {
doReport := false
@@ -10022,10 +10104,10 @@ func main() {
// SAVE XML COMPONENT RECORDS TO LOCAL DIRECTORY INDEXED BY TRIE ON IDENTIFIER
- // -stash plus -index plus -pattern saves XML files in trie-based directory structure
+ // -stash plus -index [plus -unique] plus -pattern saves XML files in trie-based directory structure
if stsh != "" && indx != "" {
- xmlq := CreateProducer(topPattern, star, rdr, tbls)
+ xmlq := CreateProducer(topPattern, star, rdr, unqe, tbls)
idnq := CreateExaminers(tbls, parent, xmlq)
unsq := CreateUnshuffler(tbls, idnq)
unqq := CreateUniquer(tbls, unsq)
@@ -10050,40 +10132,83 @@ func main() {
return
}
- // READ FILE OF IDENTIFIERS AND EXTRACT SELECTED RECORDS FROM XML INPUT FILE
+ // GENERATE UID LIST AND REMOVE LEADING SPACES FROM XML
- // -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern with no other extraction arguments
- // takes an XML input file and a file of its UIDs and keeps only the last version of each record
- if indx != "" && unqe != "" && len(args) == 2 {
+ // -index plus -unique [plus -head/-tail/-hd/-tl] plus -pattern takes an XML input file and
+ // writes a trimmed version with leading spaces removed, also creating a file of its UIDs
+ if stsh == "" && indx != "" && unqe != "" {
- // read file of identifiers to use for filtering
- fl, err := os.Open(unqe)
+ fl, err := os.Create(unqe)
if err != nil {
- fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe)
+ fmt.Fprintf(os.Stderr, "\nERROR: Unable to open uid output file '%s'\n", unqe)
os.Exit(1)
}
- // create map that counts instances of each UID
- order := make(map[string]int)
+ if head != "" {
+ os.Stdout.WriteString(head)
+ os.Stdout.WriteString("\n")
+ }
- scanr := bufio.NewScanner(fl)
+ // write output, efficiently skipping leading spaces on each line
+ writeFlush := func(text string) {
- // read lines of identifiers
- for scanr.Scan() {
+ if text == "" {
+ return
+ }
- id := scanr.Text()
+ var buffer bytes.Buffer
- // map records count for given identifier
- val := order[id]
- val++
- order[id] = val
- }
+ max := len(text)
+ idx := 0
+ inBlank := &tbls.InBlank
- fl.Close()
+ for idx < max {
- if head != "" {
- os.Stdout.WriteString(head)
- os.Stdout.WriteString("\n")
+ // skip past leading blanks and empty lines
+ for idx < max {
+ ch := text[idx]
+ if !inBlank[ch] {
+ break
+ }
+ idx++
+ }
+
+ start := idx
+
+ // skip to next newline
+ for idx < max {
+ if text[idx] == '\n' {
+ break
+ }
+ idx++
+ }
+
+ str := text[start:idx]
+
+ if str == "" {
+ continue
+ }
+
+ // skip processing instruction
+ if strings.HasPrefix(str, "<?") && strings.HasSuffix(str, "?>") {
+ continue
+ }
+
+ // trim spaces next to angle bracket
+ for strings.Contains(str, "> ") {
+ str = strings.Replace(str, "> ", ">", 1)
+ }
+ for strings.Contains(str, " <") {
+ str = strings.Replace(str, " <", "<", 1)
+ }
+
+ buffer.WriteString(str[:])
+ buffer.WriteString("\n")
+ }
+
+ rsult := buffer.String()
+
+ os.Stdout.WriteString(rsult)
}
PartitionPattern(topPattern, star, rdr,
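
Note: writeFlush above strips leading blanks on each line and closes up spaces next to angle brackets before writing. A quick sketch of the trimming effect on one illustrative line, with strings.TrimLeft standing in for the inBlank index scan and strings.Replace used the same way as above:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        line := "      <ArticleTitle> Sample title </ArticleTitle>  "
        str := strings.TrimLeft(line, " \t")
        // close up spaces adjacent to angle brackets, one at a time
        for strings.Contains(str, "> ") {
            str = strings.Replace(str, "> ", ">", 1)
        }
        for strings.Contains(str, " <") {
            str = strings.Replace(str, " <", "<", 1)
        }
        fmt.Println(str) // <ArticleTitle>Sample title</ArticleTitle>
    }
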
@@ -10095,27 +10220,43 @@ func main() {
return
}
- val, ok := order[id]
- if !ok {
- // not in identifier list, skip
- return
- }
- // decrement count in map
- val--
- order[id] = val
- if val > 0 {
- // only write last record with a given identifier
- return
- }
+ fl.WriteString(id)
+ fl.WriteString("\n")
if hd != "" {
os.Stdout.WriteString(hd)
os.Stdout.WriteString("\n")
}
- // write selected record
- os.Stdout.WriteString(str[:])
- os.Stdout.WriteString("\n")
+ if tbls.DeGloss {
+ if HasMarkup(str) {
+ str = RemoveUnicodeMarkup(str)
+ }
+ if HasAngleBracket(str) {
+ str = DoHTMLReplace(str)
+ }
+ }
+ if tbls.DoMixed {
+ if HasMarkup(str) {
+ str = SimulateUnicodeMarkup(str)
+ }
+ if HasAngleBracket(str) {
+ str = DoHTMLRepair(str)
+ }
+ str = DoTrimFlankingHTML(str)
+ }
+ if tbls.DeAccent {
+ if IsNotASCII(str) {
+ str = DoAccentTransform(str)
+ }
+ }
+ if tbls.DoASCII {
+ if IsNotASCII(str) {
+ str = UnicodeToASCII(str)
+ }
+ }
+
+ writeFlush(str[:])
if tl != "" {
os.Stdout.WriteString(tl)
@@ -10128,6 +10269,12 @@ func main() {
os.Stdout.WriteString("\n")
}
+ err = fl.Sync()
+ if err != nil {
+ fmt.Println(err.Error())
+ }
+ fl.Close()
+
if timr {
printDuration("records")
}
@@ -10333,7 +10480,7 @@ func main() {
os.Exit(1)
}
- xmlq := CreateProducer(topPattern, star, rdr, tbls)
+ xmlq := CreateProducer(topPattern, star, rdr, "", tbls)
tblq := CreateConsumers(cmds, tbls, parent, xmlq)
if xmlq == nil || tblq == nil {
@@ -10445,7 +10592,7 @@ func main() {
// LAUNCH PRODUCER, CONSUMER, AND UNSHUFFLER SERVERS
// launch producer goroutine to partition XML by pattern
- xmlq := CreateProducer(topPattern, star, rdr, tbls)
+ xmlq := CreateProducer(topPattern, star, rdr, "", tbls)
// launch consumer goroutines to parse and explore partitioned XML objects
tblq := CreateConsumers(cmds, tbls, parent, xmlq)