diff options
Diffstat (limited to 'eutils/xml.go')
-rw-r--r-- | eutils/xml.go | 131 |
1 files changed, 79 insertions, 52 deletions
diff --git a/eutils/xml.go b/eutils/xml.go index aed00f3..6a4be7c 100644 --- a/eutils/xml.go +++ b/eutils/xml.go @@ -38,10 +38,14 @@ import ( "os" ) -// XMLBlock is a string with a leading left angle bracket and trailing right angle bracket +// XMLBlock is a string that begins with a left angle bracket and is trimmed back to +// end with a right angle bracket. The excluded characters are saved and prepended +// to the next buffer. Providing complete object tags simplifies subsequent parsing. type XMLBlock string -// CreateXMLStreamer reads XML input file into a channel of trimmed blocks +// CreateXMLStreamer reads XML input into a channel of trimmed strings that are +// then split by PartitionPattern into individual records (which can be processed +// concurrently), or parsed directly into a channel of tokens by CreateTokenizer. func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { if in == nil { @@ -54,15 +58,17 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { os.Exit(1) } - // xmlReader sends XML blocks through channel + // xmlReader sends trimmed XML blocks through the output channel. xmlReader := func(in io.Reader, out chan<- XMLBlock) { // close channel when all blocks have been processed defer close(out) - // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin - // increasing size of buffer when input is from a file does not improve program performance - // additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read + // 65536 appears to be the maximum number of characters presented to io.Reader + // when input is piped from stdin. Increasing the buffer size when input is from + // a file does not improve program performance. An additional 16384 bytes are + // reserved for copying the previous remainder to the beginning of the buffer + // before the next read. const XMLBUFSIZE = 65536 + 16384 buffer := make([]byte, XMLBUFSIZE) @@ -71,10 +77,15 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { delta := 0 isClosed := false + // htmlBehind is used in strict mode to trim back further when a lower-case tag + // is encountered. This may be a formatting decoration, such as <i> or </i> for + // italics. Processing HTML, which may have embedded mixed content, requires use + // of mixed mode. htmlBehind := func(bufr []byte, pos, txtlen int) bool { for pos >= 0 { if bufr[pos] == '<' { + // detect lower-case markup tags, or DispFormula in PubMed return HTMLAhead(string(bufr), pos, txtlen) != 0 } pos-- @@ -83,7 +94,10 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return false } - // read one buffer, trim at last > and retain remainder for next call, signal if no > character + // nextBuffer reads one buffer, trims back to the right-most > character, and + // retains the remainder for prepending in the next call. It also signals if + // there was no > character, resulting in subsequent calls to nextBuffer to + // continue reading a large content string. nextBuffer := func() ([]byte, bool, bool) { if isClosed { @@ -94,30 +108,34 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { m := copy(buffer, remainder) remainder = "" if m > 16384 { - // previous remainder is larger than reserved section, write and signal need to continue reading + // previous remainder is larger than reserved section, + // write and signal the need to continue reading. return buffer[:m], true, false } // read next block, append behind copied remainder from previous read n, err := in.Read(buffer[m:]) - // with data piped through stdin, read function may not always return the same number of bytes each time + // with data piped through stdin, read function may not always return the + // same number of bytes each time if err != nil { if err != io.EOF { - // real error + // real error. fmt.Fprintf(os.Stderr, "\nERROR: %s\n", err.Error()) - // Ignore bytes - non-conforming implementations of io.Reader may returned mangled data on non-EOF errors + // ignore bytes - non-conforming implementations of io.Reader may + // return mangled data on non-EOF errors isClosed = true return nil, false, true } - // end of file + // end of file. isClosed = true if n == 0 { - // if EOF and no more data, do not send final remainder (not terminated by right angle bracket that is used as a sentinel) + // if EOF and no more data, do not send final remainder (not terminated + // by right angle bracket that is used as a sentinel) return nil, false, true } } if n < 0 { - // Reality check - non-conforming implementations of io.Reader may return -1 + // reality check - non-conforming implementations of io.Reader may return -1 fmt.Fprintf(os.Stderr, "\nERROR: io.Reader returned negative count %d\n", n) // treat as n == 0 in order to update file offset and avoid losing previous remainder n = 0 @@ -130,13 +148,14 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { // slice of actual characters read bufr := buffer[:n+m] - // look for last > character - // safe to back up on UTF-8 rune array when looking for 7-bit ASCII character + // Look for last > character. It is safe to back up on UTF-8 rune array when looking + // for a 7-bit ASCII character. pos := -1 for pos = len(bufr) - 1; pos >= 0; pos-- { if bufr[pos] == '>' { if doStrict { - // optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags, and MathML + // optionally skip backwards past embedded i, b, u, sub, and sup + // HTML open, close, and empty tags, and MathML instructions if htmlBehind(bufr, pos, len(bufr)) { continue } @@ -157,8 +176,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return bufr[:], true, false } - // nextBlock reads buffer, concatenates if necessary to place long element content into a single string - // all result strings end in > character that is used as a sentinel in subsequent code + // nextBlock reads buffer, concatenates if necessary to place long element content + // into a single string. All result strings end in > character that is used as a + // sentinel in subsequent code. nextBlock := func() string { // read next buffer @@ -169,8 +189,8 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return "" } - // if buffer does not end with > character if cont { + // current line does not end with > character var buff bytes.Buffer // keep reading long content blocks @@ -202,7 +222,7 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { // trimming spaces here would throw off line tracking - // optionally compress/cleanup tags/attributes and contents (undocumented) + // optionally compress/cleanup tags/attributes and contents if doCleanup { if HasBadSpace(str) { str = CleanupBadSpaces(str) @@ -227,7 +247,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock { return out } -// XMLRecord structure wraps a numbered XML record sent down a channel +// XMLRecord wraps a numbered XML record or the results of data extraction on +// that record. The Index field stores the record's original position in the +// input stream. The Data field is used for binary compressed PubmedArticle XML. type XMLRecord struct { Index int Ident string @@ -235,30 +257,9 @@ type XMLRecord struct { Data []byte } -type xmlRecordHeap []XMLRecord - -// methods that satisfy heap.Interface -func (h xmlRecordHeap) Len() int { - return len(h) -} -func (h xmlRecordHeap) Less(i, j int) bool { - return h[i].Index < h[j].Index -} -func (h xmlRecordHeap) Swap(i, j int) { - h[i], h[j] = h[j], h[i] -} -func (h *xmlRecordHeap) Push(x interface{}) { - *h = append(*h, x.(XMLRecord)) -} -func (h *xmlRecordHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -// CreateXMLProducer partitions an XML set and sends it down a channel +// CreateXMLProducer partitions an XML set and sends records down a channel. +// After processing asynchronously in multiple concurrent go routines, the +// original order can be restored by passage through the XMLUnshuffler. func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { if rdr == nil { @@ -271,7 +272,7 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { os.Exit(1) } - // xmlProducer sends partitioned XML strings through channel + // xmlProducer sends partitioned XML strings through channel. xmlProducer := func(pat, star string, rdr <-chan XMLBlock, out chan<- XMLRecord) { // close channel when all records have been processed @@ -293,7 +294,32 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord { return out } -// CreateXMLUnshuffler uses heap to restore output of multiple consumers to original record order +// xmlRecordHeap collects asynchronous processing results for presentation in the original order. +type xmlRecordHeap []XMLRecord + +// methods that satisfy heap.Interface +func (h xmlRecordHeap) Len() int { + return len(h) +} +func (h xmlRecordHeap) Less(i, j int) bool { + return h[i].Index < h[j].Index +} +func (h xmlRecordHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} +func (h *xmlRecordHeap) Push(x interface{}) { + *h = append(*h, x.(XMLRecord)) +} +func (h *xmlRecordHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// CreateXMLUnshuffler passes the output of multiple concurrent processors to +// a heap, which releases results in the same order as the original records. func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { if inp == nil { @@ -306,7 +332,7 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { os.Exit(1) } - // xmlUnshuffler restores original order with heap + // xmlUnshuffler restores original order with heap. xmlUnshuffler := func(inp <-chan XMLRecord, out chan<- XMLRecord) { // close channel when all records have been processed @@ -326,7 +352,8 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { // push result onto heap heap.Push(hp, ext) - // read several values before checking to see if next record to print has been processed + // Read several values before checking to see if next record to print has been processed. + // The default heapSize value has been tuned by experiment for maximum performance. if delay < heapSize { delay++ continue @@ -356,11 +383,11 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord { next++ } - // keep checking heap to see if next result is already available + // continue to check heap to see if next result is already available } } - // send remainder of heap to output + // flush remainder of heap to output for hp.Len() > 0 { curr := heap.Pop(hp).(XMLRecord) |