summaryrefslogtreecommitdiff
path: root/eutils/xml.go
diff options
context:
space:
mode:
Diffstat (limited to 'eutils/xml.go')
-rw-r--r--eutils/xml.go131
1 files changed, 79 insertions, 52 deletions
diff --git a/eutils/xml.go b/eutils/xml.go
index aed00f3..6a4be7c 100644
--- a/eutils/xml.go
+++ b/eutils/xml.go
@@ -38,10 +38,14 @@ import (
"os"
)
-// XMLBlock is a string with a leading left angle bracket and trailing right angle bracket
+// XMLBlock is a string that begins with a left angle bracket and is trimmed back to
+// end with a right angle bracket. The excluded characters are saved and prepended
+// to the next buffer. Providing complete object tags simplifies subsequent parsing.
type XMLBlock string
-// CreateXMLStreamer reads XML input file into a channel of trimmed blocks
+// CreateXMLStreamer reads XML input into a channel of trimmed strings that are
+// then split by PartitionPattern into individual records (which can be processed
+// concurrently), or parsed directly into a channel of tokens by CreateTokenizer.
func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
if in == nil {
@@ -54,15 +58,17 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
os.Exit(1)
}
- // xmlReader sends XML blocks through channel
+ // xmlReader sends trimmed XML blocks through the output channel.
xmlReader := func(in io.Reader, out chan<- XMLBlock) {
// close channel when all blocks have been processed
defer close(out)
- // 65536 appears to be the maximum number of characters presented to io.Reader when input is piped from stdin
- // increasing size of buffer when input is from a file does not improve program performance
- // additional 16384 bytes are reserved for copying previous remainder to start of buffer before next read
+ // 65536 appears to be the maximum number of characters presented to io.Reader
+ // when input is piped from stdin. Increasing the buffer size when input is from
+ // a file does not improve program performance. An additional 16384 bytes are
+ // reserved for copying the previous remainder to the beginning of the buffer
+ // before the next read.
const XMLBUFSIZE = 65536 + 16384
buffer := make([]byte, XMLBUFSIZE)
@@ -71,10 +77,15 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
delta := 0
isClosed := false
+ // htmlBehind is used in strict mode to trim back further when a lower-case tag
+ // is encountered. This may be a formatting decoration, such as <i> or </i> for
+ // italics. Processing HTML, which may have embedded mixed content, requires use
+ // of mixed mode.
htmlBehind := func(bufr []byte, pos, txtlen int) bool {
for pos >= 0 {
if bufr[pos] == '<' {
+ // detect lower-case markup tags, or DispFormula in PubMed
return HTMLAhead(string(bufr), pos, txtlen) != 0
}
pos--
@@ -83,7 +94,10 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return false
}
- // read one buffer, trim at last > and retain remainder for next call, signal if no > character
+ // nextBuffer reads one buffer, trims back to the right-most > character, and
+ // retains the remainder for prepending in the next call. It also signals if
+ // there was no > character, so that the caller keeps calling nextBuffer to
+ // continue reading a large content string.
nextBuffer := func() ([]byte, bool, bool) {
if isClosed {
@@ -94,30 +108,34 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
m := copy(buffer, remainder)
remainder = ""
if m > 16384 {
- // previous remainder is larger than reserved section, write and signal need to continue reading
+ // previous remainder is larger than reserved section,
+ // write and signal the need to continue reading.
return buffer[:m], true, false
}
// read next block, append behind copied remainder from previous read
n, err := in.Read(buffer[m:])
- // with data piped through stdin, read function may not always return the same number of bytes each time
+ // with data piped through stdin, read function may not always return the
+ // same number of bytes each time
if err != nil {
if err != io.EOF {
- // real error
+ // real error.
fmt.Fprintf(os.Stderr, "\nERROR: %s\n", err.Error())
- // Ignore bytes - non-conforming implementations of io.Reader may returned mangled data on non-EOF errors
+ // ignore bytes - non-conforming implementations of io.Reader may
+ // return mangled data on non-EOF errors
isClosed = true
return nil, false, true
}
- // end of file
+ // end of file.
isClosed = true
if n == 0 {
- // if EOF and no more data, do not send final remainder (not terminated by right angle bracket that is used as a sentinel)
+ // if EOF and no more data, do not send final remainder (not terminated
+ // by right angle bracket that is used as a sentinel)
return nil, false, true
}
}
if n < 0 {
- // Reality check - non-conforming implementations of io.Reader may return -1
+ // reality check - non-conforming implementations of io.Reader may return -1
fmt.Fprintf(os.Stderr, "\nERROR: io.Reader returned negative count %d\n", n)
// treat as n == 0 in order to update file offset and avoid losing previous remainder
n = 0
@@ -130,13 +148,14 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
// slice of actual characters read
bufr := buffer[:n+m]
- // look for last > character
- // safe to back up on UTF-8 rune array when looking for 7-bit ASCII character
+ // Look for last > character. It is safe to back up through UTF-8 encoded bytes when
+ // looking for a 7-bit ASCII character.
pos := -1
for pos = len(bufr) - 1; pos >= 0; pos-- {
if bufr[pos] == '>' {
if doStrict {
- // optionally skip backwards past embedded i, b, u, sub, and sup HTML open, close, and empty tags, and MathML
+ // optionally skip backwards past embedded i, b, u, sub, and sup
+ // HTML open, close, and empty tags, and MathML instructions
if htmlBehind(bufr, pos, len(bufr)) {
continue
}
@@ -157,8 +176,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return bufr[:], true, false
}
- // nextBlock reads buffer, concatenates if necessary to place long element content into a single string
- // all result strings end in > character that is used as a sentinel in subsequent code
+ // nextBlock reads buffer, concatenates if necessary to place long element content
+ // into a single string. All result strings end in > character that is used as a
+ // sentinel in subsequent code.
nextBlock := func() string {
// read next buffer
@@ -169,8 +189,8 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return ""
}
- // if buffer does not end with > character
if cont {
+ // current buffer does not end with > character
var buff bytes.Buffer
// keep reading long content blocks
@@ -202,7 +222,7 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
// trimming spaces here would throw off line tracking
- // optionally compress/cleanup tags/attributes and contents (undocumented)
+ // optionally compress/cleanup tags/attributes and contents
if doCleanup {
if HasBadSpace(str) {
str = CleanupBadSpaces(str)
@@ -227,7 +247,9 @@ func CreateXMLStreamer(in io.Reader) <-chan XMLBlock {
return out
}
-// XMLRecord structure wraps a numbered XML record sent down a channel
+// XMLRecord wraps a numbered XML record or the results of data extraction on
+// that record. The Index field stores the record's original position in the
+// input stream. The Data field is used for binary compressed PubmedArticle XML.
type XMLRecord struct {
Index int
Ident string
@@ -235,30 +257,9 @@ type XMLRecord struct {
Data []byte
}
-type xmlRecordHeap []XMLRecord
-
-// methods that satisfy heap.Interface
-func (h xmlRecordHeap) Len() int {
- return len(h)
-}
-func (h xmlRecordHeap) Less(i, j int) bool {
- return h[i].Index < h[j].Index
-}
-func (h xmlRecordHeap) Swap(i, j int) {
- h[i], h[j] = h[j], h[i]
-}
-func (h *xmlRecordHeap) Push(x interface{}) {
- *h = append(*h, x.(XMLRecord))
-}
-func (h *xmlRecordHeap) Pop() interface{} {
- old := *h
- n := len(old)
- x := old[n-1]
- *h = old[0 : n-1]
- return x
-}
-
-// CreateXMLProducer partitions an XML set and sends it down a channel
+// CreateXMLProducer partitions an XML set and sends records down a channel.
+// After processing asynchronously in multiple concurrent goroutines, the
+// original order can be restored by passage through the XMLUnshuffler.
func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
if rdr == nil {
@@ -271,7 +272,7 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
os.Exit(1)
}
- // xmlProducer sends partitioned XML strings through channel
+ // xmlProducer sends partitioned XML strings through channel.
xmlProducer := func(pat, star string, rdr <-chan XMLBlock, out chan<- XMLRecord) {
// close channel when all records have been processed
@@ -293,7 +294,32 @@ func CreateXMLProducer(pat, star string, rdr <-chan XMLBlock) <-chan XMLRecord {
return out
}
-// CreateXMLUnshuffler uses heap to restore output of multiple consumers to original record order
+// xmlRecordHeap collects asynchronous processing results in a min-heap keyed on Index, for presentation in the original order.
+type xmlRecordHeap []XMLRecord
+
+// Len, Less, Swap, Push, and Pop satisfy heap.Interface; Less compares Index so the lowest-numbered record is popped first.
+func (h xmlRecordHeap) Len() int {
+ return len(h)
+}
+func (h xmlRecordHeap) Less(i, j int) bool {
+ return h[i].Index < h[j].Index
+}
+func (h xmlRecordHeap) Swap(i, j int) {
+ h[i], h[j] = h[j], h[i]
+}
+func (h *xmlRecordHeap) Push(x interface{}) {
+ *h = append(*h, x.(XMLRecord)) // unchecked type assertion: panics if x is not an XMLRecord
+}
+func (h *xmlRecordHeap) Pop() interface{} {
+ old := *h
+ n := len(old)
+ x := old[n-1] // take the last element of the slice
+ *h = old[0 : n-1] // shrink the slice by one
+ return x
+}
+
+// CreateXMLUnshuffler passes the output of multiple concurrent processors to
+// a heap, which releases results in the same order as the original records.
func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
if inp == nil {
@@ -306,7 +332,7 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
os.Exit(1)
}
- // xmlUnshuffler restores original order with heap
+ // xmlUnshuffler restores original order with heap.
xmlUnshuffler := func(inp <-chan XMLRecord, out chan<- XMLRecord) {
// close channel when all records have been processed
@@ -326,7 +352,8 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
// push result onto heap
heap.Push(hp, ext)
- // read several values before checking to see if next record to print has been processed
+ // Read several values before checking to see if next record to print has been processed.
+ // The default heapSize value has been tuned by experiment for maximum performance.
if delay < heapSize {
delay++
continue
@@ -356,11 +383,11 @@ func CreateXMLUnshuffler(inp <-chan XMLRecord) <-chan XMLRecord {
next++
}
- // keep checking heap to see if next result is already available
+ // continue to check heap to see if next result is already available
}
}
- // send remainder of heap to output
+ // flush remainder of heap to output
for hp.Len() > 0 {
curr := heap.Pop(hp).(XMLRecord)