1 files changed, 166 insertions, 6 deletions
diff --git a/xtract.go b/xtract.go
index 1d465d0..cc85043 100644
--- a/xtract.go
+++ b/xtract.go
@@ -78,7 +78,7 @@ import (
 
 // VERSION AND HELP MESSAGE TEXT
 
-const xtractVersion = "7.00"
+const xtractVersion = "7.10"
 
 const xtractHelp = `
 Overview
@@ -99,6 +99,7 @@ Processing Flags
   -strict          Remove HTML highlight tags
 
   -accent          Delete Unicode accents
+  -ascii           Unicode to numeric character references
 
 Data Source
 
@@ -1715,6 +1716,25 @@ Directory and File Navigation
   -t       Sort by most recently modified
 
  pwd       Prints working directory path
+ 
+File Redirection
+
+  <        Read stdin from file
+  >        Redirect stdout to file
+  >>       Append to file
+  2>       Redirect stderr
+  2>&1     Merge stderr into stdout
+  |        Pipe between programs
+  <(cmd)   Execute command, read results as file
+ 
+Shell Script Variables
+
+  $0       Name of script
+  $n       Nth argument
+  $#       Number of arguments
+  "$*"     Argument list as one argument
+  "$@"     Argument list as separate arguments
+  $?       Exit status of previous command
 `
 
 // TYPED CONSTANTS
@@ -1930,6 +1950,18 @@ var accentRunes = map[rune]rune{
 	'\u02BC': '\'',
 }
 
+var ligatureRunes = map[rune]string{ // ligature/digraph runes -> ASCII letter expansions
+	'\u00DF': "ss",  // LATIN SMALL LETTER SHARP S
+	'\u00E6': "ae",  // LATIN SMALL LETTER AE
+	'\uFB00': "ff",  // LATIN SMALL LIGATURE FF
+	'\uFB01': "fi",  // LATIN SMALL LIGATURE FI
+	'\uFB02': "fl",  // LATIN SMALL LIGATURE FL
+	'\uFB03': "ffi", // LATIN SMALL LIGATURE FFI
+	'\uFB04': "ffl", // LATIN SMALL LIGATURE FFL
+	'\uFB05': "st",  // LATIN SMALL LIGATURE LONG S T: a long s, not an f
+	'\uFB06': "st",  // LATIN SMALL LIGATURE ST
+}
+
 var argTypeIs = map[string]ArgumentType{
 	"-unit":        EXPLORATION,
 	"-Unit":        EXPLORATION,
@@ -2315,6 +2347,7 @@ type Tables struct {
 	DeGloss   bool
 	DoMixed   bool
 	DeAccent  bool
+	DoAscii   bool
 }
 
 type Node struct {
@@ -2961,6 +2994,56 @@ func DoHtmlRepair(str string) string {
 	return str
 }
 
+// DoTrimFlankingHtml strips empty or unbalanced inline markup from both ends
+// of str: empty <i>, <b>, <u>, <sup> and <sub> pairs on either side, stray
+// closing tags at the front and stray opening tags at the back. Passes are
+// repeated until one removes nothing; interior markup is left untouched.
+func DoTrimFlankingHtml(str string) string {
+
+	if !strings.Contains(str, "<") {
+		return str
+	}
+
+	leading := []string{
+		"<i></i>",
+		"<b></b>",
+		"<u></u>",
+		"<sup></sup>",
+		"<sub></sub>",
+		"</i>",
+		"</b>",
+		"</u>",
+		"</sup>",
+		"</sub>",
+	}
+
+	trailing := []string{
+		"<i></i>",
+		"<b></b>",
+		"<u></u>",
+		"<sup></sup>",
+		"<sub></sub>",
+		"<i>",
+		"<b>",
+		"<u>",
+		"<sup>",
+		"<sub>",
+	}
+
+	for {
+		before := len(str)
+		for _, tag := range leading {
+			str = strings.TrimPrefix(str, tag)
+		}
+		for _, tag := range trailing {
+			str = strings.TrimSuffix(str, tag)
+		}
+		// every tag is non-empty, so an unchanged length means a clean pass
+		if len(str) == before {
+			return str
+		}
+	}
+}
+
 func HasBadAccent(str string) bool {
 
 	for _, ch := range str {
@@ -2970,6 +3053,8 @@ func HasBadAccent(str string) bool {
 		// quick min-to-max check for additional characters to treat as accents
 		if ch >= '\u00D8' && ch <= '\u02BC' {
 			return true
+		} else if ch >= '\uFB00' && ch <= '\uFB06' {
+			return true
 		}
 	}
 
@@ -2985,7 +3070,20 @@ func FixBadAccent(str string) string {
 			if ch >= '\u00D8' && ch <= '\u02BC' {
 				rn, ok := accentRunes[ch]
 				if ok {
-					ch = rn
+					buffer.WriteRune(rn)
+					continue
+				}
+				st, ok := ligatureRunes[ch]
+				if ok {
+					buffer.WriteString(st)
+					continue
+				}
+			}
+			if ch >= '\uFB00' && ch <= '\uFB06' {
+				st, ok := ligatureRunes[ch]
+				if ok {
+					buffer.WriteString(st)
+					continue
 				}
 			}
 		}
@@ -3023,6 +3121,31 @@ func DoAccentTransform(str string) string {
 	return str
 }
 
+// UnicodeToAscii replaces every rune above U+007F in str with an uppercase
+// hexadecimal numeric character reference (U+00E9 becomes "&#xE9;") and
+// copies ASCII runes through unchanged.
+//
+// The digits are formatted from the rune value itself rather than sliced out
+// of a quoted escape, so runes beyond the Basic Multilingual Plane keep all
+// of their digits (U+1F600 becomes "&#x1F600;"). Invalid UTF-8 bytes are
+// decoded by range as U+FFFD and are therefore written as "&#xFFFD;".
+func UnicodeToAscii(str string) string {
+
+	var buffer bytes.Buffer
+
+	for _, ch := range str {
+		if ch <= 127 {
+			buffer.WriteRune(ch)
+			continue
+		}
+		buffer.WriteString("&#x")
+		buffer.WriteString(strings.ToUpper(strconv.FormatInt(int64(ch), 16)))
+		buffer.WriteRune(';')
+	}
+
+	return buffer.String()
+}
+
 // CREATE COMMON DRIVER TABLES
 
 // InitTables creates lookup tables to simplify the tokenizer
@@ -3117,7 +3240,7 @@ func DebugBlock(blk *Block, depth int) {
 // ParseArguments parses nested exploration instruction from command-line arguments
 func ParseArguments(args []string, pttrn string) *Block {
 
-	// different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line
+	// different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line
 	// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
 	var (
 		lcname = []string{
@@ -5381,12 +5504,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					if HasAngleBracket(str) {
 						str = DoHtmlRepair(str)
 					}
+					str = DoTrimFlankingHtml(str)
 				}
 				if tbls.DeAccent {
 					if IsNotASCII(str) {
 						str = DoAccentTransform(str)
 					}
 				}
+				if tbls.DoAscii {
+					if IsNotASCII(str) {
+						str = UnicodeToAscii(str)
+					}
+				}
 
 				os.Stdout.WriteString(str)
 			}
@@ -5497,6 +5626,16 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 			attr = strings.TrimSpace(attr)
 			attr = CompressRunsOfSpaces(attr)
+			if tbls.DeAccent {
+				if IsNotASCII(attr) {
+					attr = DoAccentTransform(attr)
+				}
+			}
+			if tbls.DoAscii {
+				if IsNotASCII(attr) {
+					attr = UnicodeToAscii(attr)
+				}
+			}
 
 			if wrapAttrs {
 
@@ -5745,12 +5884,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						if HasAngleBracket(name) {
 							name = DoHtmlRepair(name)
 						}
+						name = DoTrimFlankingHtml(name)
 					}
 					if tbls.DeAccent {
 						if IsNotASCII(name) {
 							name = DoAccentTransform(name)
 						}
 					}
+					if tbls.DoAscii {
+						if IsNotASCII(name) {
+							name = UnicodeToAscii(name)
+						}
+					}
 					if HasFlankingSpace(name) {
 						name = strings.TrimSpace(name)
 					}
@@ -5814,7 +5959,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 	processSplit := func() {
 
 		if len(args) > 1 {
-			if args[1] == "-pattern" || args[1] == "-Pattern" {
+			if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
 				// skip past -split if followed by -pattern
 				args = args[1:]
 			}
@@ -5836,7 +5981,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 	processDrain := func() {
 
 		if len(args) > 1 {
-			if args[1] == "-pattern" || args[1] == "-Pattern" {
+			if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
 				// skip past -drain if followed by -pattern
 				args = args[1:]
 			}
@@ -8285,12 +8430,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
 					if HasAngleBracket(name) {
 						name = DoHtmlReplace(name)
 					}
+					name = DoTrimFlankingHtml(name)
 				}
 				if tbls.DeAccent {
 					if IsNotASCII(name) {
 						name = DoAccentTransform(name)
 					}
 				}
+				if tbls.DoAscii {
+					if IsNotASCII(name) {
+						name = UnicodeToAscii(name)
+					}
+				}
 				node.Contents = name
 			case SELFTAG:
 				if attr == "" {
@@ -9249,6 +9400,7 @@ func main() {
 	deGloss := false
 	doMixed := false
 	deAccent := false
+	doAscii := false
 
 	// read data from file instead of stdin
 	fileName := ""
@@ -9420,6 +9572,8 @@ func main() {
 			doMixed = true
 		case "-accent", "-plain":
 			deAccent = true
+		case "-ascii":
+			doAscii = true
 		// debugging flags
 		case "-prepare":
 			cmpr = true
@@ -9699,6 +9853,7 @@ func main() {
 	tbls.DeGloss = deGloss
 	tbls.DoMixed = doMixed
 	tbls.DeAccent = deAccent
+	tbls.DoAscii = doAscii
 
 	// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
 
@@ -10124,6 +10279,11 @@ func main() {
 		os.Exit(1)
 	}
 
+	// allow -record as synonym of -pattern (undocumented)
+	if args[0] == "-record" || args[0] == "-Record" {
+		args[0] = "-pattern"
+	}
+
 	// make sure top-level -pattern command is next
 	if args[0] != "-pattern" && args[0] != "-Pattern" {
 		fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
@@ -10513,7 +10673,7 @@ func main() {
 
 	// FILTER XML RECORDS BY PRESENCE OF ONE OR MORE PHRASES
 
-	// -phase plus -pattern filters by phrase in XML
+	// -phrase plus -pattern filters by phrase in XML
 	if phrs != "" && len(args) == 2 {
 
 		// cleanupPhrase splits at punctuation, but leaves < and > in to avoid false positives