summary refs log tree commit diff
path: root/xtract.go
diff options
context:
space:
mode:
Diffstat (limited to 'xtract.go')
-rw-r--r-- xtract.go 172
1 files changed, 166 insertions, 6 deletions
diff --git a/xtract.go b/xtract.go
index 1d465d0..cc85043 100644
--- a/xtract.go
+++ b/xtract.go
@@ -78,7 +78,7 @@ import (
// VERSION AND HELP MESSAGE TEXT
-const xtractVersion = "7.00"
+const xtractVersion = "7.10"
const xtractHelp = `
Overview
@@ -99,6 +99,7 @@ Processing Flags
-strict Remove HTML highlight tags
-accent Delete Unicode accents
+ -ascii Unicode to numeric character references
Data Source
@@ -1715,6 +1716,25 @@ Directory and File Navigation
-t Sort by most recently modified
pwd Prints working directory path
+
+File Redirection
+
+ < Read stdin from file
+ > Redirect stdout to file
+ >> Append to file
+ 2> Redirect stderr
+ 2>&1 Merge stderr into stdout
+ | Pipe between programs
+ <(cmd) Execute command, read results as file
+
+Shell Script Variables
+
+ $0 Name of script
+ $n Nth argument
+ $# Number of arguments
+ "$*" Argument list as one argument
+ "$@" Argument list as separate arguments
+ $? Exit status of previous command
`
// TYPED CONSTANTS
@@ -1930,6 +1950,18 @@ var accentRunes = map[rune]rune{
'\u02BC': '\'',
}
+// ligatureRunes maps single-rune ligatures and digraph letters to the plain
+// ASCII letter sequence written in their place by FixBadAccent.
+var ligatureRunes = map[rune]string{
+	'\u00DF': "ss",  // LATIN SMALL LETTER SHARP S
+	'\u00E6': "ae",  // LATIN SMALL LETTER AE
+	'\uFB00': "ff",  // LATIN SMALL LIGATURE FF
+	'\uFB01': "fi",  // LATIN SMALL LIGATURE FI
+	'\uFB02': "fl",  // LATIN SMALL LIGATURE FL
+	'\uFB03': "ffi", // LATIN SMALL LIGATURE FFI
+	'\uFB04': "ffl", // LATIN SMALL LIGATURE FFL
+	// U+FB05 joins a long s (U+017F) with t; it only resembles "ft",
+	// so it expands to "st" exactly like U+FB06
+	'\uFB05': "st", // LATIN SMALL LIGATURE LONG S T
+	'\uFB06': "st", // LATIN SMALL LIGATURE ST
+}
+
var argTypeIs = map[string]ArgumentType{
"-unit": EXPLORATION,
"-Unit": EXPLORATION,
@@ -2315,6 +2347,7 @@ type Tables struct {
DeGloss bool
DoMixed bool
DeAccent bool
+ DoAscii bool
}
type Node struct {
@@ -2961,6 +2994,56 @@ func DoHtmlRepair(str string) string {
return str
}
+// DoTrimFlankingHtml strips HTML highlight-tag debris from both ends of str.
+// Empty tag pairs are removed from either end, stray closing tags from the
+// front, and stray opening tags from the back. Passes repeat until one of
+// them removes nothing, so stacked debris is peeled off layer by layer.
+func DoTrimFlankingHtml(str string) string {
+
+	// without a tag opener there is nothing that could match
+	if !strings.Contains(str, "<") {
+		return str
+	}
+
+	// tags that carry no content when they lead the string
+	leading := []string{
+		"<i></i>", "<b></b>", "<u></u>", "<sup></sup>", "<sub></sub>",
+		"</i>", "</b>", "</u>", "</sup>", "</sub>",
+	}
+
+	// tags that carry no content when they end the string
+	trailing := []string{
+		"<i></i>", "<b></b>", "<u></u>", "<sup></sup>", "<sub></sub>",
+		"<i>", "<b>", "<u>", "<sup>", "<sub>",
+	}
+
+	for {
+		before := len(str)
+
+		for _, tag := range leading {
+			str = strings.TrimPrefix(str, tag)
+		}
+		for _, tag := range trailing {
+			str = strings.TrimSuffix(str, tag)
+		}
+
+		// every successful trim shortens the string, so an unchanged
+		// length means this pass found nothing more to remove
+		if len(str) == before {
+			return str
+		}
+	}
+}
+
func HasBadAccent(str string) bool {
for _, ch := range str {
@@ -2970,6 +3053,8 @@ func HasBadAccent(str string) bool {
// quick min-to-max check for additional characters to treat as accents
if ch >= '\u00D8' && ch <= '\u02BC' {
return true
+ } else if ch >= '\uFB00' && ch <= '\uFB06' {
+ return true
}
}
@@ -2985,7 +3070,20 @@ func FixBadAccent(str string) string {
if ch >= '\u00D8' && ch <= '\u02BC' {
rn, ok := accentRunes[ch]
if ok {
- ch = rn
+ buffer.WriteRune(rn)
+ continue
+ }
+ st, ok := ligatureRunes[ch]
+ if ok {
+ buffer.WriteString(st)
+ continue
+ }
+ }
+ if ch >= '\uFB00' && ch <= '\uFB06' {
+ st, ok := ligatureRunes[ch]
+ if ok {
+ buffer.WriteString(st)
+ continue
}
}
}
@@ -3023,6 +3121,31 @@ func DoAccentTransform(str string) string {
return str
}
+// UnicodeToAscii returns str with every rune above 127 replaced by an HTML
+// numeric character reference of the form &#xHEX;, where HEX is the code
+// point in uppercase hexadecimal without leading zeros. ASCII runes are
+// copied through unchanged.
+//
+// The code point is formatted directly rather than sliced out of a quoted
+// Go string, so runes beyond the Basic Multilingual Plane (above U+FFFF)
+// keep all of their digits, e.g. U+1F600 becomes &#x1F600;.
+func UnicodeToAscii(str string) string {
+
+	var buffer bytes.Buffer
+
+	// ranging over a string yields U+FFFD for each invalid UTF-8 byte,
+	// which is then emitted as &#xFFFD;
+	for _, ch := range str {
+		if ch <= 127 {
+			buffer.WriteRune(ch)
+			continue
+		}
+		buffer.WriteString("&#x")
+		buffer.WriteString(strings.ToUpper(strconv.FormatInt(int64(ch), 16)))
+		buffer.WriteRune(';')
+	}
+
+	return buffer.String()
+}
+
// CREATE COMMON DRIVER TABLES
// InitTables creates lookup tables to simplify the tokenizer
@@ -3117,7 +3240,7 @@ func DebugBlock(blk *Block, depth int) {
// ParseArguments parses nested exploration instruction from command-line arguments
func ParseArguments(args []string, pttrn string) *Block {
- // different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line
+ // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line
// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
var (
lcname = []string{
@@ -5381,12 +5504,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasAngleBracket(str) {
str = DoHtmlRepair(str)
}
+ str = DoTrimFlankingHtml(str)
}
if tbls.DeAccent {
if IsNotASCII(str) {
str = DoAccentTransform(str)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(str) {
+ str = UnicodeToAscii(str)
+ }
+ }
os.Stdout.WriteString(str)
}
@@ -5497,6 +5626,16 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
attr = strings.TrimSpace(attr)
attr = CompressRunsOfSpaces(attr)
+ if tbls.DeAccent {
+ if IsNotASCII(attr) {
+ attr = DoAccentTransform(attr)
+ }
+ }
+ if tbls.DoAscii {
+ if IsNotASCII(attr) {
+ attr = UnicodeToAscii(attr)
+ }
+ }
if wrapAttrs {
@@ -5745,12 +5884,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasAngleBracket(name) {
name = DoHtmlRepair(name)
}
+ name = DoTrimFlankingHtml(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(name) {
+ name = UnicodeToAscii(name)
+ }
+ }
if HasFlankingSpace(name) {
name = strings.TrimSpace(name)
}
@@ -5814,7 +5959,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
processSplit := func() {
if len(args) > 1 {
- if args[1] == "-pattern" || args[1] == "-Pattern" {
+ if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
// skip past -split if followed by -pattern
args = args[1:]
}
@@ -5836,7 +5981,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
processDrain := func() {
if len(args) > 1 {
- if args[1] == "-pattern" || args[1] == "-Pattern" {
+ if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
// skip past -drain if followed by -pattern
args = args[1:]
}
@@ -8285,12 +8430,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
if HasAngleBracket(name) {
name = DoHtmlReplace(name)
}
+ name = DoTrimFlankingHtml(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(name) {
+ name = UnicodeToAscii(name)
+ }
+ }
node.Contents = name
case SELFTAG:
if attr == "" {
@@ -9249,6 +9400,7 @@ func main() {
deGloss := false
doMixed := false
deAccent := false
+ doAscii := false
// read data from file instead of stdin
fileName := ""
@@ -9420,6 +9572,8 @@ func main() {
doMixed = true
case "-accent", "-plain":
deAccent = true
+ case "-ascii":
+ doAscii = true
// debugging flags
case "-prepare":
cmpr = true
@@ -9699,6 +9853,7 @@ func main() {
tbls.DeGloss = deGloss
tbls.DoMixed = doMixed
tbls.DeAccent = deAccent
+ tbls.DoAscii = doAscii
// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
@@ -10124,6 +10279,11 @@ func main() {
os.Exit(1)
}
+ // allow -record as synonym of -pattern (undocumented)
+ if args[0] == "-record" || args[0] == "-Record" {
+ args[0] = "-pattern"
+ }
+
// make sure top-level -pattern command is next
if args[0] != "-pattern" && args[0] != "-Pattern" {
fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
@@ -10513,7 +10673,7 @@ func main() {
// FILTER XML RECORDS BY PRESENCE OF ONE OR MORE PHRASES
- // -phase plus -pattern filters by phrase in XML
+ // -phrase plus -pattern filters by phrase in XML
if phrs != "" && len(args) == 2 {
// cleanupPhrase splits at punctuation, but leaves < and > in to avoid false positives