diff options
Diffstat (limited to 'xtract.go')
-rw-r--r-- | xtract.go | 172 |
1 files changed, 166 insertions, 6 deletions
@@ -78,7 +78,7 @@ import ( // VERSION AND HELP MESSAGE TEXT -const xtractVersion = "7.00" +const xtractVersion = "7.10" const xtractHelp = ` Overview @@ -99,6 +99,7 @@ Processing Flags -strict Remove HTML highlight tags -accent Delete Unicode accents + -ascii Unicode to numeric character references Data Source @@ -1715,6 +1716,25 @@ Directory and File Navigation -t Sort by most recently modified pwd Prints working directory path + +File Redirection + + < Read stdin from file + > Redirect stdout to file + >> Append to file + 2> Redirect stderr + 2>&1 Merge stderr into stdout + | Pipe between programs + <(cmd) Execute command, read results as file + +Shell Script Variables + + $0 Name of script + $n Nth argument + $# Number of arguments + "$*" Argument list as one argument + "$@" Argument list as separate arguments + $? Exit status of previous command ` // TYPED CONSTANTS @@ -1930,6 +1950,18 @@ var accentRunes = map[rune]rune{ '\u02BC': '\'', } +var ligatureRunes = map[rune]string{ + '\u00DF': "ss", + '\u00E6': "ae", + '\uFB00': "ff", + '\uFB01': "fi", + '\uFB02': "fl", + '\uFB03': "ffi", + '\uFB04': "ffl", + '\uFB05': "ft", + '\uFB06': "st", +} + var argTypeIs = map[string]ArgumentType{ "-unit": EXPLORATION, "-Unit": EXPLORATION, @@ -2315,6 +2347,7 @@ type Tables struct { DeGloss bool DoMixed bool DeAccent bool + DoAscii bool } type Node struct { @@ -2961,6 +2994,56 @@ func DoHtmlRepair(str string) string { return str } +func DoTrimFlankingHtml(str string) string { + + badPrefix := [10]string{ + "<i></i>", + "<b></b>", + "<u></u>", + "<sup></sup>", + "<sub></sub>", + "</i>", + "</b>", + "</u>", + "</sup>", + "</sub>", + } + + badSuffix := [10]string{ + "<i></i>", + "<b></b>", + "<u></u>", + "<sup></sup>", + "<sub></sub>", + "<i>", + "<b>", + "<u>", + "<sup>", + "<sub>", + } + + if strings.Contains(str, "<") { + goOn := true + for goOn { + goOn = false + for _, tag := range badPrefix { + if strings.HasPrefix(str, tag) { + str = str[len(tag):] + goOn = true + } + } + for _, tag := range badSuffix { + if strings.HasSuffix(str, tag) { + str = str[:len(str)-len(tag)] + goOn = true + } + } + } + } + + return str +} + func HasBadAccent(str string) bool { for _, ch := range str { @@ -2970,6 +3053,8 @@ func HasBadAccent(str string) bool { // quick min-to-max check for additional characters to treat as accents if ch >= '\u00D8' && ch <= '\u02BC' { return true + } else if ch >= '\uFB00' && ch <= '\uFB06' { + return true } } @@ -2985,7 +3070,20 @@ func FixBadAccent(str string) string { if ch >= '\u00D8' && ch <= '\u02BC' { rn, ok := accentRunes[ch] if ok { - ch = rn + buffer.WriteRune(rn) + continue + } + st, ok := ligatureRunes[ch] + if ok { + buffer.WriteString(st) + continue + } + } + if ch >= '\uFB00' && ch <= '\uFB06' { + st, ok := ligatureRunes[ch] + if ok { + buffer.WriteString(st) + continue } } } @@ -3023,6 +3121,31 @@ func DoAccentTransform(str string) string { return str } +func UnicodeToAscii(str string) string { + + var buffer bytes.Buffer + + for _, ch := range str { + if ch > 127 { + s := strconv.QuoteToASCII(string(ch)) + s = strings.ToUpper(s[3:7]) + for { + if !strings.HasPrefix(s, "0") { + break + } + s = s[1:] + } + buffer.WriteString("&#x") + buffer.WriteString(s) + buffer.WriteRune(';') + continue + } + buffer.WriteRune(ch) + } + + return buffer.String() +} + // CREATE COMMON DRIVER TABLES // InitTables creates lookup tables to simplify the tokenizer @@ -3117,7 +3240,7 @@ func DebugBlock(blk *Block, depth int) { // ParseArguments parses nested exploration instruction from command-line arguments func ParseArguments(args []string, pttrn string) *Block { - // different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line + // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line // (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions) var ( lcname = []string{ @@ -5381,12 +5504,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasAngleBracket(str) { str = DoHtmlRepair(str) } + str = DoTrimFlankingHtml(str) } if tbls.DeAccent { if IsNotASCII(str) { str = DoAccentTransform(str) } } + if tbls.DoAscii { + if IsNotASCII(str) { + str = UnicodeToAscii(str) + } + } os.Stdout.WriteString(str) } @@ -5497,6 +5626,16 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special attr = strings.TrimSpace(attr) attr = CompressRunsOfSpaces(attr) + if tbls.DeAccent { + if IsNotASCII(attr) { + attr = DoAccentTransform(attr) + } + } + if tbls.DoAscii { + if IsNotASCII(attr) { + attr = UnicodeToAscii(attr) + } + } if wrapAttrs { @@ -5745,12 +5884,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasAngleBracket(name) { name = DoHtmlRepair(name) } + name = DoTrimFlankingHtml(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } + if tbls.DoAscii { + if IsNotASCII(name) { + name = UnicodeToAscii(name) + } + } if HasFlankingSpace(name) { name = strings.TrimSpace(name) } @@ -5814,7 +5959,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special processSplit := func() { if len(args) > 1 { - if args[1] == "-pattern" || args[1] == "-Pattern" { + if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" { // skip past -split if followed by -pattern args = args[1:] } @@ -5836,7 +5981,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special processDrain := func() { if len(args) > 1 { - if args[1] == "-pattern" || args[1] == "-Pattern" { + if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" { // skip past -drain if followed by -pattern args = args[1:] } @@ -8285,12 +8430,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act if HasAngleBracket(name) { name = DoHtmlReplace(name) } + name = DoTrimFlankingHtml(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } + if tbls.DoAscii { + if IsNotASCII(name) { + name = UnicodeToAscii(name) + } + } node.Contents = name case SELFTAG: if attr == "" { @@ -9249,6 +9400,7 @@ func main() { deGloss := false doMixed := false deAccent := false + doAscii := false // read data from file instead of stdin fileName := "" @@ -9420,6 +9572,8 @@ func main() { doMixed = true case "-accent", "-plain": deAccent = true + case "-ascii": + doAscii = true // debugging flags case "-prepare": cmpr = true @@ -9699,6 +9853,7 @@ func main() { tbls.DeGloss = deGloss tbls.DoMixed = doMixed tbls.DeAccent = deAccent + tbls.DoAscii = doAscii // FILE NAME CAN BE SUPPLIED WITH -input COMMAND @@ -10124,6 +10279,11 @@ func main() { os.Exit(1) } + // allow -record as synonym of -pattern (undocumented) + if args[0] == "-record" || args[0] == "-Record" { + args[0] = "-pattern" + } + // make sure top-level -pattern command is next if args[0] != "-pattern" && args[0] != "-Pattern" { fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") @@ -10513,7 +10673,7 @@ func main() { // FILTER XML RECORDS BY PRESENCE OF ONE OR MORE PHRASES - // -phase plus -pattern filters by phrase in XML + // -phrase plus -pattern filters by phrase in XML if phrs != "" && len(args) == 2 { // cleanupPhrase splits at punctuation, but leaves < and > in to avoid false positives |