From 4653a15248616bb735335736fbfbc480d4fb8994 Mon Sep 17 00:00:00 2001 From: "Aaron M. Ucko" Date: Thu, 5 Oct 2017 20:48:35 -0400 Subject: New upstream version 7.10.20170810+ds --- edirect.pl | 8 +-- setup-deps.pl | 38 ++++++++++--- setup.sh | 4 +- xtract | 2 +- xtract.go | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 204 insertions(+), 20 deletions(-) diff --git a/edirect.pl b/edirect.pl index b58f33e..8ce717c 100755 --- a/edirect.pl +++ b/edirect.pl @@ -41,6 +41,10 @@ my ($LibDir, $ScriptName); use File::Spec; +# EDirect version number + +$version = "7.10"; + BEGIN { my $Volume; @@ -85,10 +89,6 @@ $begin_time = Time::HiRes::time(); use constant false => 0; use constant true => 1; -# EDirect version number - -$version = "7.00"; - # URL address components $base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; diff --git a/setup-deps.pl b/setup-deps.pl index 813cfb0..3fa2d06 100755 --- a/setup-deps.pl +++ b/setup-deps.pl @@ -5,9 +5,35 @@ use CPAN; use CPAN::HandleConfig; use CPAN::Shell; +my $already_configured_cpan = 0; my $root; BEGIN { + sub CheckAvailability + { + my $code = "require $_[0]"; + if (@_ > 1) { + $code .= "; $_[1]"; + } + eval $code; + if ($@) { + if ($already_configured_cpan) { + print STDERR "Missing $_[0]; CPAN already initialized.\n"; + } else { + print STDERR "Missing $_[0]; initializing CPAN.\n"; + CPAN::HandleConfig->load(autoconfig => 1, auto_pick => 1, + doit => 1); + CPAN::Shell::setup_output; + CPAN::Index->reload; + $already_configured_cpan = 1; + } + return 0; + } else { + print STDERR "Found $_[0].\n"; + return 1; + } + } + alarm(3600); $root = $INC[0]; @@ -25,13 +51,11 @@ BEGIN { }; } # $CPAN::DEBUG ||= $CPAN::DEBUG{'FTP'}; - CPAN::HandleConfig->load(autoconfig => 1, auto_pick => 1, doit => 1); - CPAN::Shell::setup_output; - CPAN::Index->reload; - my $ll = CPAN::Shell->expandany('local::lib'); - if ( ( ! $ll->inst_file || $ll->inst_version =~ /^1\./) + if ( !CheckAvailability('local::lib', + 'die unless $local::lib::VERSION >= 2') && ! -d "$root/aux/lib/perl5/local" ) { + my $ll = CPAN::Shell->expandany('local::lib'); $ll->get; system('mkdir', '-p', "$root/aux/lib/perl5/local"); system('cp', $ll->distribution->dir . "/lib/local/lib.pm", @@ -47,10 +71,10 @@ my @lwp_deps = qw(Encode::Locale File::Listing IO::Socket::SSL LWP::MediaTypes LWP::Protocol::https Net::HTTP URI WWW::RobotRules Mozilla::CA); for my $module (@lwp_deps, 'Time::HiRes') { - if ( ! CPAN::Shell->expandany($module)->inst_file ) { + if ( ! CheckAvailability($module) ) { CPAN::Shell->install($module); } } -if ( ! CPAN::Shell->expandany('LWP')->inst_file ) { +if ( ! CheckAvailability('LWP') ) { CPAN::Shell->install('Bundle::LWP'); } diff --git a/setup.sh b/setup.sh index f4124f3..e25ec73 100755 --- a/setup.sh +++ b/setup.sh @@ -13,7 +13,7 @@ cd "$DIR" mkdir -p _cpan/CPAN echo '1;' >> _cpan/CPAN/MyConfig.pm -if ! perl -I_cpan setup-deps.pl setup-deps.log 2>&1 +if ! perl -I_cpan -Iaux/lib/perl5 setup-deps.pl setup-deps.log 2>&1 then if grep '^read timeout.*HTTP' setup-deps.log >/dev/null then @@ -31,7 +31,7 @@ then gzip -cd Mozilla-CA.tar.gz | tar xvf - fi -osname=`uname -s | sed -e 's/[0-9.-]*$//'` +osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'` cputype=`uname -m` case "$osname-$cputype" in Linux-x86_64 | Darwin-x86_64 | CYGWIN_NT-* ) diff --git a/xtract b/xtract index 1dcc399..ae44929 100755 --- a/xtract +++ b/xtract @@ -1,7 +1,7 @@ #!/bin/sh PATH=/bin:/usr/bin export PATH -compiled=$0.`uname -s | sed -e 's/[0-9.-]*$//'` +compiled=$0.`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'` if [ -x "$compiled" ] then exec "$compiled" "$@" diff --git a/xtract.go b/xtract.go index 1d465d0..cc85043 100644 --- a/xtract.go +++ b/xtract.go @@ -78,7 +78,7 @@ import ( // VERSION AND HELP MESSAGE TEXT -const xtractVersion = "7.00" +const xtractVersion = "7.10" const xtractHelp = ` Overview @@ -99,6 +99,7 @@ Processing Flags -strict Remove HTML highlight tags -accent Delete Unicode accents + -ascii Unicode to numeric character references Data Source @@ -1715,6 +1716,25 @@ Directory and File Navigation -t Sort by most recently modified pwd Prints working directory path + +File Redirection + + < Read stdin from file + > Redirect stdout to file + >> Append to file + 2> Redirect stderr + 2>&1 Merge stderr into stdout + | Pipe between programs + <(cmd) Execute command, read results as file + +Shell Script Variables + + $0 Name of script + $n Nth argument + $# Number of arguments + "$*" Argument list as one argument + "$@" Argument list as separate arguments + $? Exit status of previous command ` // TYPED CONSTANTS @@ -1930,6 +1950,18 @@ var accentRunes = map[rune]rune{ '\u02BC': '\'', } +var ligatureRunes = map[rune]string{ + '\u00DF': "ss", + '\u00E6': "ae", + '\uFB00': "ff", + '\uFB01': "fi", + '\uFB02': "fl", + '\uFB03': "ffi", + '\uFB04': "ffl", + '\uFB05': "ft", + '\uFB06': "st", +} + var argTypeIs = map[string]ArgumentType{ "-unit": EXPLORATION, "-Unit": EXPLORATION, @@ -2315,6 +2347,7 @@ type Tables struct { DeGloss bool DoMixed bool DeAccent bool + DoAscii bool } type Node struct { @@ -2961,6 +2994,56 @@ func DoHtmlRepair(str string) string { return str } +func DoTrimFlankingHtml(str string) string { + + badPrefix := [10]string{ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + } + + badSuffix := [10]string{ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + } + + if strings.Contains(str, "<") { + goOn := true + for goOn { + goOn = false + for _, tag := range badPrefix { + if strings.HasPrefix(str, tag) { + str = str[len(tag):] + goOn = true + } + } + for _, tag := range badSuffix { + if strings.HasSuffix(str, tag) { + str = str[:len(str)-len(tag)] + goOn = true + } + } + } + } + + return str +} + func HasBadAccent(str string) bool { for _, ch := range str { @@ -2970,6 +3053,8 @@ func HasBadAccent(str string) bool { // quick min-to-max check for additional characters to treat as accents if ch >= '\u00D8' && ch <= '\u02BC' { return true + } else if ch >= '\uFB00' && ch <= '\uFB06' { + return true } } @@ -2985,7 +3070,20 @@ func FixBadAccent(str string) string { if ch >= '\u00D8' && ch <= '\u02BC' { rn, ok := accentRunes[ch] if ok { - ch = rn + buffer.WriteRune(rn) + continue + } + st, ok := ligatureRunes[ch] + if ok { + buffer.WriteString(st) + continue + } + } + if ch >= '\uFB00' && ch <= '\uFB06' { + st, ok := ligatureRunes[ch] + if ok { + buffer.WriteString(st) + continue } } } @@ -3023,6 +3121,31 @@ func DoAccentTransform(str string) string { return str } +func UnicodeToAscii(str string) string { + + var buffer bytes.Buffer + + for _, ch := range str { + if ch > 127 { + s := strconv.QuoteToASCII(string(ch)) + s = strings.ToUpper(s[3:7]) + for { + if !strings.HasPrefix(s, "0") { + break + } + s = s[1:] + } + buffer.WriteString("&#x") + buffer.WriteString(s) + buffer.WriteRune(';') + continue + } + buffer.WriteRune(ch) + } + + return buffer.String() +} + // CREATE COMMON DRIVER TABLES // InitTables creates lookup tables to simplify the tokenizer @@ -3117,7 +3240,7 @@ func DebugBlock(blk *Block, depth int) { // ParseArguments parses nested exploration instruction from command-line arguments func ParseArguments(args []string, pttrn string) *Block { - // different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line + // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line // (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions) var ( lcname = []string{ @@ -5381,12 +5504,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasAngleBracket(str) { str = DoHtmlRepair(str) } + str = DoTrimFlankingHtml(str) } if tbls.DeAccent { if IsNotASCII(str) { str = DoAccentTransform(str) } } + if tbls.DoAscii { + if IsNotASCII(str) { + str = UnicodeToAscii(str) + } + } os.Stdout.WriteString(str) } @@ -5497,6 +5626,16 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special attr = strings.TrimSpace(attr) attr = CompressRunsOfSpaces(attr) + if tbls.DeAccent { + if IsNotASCII(attr) { + attr = DoAccentTransform(attr) + } + } + if tbls.DoAscii { + if IsNotASCII(attr) { + attr = UnicodeToAscii(attr) + } + } if wrapAttrs { @@ -5745,12 +5884,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special if HasAngleBracket(name) { name = DoHtmlRepair(name) } + name = DoTrimFlankingHtml(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } + if tbls.DoAscii { + if IsNotASCII(name) { + name = UnicodeToAscii(name) + } + } if HasFlankingSpace(name) { name = strings.TrimSpace(name) } @@ -5814,7 +5959,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special processSplit := func() { if len(args) > 1 { - if args[1] == "-pattern" || args[1] == "-Pattern" { + if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" { // skip past -split if followed by -pattern args = args[1:] } @@ -5836,7 +5981,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special processDrain := func() { if len(args) > 1 { - if args[1] == "-pattern" || args[1] == "-Pattern" { + if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" { // skip past -drain if followed by -pattern args = args[1:] } @@ -8285,12 +8430,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act if HasAngleBracket(name) { name = DoHtmlReplace(name) } + name = DoTrimFlankingHtml(name) } if tbls.DeAccent { if IsNotASCII(name) { name = DoAccentTransform(name) } } + if tbls.DoAscii { + if IsNotASCII(name) { + name = UnicodeToAscii(name) + } + } node.Contents = name case SELFTAG: if attr == "" { @@ -9249,6 +9400,7 @@ func main() { deGloss := false doMixed := false deAccent := false + doAscii := false // read data from file instead of stdin fileName := "" @@ -9420,6 +9572,8 @@ func main() { doMixed = true case "-accent", "-plain": deAccent = true + case "-ascii": + doAscii = true // debugging flags case "-prepare": cmpr = true @@ -9699,6 +9853,7 @@ func main() { tbls.DeGloss = deGloss tbls.DoMixed = doMixed tbls.DeAccent = deAccent + tbls.DoAscii = doAscii // FILE NAME CAN BE SUPPLIED WITH -input COMMAND @@ -10124,6 +10279,11 @@ func main() { os.Exit(1) } + // allow -record as synonym of -pattern (undocumented) + if args[0] == "-record" || args[0] == "-Record" { + args[0] = "-pattern" + } + // make sure top-level -pattern command is next if args[0] != "-pattern" && args[0] != "-Pattern" { fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") @@ -10513,7 +10673,7 @@ func main() { // FILTER XML RECORDS BY PRESENCE OF ONE OR MORE PHRASES - // -phase plus -pattern filters by phrase in XML + // -phrase plus -pattern filters by phrase in XML if phrs != "" && len(args) == 2 { // cleanupPhrase splits at punctuation, but leaves < and > in to avoid false positives -- cgit v1.2.3