summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Aaron M. Ucko <ucko@debian.org>, 2017-10-05 20:48:35 -0400
committer: Aaron M. Ucko <ucko@debian.org>, 2017-10-05 20:48:35 -0400
commit: 4653a15248616bb735335736fbfbc480d4fb8994 (patch)
tree: 6c994b44d7e67e1f1b18b7fc14eaf137a6cb482d
parent: cbf48d042bde0f7a63d81f49f615d66661f7770b (diff)
New upstream version 7.10.20170810+ds
-rwxr-xr-x  edirect.pl     8
-rwxr-xr-x  setup-deps.pl  38
-rwxr-xr-x  setup.sh       4
-rwxr-xr-x  xtract         2
-rw-r--r--  xtract.go      172
5 files changed, 204 insertions, 20 deletions
diff --git a/edirect.pl b/edirect.pl
index b58f33e..8ce717c 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -41,6 +41,10 @@ my ($LibDir, $ScriptName);
use File::Spec;
+# EDirect version number
+
+$version = "7.10";
+
BEGIN
{
my $Volume;
@@ -85,10 +89,6 @@ $begin_time = Time::HiRes::time();
use constant false => 0;
use constant true => 1;
-# EDirect version number
-
-$version = "7.00";
-
# URL address components
$base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
diff --git a/setup-deps.pl b/setup-deps.pl
index 813cfb0..3fa2d06 100755
--- a/setup-deps.pl
+++ b/setup-deps.pl
@@ -5,9 +5,35 @@ use CPAN;
use CPAN::HandleConfig;
use CPAN::Shell;
+my $already_configured_cpan = 0;
my $root;
BEGIN {
+ sub CheckAvailability
+ {
+ my $code = "require $_[0]";
+ if (@_ > 1) {
+ $code .= "; $_[1]";
+ }
+ eval $code;
+ if ($@) {
+ if ($already_configured_cpan) {
+ print STDERR "Missing $_[0]; CPAN already initialized.\n";
+ } else {
+ print STDERR "Missing $_[0]; initializing CPAN.\n";
+ CPAN::HandleConfig->load(autoconfig => 1, auto_pick => 1,
+ doit => 1);
+ CPAN::Shell::setup_output;
+ CPAN::Index->reload;
+ $already_configured_cpan = 1;
+ }
+ return 0;
+ } else {
+ print STDERR "Found $_[0].\n";
+ return 1;
+ }
+ }
+
alarm(3600);
$root = $INC[0];
@@ -25,13 +51,11 @@ BEGIN {
};
}
# $CPAN::DEBUG ||= $CPAN::DEBUG{'FTP'};
- CPAN::HandleConfig->load(autoconfig => 1, auto_pick => 1, doit => 1);
- CPAN::Shell::setup_output;
- CPAN::Index->reload;
- my $ll = CPAN::Shell->expandany('local::lib');
- if ( ( ! $ll->inst_file || $ll->inst_version =~ /^1\./)
+ if ( !CheckAvailability('local::lib',
+ 'die unless $local::lib::VERSION >= 2')
&& ! -d "$root/aux/lib/perl5/local" ) {
+ my $ll = CPAN::Shell->expandany('local::lib');
$ll->get;
system('mkdir', '-p', "$root/aux/lib/perl5/local");
system('cp', $ll->distribution->dir . "/lib/local/lib.pm",
@@ -47,10 +71,10 @@ my @lwp_deps = qw(Encode::Locale File::Listing
IO::Socket::SSL LWP::MediaTypes LWP::Protocol::https
Net::HTTP URI WWW::RobotRules Mozilla::CA);
for my $module (@lwp_deps, 'Time::HiRes') {
- if ( ! CPAN::Shell->expandany($module)->inst_file ) {
+ if ( ! CheckAvailability($module) ) {
CPAN::Shell->install($module);
}
}
-if ( ! CPAN::Shell->expandany('LWP')->inst_file ) {
+if ( ! CheckAvailability('LWP') ) {
CPAN::Shell->install('Bundle::LWP');
}
diff --git a/setup.sh b/setup.sh
index f4124f3..e25ec73 100755
--- a/setup.sh
+++ b/setup.sh
@@ -13,7 +13,7 @@ cd "$DIR"
mkdir -p _cpan/CPAN
echo '1;' >> _cpan/CPAN/MyConfig.pm
-if ! perl -I_cpan setup-deps.pl </dev/null >setup-deps.log 2>&1
+if ! perl -I_cpan -Iaux/lib/perl5 setup-deps.pl </dev/null >setup-deps.log 2>&1
then
if grep '^read timeout.*HTTP' setup-deps.log >/dev/null
then
@@ -31,7 +31,7 @@ then
gzip -cd Mozilla-CA.tar.gz | tar xvf -
fi
-osname=`uname -s | sed -e 's/[0-9.-]*$//'`
+osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
cputype=`uname -m`
case "$osname-$cputype" in
Linux-x86_64 | Darwin-x86_64 | CYGWIN_NT-* )
diff --git a/xtract b/xtract
index 1dcc399..ae44929 100755
--- a/xtract
+++ b/xtract
@@ -1,7 +1,7 @@
#!/bin/sh
PATH=/bin:/usr/bin
export PATH
-compiled=$0.`uname -s | sed -e 's/[0-9.-]*$//'`
+compiled=$0.`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ -x "$compiled" ]
then
exec "$compiled" "$@"
diff --git a/xtract.go b/xtract.go
index 1d465d0..cc85043 100644
--- a/xtract.go
+++ b/xtract.go
@@ -78,7 +78,7 @@ import (
// VERSION AND HELP MESSAGE TEXT
-const xtractVersion = "7.00"
+const xtractVersion = "7.10"
const xtractHelp = `
Overview
@@ -99,6 +99,7 @@ Processing Flags
-strict Remove HTML highlight tags
-accent Delete Unicode accents
+ -ascii Unicode to numeric character references
Data Source
@@ -1715,6 +1716,25 @@ Directory and File Navigation
-t Sort by most recently modified
pwd Prints working directory path
+
+File Redirection
+
+ < Read stdin from file
+ > Redirect stdout to file
+ >> Append to file
+ 2> Redirect stderr
+ 2>&1 Merge stderr into stdout
+ | Pipe between programs
+ <(cmd) Execute command, read results as file
+
+Shell Script Variables
+
+ $0 Name of script
+ $n Nth argument
+ $# Number of arguments
+ "$*" Argument list as one argument
+ "$@" Argument list as separate arguments
+ $? Exit status of previous command
`
// TYPED CONSTANTS
@@ -1930,6 +1950,18 @@ var accentRunes = map[rune]rune{
'\u02BC': '\'',
}
+var ligatureRunes = map[rune]string{
+ '\u00DF': "ss",
+ '\u00E6': "ae",
+ '\uFB00': "ff",
+ '\uFB01': "fi",
+ '\uFB02': "fl",
+ '\uFB03': "ffi",
+ '\uFB04': "ffl",
+ '\uFB05': "ft",
+ '\uFB06': "st",
+}
+
var argTypeIs = map[string]ArgumentType{
"-unit": EXPLORATION,
"-Unit": EXPLORATION,
@@ -2315,6 +2347,7 @@ type Tables struct {
DeGloss bool
DoMixed bool
DeAccent bool
+ DoAscii bool
}
type Node struct {
@@ -2961,6 +2994,56 @@ func DoHtmlRepair(str string) string {
return str
}
+func DoTrimFlankingHtml(str string) string {
+
+ badPrefix := [10]string{
+ "<i></i>",
+ "<b></b>",
+ "<u></u>",
+ "<sup></sup>",
+ "<sub></sub>",
+ "</i>",
+ "</b>",
+ "</u>",
+ "</sup>",
+ "</sub>",
+ }
+
+ badSuffix := [10]string{
+ "<i></i>",
+ "<b></b>",
+ "<u></u>",
+ "<sup></sup>",
+ "<sub></sub>",
+ "<i>",
+ "<b>",
+ "<u>",
+ "<sup>",
+ "<sub>",
+ }
+
+ if strings.Contains(str, "<") {
+ goOn := true
+ for goOn {
+ goOn = false
+ for _, tag := range badPrefix {
+ if strings.HasPrefix(str, tag) {
+ str = str[len(tag):]
+ goOn = true
+ }
+ }
+ for _, tag := range badSuffix {
+ if strings.HasSuffix(str, tag) {
+ str = str[:len(str)-len(tag)]
+ goOn = true
+ }
+ }
+ }
+ }
+
+ return str
+}
+
func HasBadAccent(str string) bool {
for _, ch := range str {
@@ -2970,6 +3053,8 @@ func HasBadAccent(str string) bool {
// quick min-to-max check for additional characters to treat as accents
if ch >= '\u00D8' && ch <= '\u02BC' {
return true
+ } else if ch >= '\uFB00' && ch <= '\uFB06' {
+ return true
}
}
@@ -2985,7 +3070,20 @@ func FixBadAccent(str string) string {
if ch >= '\u00D8' && ch <= '\u02BC' {
rn, ok := accentRunes[ch]
if ok {
- ch = rn
+ buffer.WriteRune(rn)
+ continue
+ }
+ st, ok := ligatureRunes[ch]
+ if ok {
+ buffer.WriteString(st)
+ continue
+ }
+ }
+ if ch >= '\uFB00' && ch <= '\uFB06' {
+ st, ok := ligatureRunes[ch]
+ if ok {
+ buffer.WriteString(st)
+ continue
}
}
}
@@ -3023,6 +3121,31 @@ func DoAccentTransform(str string) string {
return str
}
+func UnicodeToAscii(str string) string {
+
+ var buffer bytes.Buffer
+
+ for _, ch := range str {
+ if ch > 127 {
+ s := strconv.QuoteToASCII(string(ch))
+ s = strings.ToUpper(s[3:7])
+ for {
+ if !strings.HasPrefix(s, "0") {
+ break
+ }
+ s = s[1:]
+ }
+ buffer.WriteString("&#x")
+ buffer.WriteString(s)
+ buffer.WriteRune(';')
+ continue
+ }
+ buffer.WriteRune(ch)
+ }
+
+ return buffer.String()
+}
+
// CREATE COMMON DRIVER TABLES
// InitTables creates lookup tables to simplify the tokenizer
@@ -3117,7 +3240,7 @@ func DebugBlock(blk *Block, depth int) {
// ParseArguments parses nested exploration instruction from command-line arguments
func ParseArguments(args []string, pttrn string) *Block {
- // different names of exploration control arguments allow multiple levels of nested "for" loops in linear command line
+ // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line
// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
var (
lcname = []string{
@@ -5381,12 +5504,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasAngleBracket(str) {
str = DoHtmlRepair(str)
}
+ str = DoTrimFlankingHtml(str)
}
if tbls.DeAccent {
if IsNotASCII(str) {
str = DoAccentTransform(str)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(str) {
+ str = UnicodeToAscii(str)
+ }
+ }
os.Stdout.WriteString(str)
}
@@ -5497,6 +5626,16 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
attr = strings.TrimSpace(attr)
attr = CompressRunsOfSpaces(attr)
+ if tbls.DeAccent {
+ if IsNotASCII(attr) {
+ attr = DoAccentTransform(attr)
+ }
+ }
+ if tbls.DoAscii {
+ if IsNotASCII(attr) {
+ attr = UnicodeToAscii(attr)
+ }
+ }
if wrapAttrs {
@@ -5745,12 +5884,18 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasAngleBracket(name) {
name = DoHtmlRepair(name)
}
+ name = DoTrimFlankingHtml(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(name) {
+ name = UnicodeToAscii(name)
+ }
+ }
if HasFlankingSpace(name) {
name = strings.TrimSpace(name)
}
@@ -5814,7 +5959,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
processSplit := func() {
if len(args) > 1 {
- if args[1] == "-pattern" || args[1] == "-Pattern" {
+ if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
// skip past -split if followed by -pattern
args = args[1:]
}
@@ -5836,7 +5981,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
processDrain := func() {
if len(args) > 1 {
- if args[1] == "-pattern" || args[1] == "-Pattern" {
+ if args[1] == "-pattern" || args[1] == "-Pattern" || args[1] == "-record" || args[1] == "-Record" {
// skip past -drain if followed by -pattern
args = args[1:]
}
@@ -8285,12 +8430,18 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, act
if HasAngleBracket(name) {
name = DoHtmlReplace(name)
}
+ name = DoTrimFlankingHtml(name)
}
if tbls.DeAccent {
if IsNotASCII(name) {
name = DoAccentTransform(name)
}
}
+ if tbls.DoAscii {
+ if IsNotASCII(name) {
+ name = UnicodeToAscii(name)
+ }
+ }
node.Contents = name
case SELFTAG:
if attr == "" {
@@ -9249,6 +9400,7 @@ func main() {
deGloss := false
doMixed := false
deAccent := false
+ doAscii := false
// read data from file instead of stdin
fileName := ""
@@ -9420,6 +9572,8 @@ func main() {
doMixed = true
case "-accent", "-plain":
deAccent = true
+ case "-ascii":
+ doAscii = true
// debugging flags
case "-prepare":
cmpr = true
@@ -9699,6 +9853,7 @@ func main() {
tbls.DeGloss = deGloss
tbls.DoMixed = doMixed
tbls.DeAccent = deAccent
+ tbls.DoAscii = doAscii
// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
@@ -10124,6 +10279,11 @@ func main() {
os.Exit(1)
}
+ // allow -record as synonym of -pattern (undocumented)
+ if args[0] == "-record" || args[0] == "-Record" {
+ args[0] = "-pattern"
+ }
+
// make sure top-level -pattern command is next
if args[0] != "-pattern" && args[0] != "-Pattern" {
fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
@@ -10513,7 +10673,7 @@ func main() {
// FILTER XML RECORDS BY PRESENCE OF ONE OR MORE PHRASES
- // -phase plus -pattern filters by phrase in XML
+ // -phrase plus -pattern filters by phrase in XML
if phrs != "" && len(args) == 2 {
// cleanupPhrase splits at punctuation, but leaves < and > in to avoid false positives