summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author H.Merijn Brand - Tux <linux@tux.freedom.nl> 2023-11-27 13:52:20 +0100
committer H.Merijn Brand - Tux <linux@tux.freedom.nl> 2023-11-27 13:52:20 +0100
commit 1df8b9e562a7f2748329d9b884212ee8354b5ced (patch)
tree 52cd6914f3c2272762de363a2d862afa3d077544
parent 0e111934bf6eb3716ac2801b3ce49182935c18b9 (diff)
Auto-use BOM in CSV *files* with xlscat script
-rw-r--r-- Changes 4
-rwxr-xr-x scripts/xlscat 27
2 files changed, 27 insertions, 4 deletions
diff --git a/Changes b/Changes
index 56f4ffe..85ba96b 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,5 @@
-0.89 - 07 Nov 2023, H.Merijn Brand
- *
+0.89 - 27 Nov 2023, H.Merijn Brand
+ * Auto-use BOM in CSV *files* with xlscat script
0.88 - 07 Nov 2023, H.Merijn Brand
* Guard $_ globbering from external bitrotted code
diff --git a/scripts/xlscat b/scripts/xlscat
index 84a44dd..22d15a8 100755
--- a/scripts/xlscat
+++ b/scripts/xlscat
@@ -2,14 +2,14 @@
# xlscat: show supported spreadsheet file as Text
# xlsgrep: grep pattern
-# (m)'23 [2023-07-28] Copyright H.M.Brand 2005-2023
+# (m)'23 [2023-11-27] Copyright H.M.Brand 2005-2023
require 5.008001;
use strict;
use warnings;
-our $VERSION = "3.25";
+our $VERSION = "3.26";
(my $CMD = $0) =~ s{.*[\/]}{};
my $is_grep = $0 =~ m/grep$/; # xlsgrep
@@ -99,6 +99,7 @@ my $opt_d = 0; # Debug level for Spreadsheet::Read
my $opt_h = 0; # Number of header lines for grep or -L
my $opt_D = 0; # Dump: 0 = none, 1 = array, 2 = hash
my $clip = 1;
+my $enc_bom; # CSV input encoding derived from BOM
my $enc_i; # Input encoding
my $enc_o; # Output encoding
GetOptions (
@@ -235,6 +236,27 @@ if (defined $file and $file ne "-") {
$opt_v > 1 and warn "Using $file as input\n";
-f $file or usage 1, "the file argument is not a regular file";
-s $file or usage 1, "the file is empty";
+ if ($file =~ m/\.csv$/i and open my $fh, "<", $file) { # Auto-BOM
+ my $l = <$fh>;
+ close $fh;
+ if ($l =~ s/^\x00\x00\xfe\xff//) { $enc_bom = "utf-32be" }
+ elsif ($l =~ s/^\xff\xfe\x00\x00//) { $enc_bom = "utf-32le" }
+ elsif ($l =~ s/^\xfe\xff//) { $enc_bom = "utf-16be" }
+ elsif ($l =~ s/^\xff\xfe//) { $enc_bom = "utf-16le" }
+ elsif ($l =~ s/^\xef\xbb\xbf//) { $enc_bom = "utf-8" }
+ elsif ($l =~ s/^\xf7\x64\x4c//) { $enc_bom = "utf-1" }
+ elsif ($l =~ s/^\xdd\x73\x66\x73//) { $enc_bom = "utf-ebcdic" }
+ elsif ($l =~ s/^\x0e\xfe\xff//) { $enc_bom = "scsu" }
+ elsif ($l =~ s/^\xfb\xee\x28//) { $enc_bom = "bocu-1" }
+ elsif ($l =~ s/^\x84\x31\x95\x33//) { $enc_bom = "gb-18030" }
+ elsif ($l =~ s/^\x{feff}//) { $enc_bom = "" }
+
+ if ($enc_bom and open $fh, "<:encoding($enc_bom)", $file) {
+ $opt_v > 1 and warn "Opened $file with encoding $enc_bom after BOM detection\n";
+ $file = $fh;
+ push @RDarg, parser => "CSV";
+ }
+ }
}
else {
$opt_v > 1 and warn "Working as a pipe\n";
@@ -477,6 +499,7 @@ foreach my $si (1 .. $sc) {
? "=".$s->{attr}[$_][$r]{formula}
: defined $s->{cell}[$_][$r] ? $opt_u ? $uval : $fval : "";
} $c[0] .. $c[1];
+ $r == 1 && $row[0] && defined $enc_bom and $row[0] =~ s/^\N{BOM}//;
exists $print{col} and @row = @row[grep{$_<@row}@{$print{col}}];
$is_grep && $r > $opt_h &&
! first { defined $_ && $_ =~ $pattern } @row and next;