#------------------------------------------------------------------------------
# File:         Text.pm
#
# Description:  Deduce characteristics of TXT and CSV files
#
# Revisions:    2019-11-01 - P. Harvey Created
#               2020-02-13 - PH Added CSV file support
#
# References:   1) https://github.com/file/file
#------------------------------------------------------------------------------

package Image::ExifTool::Text;

use strict;
use vars qw($VERSION);
use Image::ExifTool qw(:DataAccess :Utils);

$VERSION = '1.04';

# Text tags
# (tags have no IDs of their own; they are generated by ProcessTXT below)
%Image::ExifTool::Text::Main = (
    VARS => { NO_ID => 1 },
    GROUPS => { 0 => 'File', 1 => 'File', 2 => 'Document' },
    NOTES => q{
        Although basic text files contain no metadata, the following tags are
        determined from a simple analysis of the data in TXT and CSV files.
        Statistics are generated only for 8-bit encodings, but the
        L<FastScan|../ExifTool.html#FastScan> (-fast) option may be used to limit
        processing to the first 64 kB in which case some tags are not produced.
        To avoid long processing delays, ExifTool will issue a minor warning and
        process only the first 64 kB of any file larger than 20 MB unless the
        L<IgnoreMinorErrors|../ExifTool.html#IgnoreMinorErrors> (-m) option is used.
    },
    MIMEEncoding => { Groups => { 2 => 'Other' } },
    Newlines => {
        PrintConv => {
            "\r\n" => 'Windows CRLF',
            "\r"   => 'Macintosh CR',
            "\n"   => 'Unix LF',
            ''     => '(none)',
        },
    },
    ByteOrderMark => { PrintConv => { 0 => 'No', 1 => 'Yes' } },
    LineCount => { },
    WordCount => { },
    Delimiter => { PrintConv => { '' => '(none)', ',' => 'Comma', ';' => 'Semicolon', "\t" => 'Tab' }},
    Quoting   => { PrintConv => { '' => '(none)', '"' => 'Double quotes', "'" => 'Single quotes' }},
    RowCount  => { },
    ColumnCount => { },
);

#------------------------------------------------------------------------------
# Extract some stats from a text file
# Inputs: 0) ExifTool ref, 1) dirInfo ref
# Returns: 1 on success, 0 if this wasn't a text file
# Notes: - uses TestBuff and RAF from dirInfo
#        - generates MIMEEncoding, then (unless FastScan is 3 or the file
#          can't be rewound) ByteOrderMark and Newlines
#        - with FastScan off and an 8-bit encoding, also generates
#          Delimiter/Quoting/ColumnCount/RowCount for CSV files, or
#          LineCount/WordCount for other text files
sub ProcessTXT($$)
{
    my ($et, $dirInfo) = @_;
    my $dataPt = $$dirInfo{TestBuff};
    my $raf = $$dirInfo{RAF};
    my $fast = $et->Options('FastScan') || 0;
    my ($buff, $enc, $isBOM, $isUTF8);
    my $nl = '';

    return 0 unless length $$dataPt; # can't call it a text file if it has no text

    # read more from the file if necessary
    # (a full test buffer means there may be more data in the file)
    if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
        $raf->Read($buff, 65536) or return 0;
        $dataPt = \$buff;
    }
#
# make our best guess at the character encoding (EBCDIC is not supported)
#
    if ($$dataPt =~ /([\0-\x06\x0e-\x1a\x1c-\x1f\x7f])/) {
        # file contains weird control characters, could be multi-byte Unicode
        # (test the 4-byte UTF-32 BOM's first because the UTF-32LE BOM
        #  begins with the same 2 bytes as the UTF-16LE BOM)
        if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
            if ($1 eq "\xff\xfe\0\0") {
                $enc = 'utf-32le';
                $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
            } else {
                $enc = 'utf-32be';
                $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
            }
        } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
            if ($1 eq "\xff\xfe") {
                $enc = 'utf-16le';
                $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
            } else {
                $enc = 'utf-16be';
                $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
            }
        } else {
            return 0;       # probably not a text file
        }
        $nl =~ tr/\0//d;    # remove nulls from newline sequence
        $isBOM = 1;         # (we don't recognize UTF-16/UTF-32 without one)
    } else {
        # 8-bit data:  IsUTF8 result of 0 is taken as plain ASCII, positive
        # as UTF-8, and negative as some other 8-bit encoding
        # ($isUTF8 stays undefined for the multi-byte encodings above, which
        #  is what disables the statistics later on)
        $isUTF8 = Image::ExifTool::IsUTF8($dataPt, 1);
        if ($isUTF8 == 0) {
            $enc = 'us-ascii';
        } elsif ($isUTF8 > 0) {
            $enc = 'utf-8';
            $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
        } elsif ($$dataPt !~ /[\x80-\x9f]/) {
            $enc = 'iso-8859-1';    # (no bytes in the 0x80-0x9f range)
        } else {
            $enc = 'unknown-8bit';
        }
        $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
    }

    my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');

    $et->SetFileType();
    $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);

    # nothing more to do for -fast3, or if we can't rewind the file
    return 1 if $fast == 3 or not $raf->Seek(0,0);

    $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
    $et->HandleTag($tagTablePtr, Newlines => $nl);

    # stats are generated only for 8-bit encodings and when FastScan is off
    return 1 if $fast or not defined $isUTF8;
#
# generate stats for CSV files
#
    if ($$et{FileType} eq 'CSV') {
        my ($delim, $quot, $ncols);
        my $nrows = 0;
        # NOTE(review): rows are read using the caller's $/ here, not the
        # detected $nl as in the line-counting loop below, so a CR-only file
        # would presumably be read as a single row -- confirm this is intended
        while ($raf->ReadLine($buff)) {
            if (not defined $delim) {
                # first row: the most frequent candidate character wins
                # (ties fall through in the order comma, semicolon, tab)
                my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
                ++$count{$_} foreach $buff =~ /[,;\t]/g;
                if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
                    $delim = ',';
                } elsif ($count{';'} > $count{"\t"}) {
                    $delim = ';';
                } elsif ($count{"\t"}) {
                    $delim = "\t";
                } else {
                    $delim = '';    # no delimiter found: single column
                    $ncols = 1;
                }
                unless ($ncols) {
                    # account for delimiters in quotes (simplistically)
                    while ($buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg) {
                        $quot = $2;
                        my $field = $3;
                        # subtract the number of delimiters inside this field
                        $count{$delim} -= () = $field =~ /$delim/g;
                    }
                    $ncols = $count{$delim} + 1;
                }
            } elsif (not $quot) {
                # keep looking for a quoted field in later rows
                $quot = $2 if $buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg;
            }
            # stop at 1000 rows if the minor warning is issued
            # (presumably Warn returns false when minor warnings are ignored)
            if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
                undef $nrows;   # (the count is incomplete, so don't report it)
                last;
            }
        }
        $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
        $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
        $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
        $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
        return 1;
    }
    # don't scan the whole of a very large file if the minor warning is issued
    return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
        $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
#
# count lines/words and check encoding of the rest of the file
#
    my ($lines, $words) = (0, 0);
    {
        # read lines using the detected newline sequence (if any)
        # ("local" restores $/ on leaving this block, however it is left)
        local $/ = ($nl ? $nl : $/);
        while ($raf->ReadLine($buff)) {
            ++$lines;
            ++$words while $buff =~ /\S+/g;
            if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
                # (the first line must have been longer than 64 kB)
                $$et{VALUE}{Newlines} = $nl = $1;
            }
            # (the encoding of the first 64 kB was already checked above)
            next if $raf->Tell() < 65536;
            # continue to check encoding after the first 64 kB
            if ($isUTF8 >= 0) { # (if ascii or utf8)
                $isUTF8 = Image::ExifTool::IsUTF8(\$buff);
                if ($isUTF8 > 0) {
                    $enc = 'utf-8';
                } elsif ($isUTF8 < 0) {
                    $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
                }
            } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
                $enc = 'unknown-8bit';
            }
        }
    }
    # override the encoding already extracted if the full scan changed it
    if ($$et{VALUE}{MIMEEncoding} ne $enc) {
        $$et{VALUE}{MIMEEncoding} = $enc;
        $et->VPrint(0,"  MIMEEncoding [override] = $enc\n");
    }
    $et->HandleTag($tagTablePtr, LineCount => $lines);
    $et->HandleTag($tagTablePtr, WordCount => $words);
    return 1;
}


1;  # end

__END__

=head1 NAME

Image::ExifTool::Text - Read Text meta information

=head1 SYNOPSIS

This module is used by Image::ExifTool

=head1 DESCRIPTION

This module contains definitions required by Image::ExifTool to deduce some
characteristics of TXT and CSV files.

=head1 AUTHOR

Copyright 2003-2023, Phil Harvey (philharvey66 at gmail.com)

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 REFERENCES

=over 4

=item L<https://github.com/file/file>

=back

=head1 SEE ALSO

L<Image::ExifTool::TagNames/Text Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>

=cut