Biffing for unknown charsets is now forced to ASCII.

Output is converted into ASCII characters, with control characters swallowed silently and non-ASCII characters dumped as '.'. This allows biffing to occur even when the body of the message is encoded with an unknown charset, the assumption being that ASCII is a firm common ground for all such charsets!
author: Raphael Manfredi <Raphael_Manfredi@pobox.com> 2020-04-08 10:16:01 +0200
committer: Raphael Manfredi <Raphael_Manfredi@pobox.com> 2020-04-09 16:16:46 +0200
commit: 9a14a1512556db68543bfeebac2138ce1be0b0fd (patch)
tree: 07b18f90e51052742f61a23a542f97fe9d17e7f2
parent: 3b6d126355db75f9f5015175cefd0819b8bff131 (diff)
1 files changed, 26 insertions, 2 deletions
diff --git a/agent/pl/biff.pl b/agent/pl/biff.pl
index 6346440..f48c971 100644
--- a/agent/pl/biff.pl
+++ b/agent/pl/biff.pl
@@ -267,6 +267,16 @@ sub is_blank {
 	return $l =~ /^[\W_]*$/;	# Contains only non-words and underscores
 }
 
+# Keep only printable ASCII chars from biffable lines in specified body array
+# Control chars are swallowed, non-ASCII chars converted to '.'.
+sub to_ascii {
+	my ($aref, $lines) = @_;	# Body as array ref, amount of lines to convert
+	my $n = $lines > @{$aref} ? @{$aref} : $lines;
+	for (my $i = 0; $i < $n; $i++) {
+		$aref->[$i] =~ s/(.)/mangle_ascii($1)/ge;
+	}
+}
+
 # Print first $cf'bifflines lines or $cf'bifflen charaters, whichever
 # comes first. Assumes TTY already opened correctly
 # Also known as the %-B macro if called body(0), or %-T if called body(1).
@@ -285,6 +295,7 @@ sub body {
 	my ($content, $entity, $enc, $biffenc);
 	($content, $entity) = unmime(\@body) if $'Header{'Mime-Version'};
 
+	my $convert_to_ascii = 0;
 	if (length($content)) {
 		&'add_log("biffing $entity entity is $content") if $'loglvl > 8;
 		my $charset;
@@ -292,9 +303,9 @@ sub body {
 		if (defined $charset) {
 			$enc = Encode::find_encoding($charset);
 			unless (ref $enc) {
-				&'add_log("WARNING unknown charset '$charset', no body shown")
+				&'add_log("WARNING unknown charset '$charset', handling as ASCII")
 					if $'loglvl > 1;
-				@body = ("[body hidden: unknown charset '$charset']");
+				$convert_to_ascii = 1;
 			}
 
 			# If the encoding is the same as the one used in the terminal,
@@ -313,6 +324,7 @@ sub body {
 
 	strip_html(\@body) if $content =~ /html\b/;
 	&trim(*body) if $trim;		# Smart trim of leading reply text
+	to_ascii(\@body, $lines) if $convert_to_ascii;
 	&mh(*body, $len) if $cf'biffmh =~ /^on/i;
 
 	my $reformat = $cf'biffnice =~ /^on/i;
@@ -518,6 +530,18 @@ sub format {
 
 # Perload OFF
 
+# Mangle given character to ASCII, or swallow it if CTRL char
+# MUST NOT be dataloaded (would mess $1 in the regexp)
+sub mangle_ascii {
+	my ($x) = @_;
+	my $c = unpack("U", $x);				# Read as Unicode
+	return '' if $c <= 8;					# Invisible
+	# Chars 9 and 10 are \t and \n in ASCII
+	return '' if $c >= 11 && $c < 32;		# Invisible
+	return '.' if $c >= 127;				# Outside the ASCII range
+	return pack("C", $c);					# Write as a byte (ASCII)
+}
+
 # Quoted-printable decoder
 # MUST NOT be dataloaded (would mess $1 in the regexp)
 sub to_txt {
author	Raphael Manfredi <Raphael_Manfredi@pobox.com>	2020-04-08 10:16:01 +0200
committer	Raphael Manfredi <Raphael_Manfredi@pobox.com>	2020-04-09 16:16:46 +0200
commit	9a14a1512556db68543bfeebac2138ce1be0b0fd (patch)
tree	07b18f90e51052742f61a23a542f97fe9d17e7f2
parent	3b6d126355db75f9f5015175cefd0819b8bff131 (diff)