summary | refs | log | tree | commit | diff
path: root/CleanTxt
diff options
context:
space:
mode:
Diffstat (limited to 'CleanTxt')
-rwxr-xr-x CleanTxt 113
1 files changed, 113 insertions, 0 deletions
diff --git a/CleanTxt b/CleanTxt
new file mode 100755
index 0000000..1f42519
--- /dev/null
+++ b/CleanTxt
@@ -0,0 +1,113 @@
+#! /usr/bin/perl -w
+
+# Script to take the output of nroff -man and remove all the backspacing and
+# the page footers and the screen commands etc so that it is more usefully
+# readable online. In fact, in the latest nroff, intermediate footers don't
+# seem to be generated any more.
+
+use strict;
+
+my $blankcount = 0;         # Blank lines seen but not yet output
+my $lastwascut = 0;         # Set when a page footer chunk was just removed
+my $firstheader = 1;        # Set until the first header line is printed
+my ($lastprinted, @next);   # Last non-blank line output; look-ahead lines
+
+# Input on STDIN; output to STDOUT.
+while (<STDIN>)
+  {
+  s/\x1b\[\d+m//g;          # Remove screen controls "ESC [ number m"
+  s/.\x8//g;                # Remove "char, backspace"
+
+  # Handle header lines. Retain only the first one we encounter, but remove
+  # the line that follows it (expected to be blank). Any other headers (e.g.
+  # at end of document) are dropped along with the line that follows them.
+
+  if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
+    {
+    if ($firstheader)
+      {
+      $firstheader = 0;
+      print;
+      $lastprinted = $_;
+      $lastwascut = 0;
+      }
+    $_=<STDIN>;             # Read and discard the line after the header
+    next;
+    }
+
+  # Count runs of empty lines; they are output later, or cut as a footer.
+  if (/^\s*$/)
+    {
+    $blankcount++;
+    $lastwascut = 0;
+    next;
+    }
+
+  # If a chunk of lines has been cut out (page footer) and the next line
+  # has a different indentation, put back one blank line.
+
+  if ($lastwascut && $blankcount < 1 && defined($lastprinted))
+    {
+    my ($lastindent) = $lastprinted =~ /^(\s*)/;
+    my ($thisindent) = $_ =~ /^(\s*)/;
+    $blankcount++ if ($lastindent ne $thisindent);
+    }
+
+  # We get here only when we have a non-blank line in hand. If it was preceded
+  # by 3 or more blank lines, read the next 3 lines and see if they are blank.
+  # If so, remove all 7 lines, and remember that we have just done a cut.
+
+  if ($blankcount >= 3)
+    {
+    for my $i (0 .. 2)
+      {
+      $next[$i] = <STDIN>;
+      $next[$i] = "" if !defined $next[$i];     # End of input: use empty
+      $next[$i] =~ s/\x1b\[\d+m//g;   # Remove screen controls "ESC [ number m"
+      $next[$i] =~ s/.\x8//g;         # Remove "char, backspace"
+      }
+
+    # Cut out chunks of the form <3 blanks><non-blank><3 blanks>
+    if ($next[0] =~ /^\s*$/ &&
+        $next[1] =~ /^\s*$/ &&
+        $next[2] =~ /^\s*$/)
+      {
+      $blankcount -= 3;
+      $lastwascut = 1;
+      }
+
+    # Otherwise output the saved blanks, the current, and the next three
+    # lines. Remember the last non-blank line that was actually printed.
+
+    else
+      {
+      print "\n" x $blankcount;
+      print;
+      $lastprinted = $_;
+      for my $i (0 .. 2)
+        {
+        print $next[$i];
+        $lastprinted = $next[$i] if $next[$i] =~ /\S/;
+        }
+      $lastwascut = 0;
+      $blankcount = 0;
+      }
+    }
+
+  # This non-blank line is not preceded by 3 or more blank lines. Output
+  # any blanks there are, and the line. Remember it. Force two blank lines
+  # before headings (lines starting in column 1, once something is printed).
+
+  else
+    {
+    $blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
+      defined($lastprinted);
+    print "\n" x $blankcount;
+    print;
+    $lastprinted = $_;
+    $lastwascut = 0;
+    $blankcount = 0;
+    }
+  }
+
+# End