diff options
Diffstat (limited to 'src/testpattern/compress-checksums.in')
-rw-r--r-- | src/testpattern/compress-checksums.in | 617 |
1 file changed, 524 insertions, 93 deletions
diff --git a/src/testpattern/compress-checksums.in b/src/testpattern/compress-checksums.in index 19af2f1..97d3574 100644 --- a/src/testpattern/compress-checksums.in +++ b/src/testpattern/compress-checksums.in @@ -1,5 +1,5 @@ #!@PERL@ -## Copyright (C) 2016 Robert Krawitz +## Copyright (C) 2016-2017 Robert Krawitz ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -12,52 +12,225 @@ ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +## along with this program. If not, see <https://www.gnu.org/licenses/>. + +# Compress Gutenprint regression test results +# +# The Gutenprint regression test uses run-testpattern-2 to generate +# hashes for each test case it runs. It currently runs about 370,000 +# cases and generates 75 MB of output (if we used a less...verbose +# hash than SHA-512, we could save a fair bit of that, but it would +# still be something in the range of 40 MB). This is too big to store +# for very long, or to store in the repo. +# +# But there's a lot of compression we can do, some of it lossy and +# some lossless, to greatly reduce the data volume. Obviously, we can +# use conventional lossless compression (such as bzip2), but by +# understanding what we're trying to accomplish we can do much better. +# +# The three important points are: +# +# 1) There's a lot of duplication (a lot of test cases generate the +# same output). Some of this is because a lot of printers actually +# use the same driver, some of it is because in some cases +# different options don't affect the output, and some of it is +# because under many conditions different drivers generate the same +# output. 
Of the 369195 test cases as of 5.3.0-pre1, there are +# only 17701 distinct outputs, as measured by counting the number +# of distinct hash values. +# +# 2) We don't need the entire hash value, only enough to determine +# with reasonable reliability that a particular test case's output +# has not changed between runs. If we keep, say, 32 bits of hash +# value, we're going to miss a change in output only once in every +# 4 billion cases. For this purpose, that's good enough. So we +# keep only enough hash to distinguish between all of the hash +# values in the test run. That's usually between 7 and 9 bytes +# (28-36 bits). If we miss a change in output once in 256 million +# cases, that should be OK. And for that matter, if we sort the +# hashes, we only need to keep the delta between adjacent hash +# value; leading hex digits in common can be omitted. +# +# 3) We don't care about preserving the order of the test runs. It +# doesn't matter at all if the test runs show up in different +# order; all we need to do is match them up case by case to see +# what has changed. The cases are all independent. +# +# We also know about the structure of the test data. Each test result +# is stored as <hash, name>. The names themselves are structured as +# <printer_option0_value0_option1_value1...>. Since a lot of +# components are preserved from case to case, we can do a lot of +# common component elimination and prefix/suffix removal. +# +# It turns out that we can achieve better compression by putting the +# printer name on the end of the case rather than the beginning, since +# in many cases many printers share the same information. The +# original format puts the printer name first, but no reason we can't +# reorder it for storage, as long as we can undo it later when we +# compare the results. +# +# Surprisingly (perhaps) after all of these machinations the data +# still doesn't have lot of entropy; bzip2 compresses even our best +# work by 90%. 
And bzip is far from the best compression out there. +# The best that I have found is zpaq +# (http://mattmahoney.net/dc/zpaq.html). It's very slow but very, +# very effective. The zpaq utility is set up for archiving rather +# than file compression, but that's nothing that a little scripting +# can't get around. For our test data, zpaq gets another 35-45%, +# depending upon compression level (initial completely uncompressed +# data is another story; zpaq is about 70% better!). + +# Compression levels (cumulative): +# +# 0 - none +# 1 - elide duplicate hashes (LOSSY -- reorders data) +# 2 - use global minimum hash length prefix (LOSSY -- reduces hash data) +# 3 - eliminate common second level groups (run encoding) +# 4 - use pairwise minimum hash length prefix +# 5 - use substitution table for common option names +# 6 - use pre-substitution table for certain value names (LOSSY -- reorder) +# 7 - common prefix/suffix elimination for chunks +# - pack multiple consecutive identical chunks +# - eliminate BW/color prefix +# 8 - replace space (inter-record gaps) with high bit set on last byte +# 9 - compact subexpression elimination (7a) + +# There are 9 defined compression levels to match usage of other +# compression programs. 
+ +# 369195 test cases in the sample run +# CPU: Intel Xeon E3-1505M, Lenovo P70 + +# Method Size Packed Time Ratio Packed bzip2 Bits/case +# 0 75512435 1970522 0.08 1.0 38.3 6998640 42.69 +# 1 11066295 1316628 2.82 6.8 57.3 1620995 28.52 +# 2 8924474 228519 2.94 8.4 330.4 453281 4.95 +# 3 5925516 216037 3.16 11.3 349.5 383805 4.68 +# 4 5874446 205435 3.22 11.4 367.5 354349 4.45 +# 5 5306533 195594 3.61 14.2 386.0 316775 4.24 +# 6 4788102 182921 3.88 15.7 412.8 290248 3.96 +# 7b 3266449 180295 23.1 417.3 221237 3.91 +# 7a 2401276 126804 31.4 548.7 200778 2.75 +# 7 2312771 126487 4.67 32.6 595.5 200371 2.74 +# 8 2110942 122762 5.12 35.7 615.1 197529 2.66 +# 9 1891341 121970 5.30 39.9 619.1 187040 2.64 + +# Currently reserved special characters: +# +# ( ( ) & ? ) reserved for level 3. +# ; is reserved for level 5. +# Braces {} are reserved for level 6. +# (* + \ /) are reserved for level 7. +# (= ! " , ' % >) are reserved for level 7 (7a) +# High bit set is reserved for level 8. +# Non-printable ASCII (0-31) is reserved for level 9. +# (- _) are used as parts of tokens and token separators. +# | is reserved for use as a temporary. + +# Non-alphanumeric characters currently unreserved are: +# ` ~ @ # $ ^ [ ] < use strict; +use Getopt::Long; +Getopt::Long::Configure("bundling", "no_ignore_case", "pass_through"); + +my ($comp_level) = 9; my (%checksums); -while (<>) { - chomp; - my ($sum, @keys) = split; - my (@pchunks) = (); - foreach my $k (@keys) { - my (@chunks) = split(/_/, $k, -1); - foreach my $i (0..$#chunks) { - if ($chunks[$i] eq '') { - $chunks[$i] = $pchunks[$i]; - } elsif ($chunks[$i] =~ /^([0-9]+)\+(.*)/) { - $chunks[$i] = substr($pchunks[$i], 0, $1) . $2; - } - } - my $key = join('_', @chunks), '.prn'; - if (! 
defined $checksums{$sum}) { - $checksums{$sum} = []; - } - push @{$checksums{$sum}}, $key; - } +my (@ptable) = + (["=\\+", pack("C", 27)], + ["!\\+", pack("C", 26)], + ['"\+', pack("C", 25)], + [",\\+", pack("C", 24)], + ["'\\+", pack("C", 23)], + ["%\\+", pack("C", 22)], + [">\\+", pack("C", 21)], + ["20\\+", pack("C", 20)], + ["19\\+", pack("C", 19)], + ["18\\+", pack("C", 18)], + ["17\\+", pack("C", 17)], + ["16\\+", pack("C", 16)], + ["15\\+", pack("C", 15)], + ["14\\+", pack("C", 14)], + ["13\\+", pack("C", 13)], + ["12\\+", pack("C", 12)], + ["11\\+", pack("C", 11)], + ["10\\+", pack("C", 9)], # 0xa is newline! + ["9\\+", pack("C", 8)], + ["8\\+", pack("C", 7)], + ["7\\+", pack("C", 6)], + ["6\\+", pack("C", 5)], + ["5\\+", pack("C", 4)], + ["4\\+", pack("C", 3)], + ["3\\+", pack("C", 2)], + ["2\\+", pack("C", 1)]); + +my (%asc_table); +map { $asc_table{pack("C", $_)} = pack("C", $_ + 128); } (0..127); + +my $psub_code = <<\EOF; +sub psub($) { + my ($out) = @_; +EOF +foreach my $psub (@ptable) { + $psub_code .= " \$out =~ s/$$psub[0]/$$psub[1]/go;\n"; +} +$psub_code .= <<\EOF; + return $out; +} +EOF + +eval $psub_code; + +my (%sub_table) = ( + "PrintingMode" => "P;", + "PageSize" => "S;", + "DitherAlgorithm" => "Z;", + "Resolution" => "R;", + "ColorCorrection" => "C;", + "Color" => "c;", + "MediaType" => "M;", + "InputSlot" => "I;" + ); + +my ($spc) = unpack("C", " "); + +sub precompress($) { + my ($run) = @_; + # This one really doesn't belong here. This is to work around the + # (very domain-specifid) fact that most of the runs specify a dither + # algorithm, except for the ones testing the dither algorithms per se. + # The common group elimination code needs to have everything to the right + # of the second option to be fixed except for the printer name. 
+ # If dither algorithm sorts between other options, it often + # breaks up a run of option names, since if dither algorithm is the + # key option it won't appear to the right, breaking up a run of fixed + # right hand components. + # + # This has a significant effect on compression; at level 10 the + # 5.3.0-pre1 test dataset consumes 1938695 bytes without this + # hack, but 1891341 with it -- about 2.5% savings. The zpaq archive + # is 123645 bytes without and 121970 with, or 1.3%. + $run =~ s/DitherAlgorithm/Z;/go; + $run =~s/600x600dpi_(draft|high|mono|photo|std|tshirt)([^?&_]*)/{600|$1|$2}/go; + $run =~s/720x720dpi_(draft|high|mono|photo|std|tshirt)([^?&_]*)/{720|$1|$2}/go; + $run =~s/300x300dpi_(draft|high|mono|photo|std|tshirt)([^?&_]*)/{300|$1|$2}/go; + $run =~s/360x360dpi_(draft|high|mono|photo|std|tshirt)([^?&_]*)/{360|$1|$2}/go; + $run =~s/\{([0-9]*)\|([^|])[^|]*\|([^?&_]*)\}/{$1$2$3}/go; + return $run; } sub find_min_nonunique_prefix(@) { my (@sums) = sort @_; - my ($last) = ""; - my ($max_prefix) = 0; - my ($lc) = 0; + my ($last) = shift @sums; + my ($maxlen) = length $last; + my ($max_prefix) = 1; foreach my $sum (@sums) { - $lc++; - if ($last eq "") { - $last = $sum; - next; - } - foreach my $l ($max_prefix..length $sum) { - if (substr($sum, 0, $l) eq substr($last, 0, $l)) { - $max_prefix++; - } else { - next; - } + foreach my $l ($max_prefix..$maxlen) { + last if (substr($sum, 0, $l) ne substr($last, 0, $l)); + $max_prefix++; } $last = $sum; } @@ -85,32 +258,216 @@ sub get_mode($) { return $mode; } -my $min_prefix = find_min_nonunique_prefix(keys %checksums); - -my ($last_sum) = ""; +sub scan_next_block($$$$$$$) { + my ($runs, $printers, $start, $blocksize, $ovalue, $rest, $ov1) = @_; + my ($limit) = scalar @$runs; + # Don't have enough runs left + if ($start + $blocksize > $limit) { + return ""; + } + my ($run0) = $$runs[$start]; + $run0 =~ /^([^:]+):(.*)$/o; + my ($data) = $1; + my ($printer) = $2; + # Printer does not match the first printer on 
the list. + if ($printer ne $$printers[0]) { + return ""; + } + my (@elts) = split(/_/, $data); + my ($v1) = $elts[1]; + if ($v1 ne $ov1) { + return ""; + } + my ($option) = $elts[2]; + my ($value) = $elts[3]; + if ($ovalue eq $value) { + return ""; + } + my ($nrest) = join("", @elts[4..$#elts]); + if ($nrest ne $rest) { + return ""; + } + # Different option + my $i; + for ($i = 1; $i < $blocksize; $i++) { + my ($run) = $$runs[$start + $i]; + $run =~ /^([^:]+):(.*)$/o; + $data = $1; + $printer = $2; + # Different printer that what we expect! + if ($printer ne $$printers[$i]) { + return ""; + } + @elts = split(/_/, $data); + my ($nv1) = $elts[1]; + my $noption = $elts[2]; + if ($nv1 ne $v1) { + return ""; + } + # Different option than what we expect. + if ($noption ne $option) { + return ""; + } + my ($nvalue) = $elts[3]; + # ...or different value from what we expect + if ($nvalue ne $value) { + return ""; + } + # ...or something else different, perchance? + $nrest = join("", @elts[4..$#elts]); + if ($nrest ne $rest) { + return ""; + } + } + return [$option, $value]; +} -my (%sub_table); -$sub_table{"PrintingMode"} = "P;"; -$sub_table{"PageSize"} = "S;"; -$sub_table{"DitherAlgorithm"} = "D;"; -$sub_table{"Resolution"} = "R;"; -$sub_table{"ColorCorrection"} = "C;"; -$sub_table{"Color"} = "c;"; -$sub_table{"MediaType"} = "M;"; -$sub_table{"InputSlot"} = "I;"; - -sub compress_chunk($) { - my ($chunk) = @_; - if (defined $sub_table{$chunk}) { - return $sub_table{$chunk}; +# If we find a match, return the data: glommed-together option string, +# size of the block, and number of values. +# +# If we don't find a match, return how far we searched without finding +# a match, so that the compressor doesn't spend a lot of time searching +# for a match where there isn't going to be one. 
+sub scan_for_block($$) { + my ($runs, $start) = @_; + my ($first_printer); + my ($idx) = $start; + my ($limit) = scalar @$runs; + # Runs are now of the form + # PrintingMode_XX_OPT1_YY...:printer + # First scan pass: look for the first change of YY while OPT1 remains + # the same. The printer should be the same as the first printer we + # find. + my ($run0) = $$runs[$start]; + $run0 =~ /^([^:]+):(.*)$/o; + my ($data) = $1; + my ($printer) = $2; + my (@elts) = split(/_/, $data); + my ($v1) = $elts[1]; + my ($option) = $elts[2]; + my ($value) = $elts[3]; + my ($rest) = join("", @elts[4..$#elts]); + my (@printers) = ($printer); + my (@values) = ([$option, $value]); + $idx++; + while ($idx < $limit) { + my ($run) = $$runs[$idx]; + $run =~ /^([^:]+):(.*)$/o; + $data = $1; + $printer = $2; + @elts = split(/_/, $data); + my ($nv1) = $elts[1]; + my ($noption) = $elts[2]; + my ($nvalue) = $elts[3]; + if ($v1 ne $nv1) { + return ("", 0, $idx); + } + # Different option name -- don't have a match. + if ($noption ne $option) { + return ("", 0, $idx); + } + my ($nrest) = join("", @elts[4..$#elts]); + if ($nrest ne $rest) { + return ("", 0, $idx); + } + if ($printer ne $printers[0]) { + if ($nvalue ne $value) { + # Different option value, but not the same printer -- don't have + # a match. + return ("", 0, $idx); + } else { + # Same option value, new printer + push @printers, $printer; + $idx++; + } + } else { + if ($nvalue eq $value) { + return ("", 0, $idx); + } + # New option value but same printer as first -- found a block! + last; + } + } + my ($blocksize) = $#printers + 1; + if ($blocksize < 2) { + return ( "", 0, $idx ); + } + my $ovalue = ""; + while ($idx < $limit) { + $value = scan_next_block($runs, \@printers, $idx, $blocksize, $ovalue, + $rest, $v1); + if (ref $value eq "ARRAY") { + push @values, $value; + $ovalue = $value->[1]; + $idx += $blocksize; + } else { + # End of the line. 
+ if ($#values > 0) { + last; + } else { + return ( "", 0, $idx ); + } + } + } + if ($#values > 0) { + my (@optstrs); + my (@valstrs); + my ($last_option) = ""; + foreach my $item (@values) { + my ($opt) = $item->[0]; + my ($val) = $item->[1]; + if ($last_option ne $opt) { + if ($last_option ne "") { + if ($comp_level >= 5 && defined $sub_table{$last_option}) { + $last_option = $sub_table{$last_option}; + } + push @optstrs, "${last_option}_" . join("&", @valstrs); + } + $last_option = $opt; + @valstrs = (); + } + push @valstrs, $val; + } + if ($comp_level >= 5 && defined $sub_table{$last_option}) { + $last_option = $sub_table{$last_option}; + } + push @optstrs, "${last_option}_" . join("&", @valstrs);; + my ($valstr) = join("?", @optstrs); + return ( $valstr, $blocksize, $#values + 1 ); } else { - return $chunk; + return ( "", 0, $idx ); } } +GetOptions("l:i" => \$comp_level); + +if ($comp_level <= 0) { + print while (<>); + exit; +} + +while (<>) { + chomp; + my ($sum, $key) = split; + if (! 
defined $checksums{$sum}) { + $checksums{$sum} = []; + } + push @{$checksums{$sum}}, $key; +} + +my $min_prefix; + +if ($comp_level >= 2) { + $min_prefix = find_min_nonunique_prefix(keys %checksums); +} else { # $comp_level == 1 + $min_prefix = length((keys %checksums)[0]); +} + +my ($last_sum) = ""; + foreach my $sum (sort keys %checksums) { my ($out) = substr($sum, 0, $min_prefix); - if ($last_sum ne "") { + if ($last_sum ne "" && $comp_level >= 4) { $out = substr($out, find_min_nonunique_prefix($sum, $last_sum) - 1); } $last_sum = $sum; @@ -122,66 +479,140 @@ foreach my $sum (sort keys %checksums) { s/\.prn$//; my ($printer) = get_printer($_); my ($mode) = get_mode($_); + if ($comp_level >= 6) { + $mode = precompress($mode); + } "${mode}_:$printer" } @{$checksums{$sum}}; - foreach my $run (@runs) { + my ($lastdup) = 0; + my ($valstr) = ""; + my ($blocksize) = 0; + my ($valcount) = 0; + my ($stop_at) = -1; + my ($idx); + my ($found_block_first_time) = 0; + my ($next_check) = 0; + while ($idx <= $#runs) { + my $run = $runs[$idx]; my (@chunks) = split(/[_]/, $run); - @chunks = map { compress_chunk($_) } @chunks; + if ($comp_level >= 3 && $stop_at < 0 && + $idx >= $next_check && + $#runs - $idx > 2 && + $pchunks[2] ne $chunks[2]) { + ($valstr, $blocksize, $valcount) = scan_for_block(\@runs, $idx); + if ($blocksize > 0) { + $chunks[2] = ""; + $chunks[3] = $valstr; + $stop_at = $idx + $blocksize; + $found_block_first_time = 1; + } else { + $next_check = $valcount; + } + } + if ($comp_level >= 5) { + map { + if (defined $sub_table{$chunks[$_]}) { + $chunks[$_] = $sub_table{$chunks[$_]}; + } + } (0..$#chunks); + } my ($mchunks) = min(scalar @pchunks, scalar @chunks); my ($i) = 0; $out .= " "; + if ($stop_at > 0) { + if ($found_block_first_time) { + map { $pchunks[$_] = "" } (2..$#pchunks); + $chunks[2] = "($chunks[2]"; + $chunks[3] = $valstr; + } else { + $chunks[2] = ""; + $chunks[3] = ""; + } + } + while ($i < $mchunks) { if ($i > 0) { $out .= '_'; } if ($chunks[$i] 
ne $pchunks[$i]) { - my ($plen) = length $pchunks[$i]; - my ($len) = length $chunks[$i]; - my ($maxc) = min($len, $plen); - my ($dup) = 0; - my ($tail) = 0; - while ($dup < $maxc) { - last if (substr($pchunks[$i], $dup, 1) ne - substr($chunks[$i], $dup, 1)); - $dup++; - } - while ($tail < $maxc - $dup) { - last if (substr($pchunks[$i], -1 - $tail, 1) ne - substr($chunks[$i], -1 - $tail, 1)); - $tail++; - } - my ($chunk) = $chunks[$i]; - if ($tail == 2) { - $chunk = substr($chunk, 0, -$tail) . '\\'; - } elsif ($tail > 2) { - $chunk = substr($chunk, 0, -$tail) . "/$tail"; - } - if ($dup == 2) { - $out .= '*' . substr($chunk, $dup); - } elsif ($dup > 2) { - $out .= "$dup+" . substr($chunk, $dup); + if ($comp_level >= 7) { + my ($plen) = length $pchunks[$i]; + my ($len) = length $chunks[$i]; + my ($maxc) = $len; + $maxc = $plen if $plen< $len; + my ($dup) = 0; + my ($chunk) = $chunks[$i]; + if (substr($pchunks[$i], 0, 1) ne '(') { + while ($dup < $maxc) { + last if (substr($pchunks[$i], $dup, 1) ne + substr($chunks[$i], $dup, 1)); + $dup++; + } + if (! $found_block_first_time) { + my ($tail) = 0; + while ($tail < $maxc - $dup) { + last if (substr($pchunks[$i], -1 - $tail, 1) ne + substr($chunks[$i], -1 - $tail, 1)); + $tail++; + } + if ($tail == 2) { + $chunk = substr($chunk, 0, -$tail) . '\\'; + } elsif ($tail > 2) { + $chunk = substr($chunk, 0, -$tail) . "/$tail"; + } + } + } + if ($dup == 2) { + $out .= '*' . substr($chunk, $dup); + } elsif ($dup > 2) { + if ($dup == $lastdup) { + $out .= "+" . substr($chunk, $dup); + } else { + $out .= "$dup+" . 
substr($chunk, $dup); + } + } else { + $out .= "$chunk"; + } + $lastdup = $dup; } else { - $out .= "$chunk"; + $out .= $chunks[$i]; } - } else { - $out .= ""; } $i++; } + $found_block_first_time = 0; + @pchunks = @chunks; if ($i < scalar @chunks) { if ($i > 0) { $out .= '_'; } $out .= join("_", @chunks[$i..$#chunks]); } - $out =~ s/________/=/g; - $out =~ s/_______/!/g; - $out =~ s/______/"/g; - $out =~ s/_____/,/g; - $out =~ s/____/'/g; - $out =~ s/___/%/g; - $out =~ s/__/>/g; - @pchunks = @chunks; + $idx++; + if ($blocksize > 0 && $idx == $stop_at) { + $out .= ")"; + $idx += ($blocksize * ($valcount - 1)); + $stop_at = -1; + } + } + $out =~ s/\(_/(/g; + if ($comp_level >= 7) { + $out =~ s/________/=/go; + $out =~ s/_______/!/go; + $out =~ s/______/"/go; + $out =~ s/_____/,/go; + $out =~ s/____/'/go; + $out =~ s/___/%/go; + $out =~ s/__/>/go; + $out =~ s/^([0-9a-f]+ )P;_BW_/$1./; + $out =~ s/^([0-9a-f]+ )P;_c;_/$1,/; + if ($comp_level >= 8) { + $out =~ s/(.) /$asc_table{$1}/g; + if ($comp_level >= 9) { + $out = psub($out); + } + } } + $out =~ s/\|$//; print "$out\n"; } |