summaryrefslogtreecommitdiff
path: root/src/modules/common/lzsscomprs.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/common/lzsscomprs.cpp')
-rw-r--r--src/modules/common/lzsscomprs.cpp668
1 files changed, 668 insertions, 0 deletions
diff --git a/src/modules/common/lzsscomprs.cpp b/src/modules/common/lzsscomprs.cpp
new file mode 100644
index 0000000..bd8f768
--- /dev/null
+++ b/src/modules/common/lzsscomprs.cpp
@@ -0,0 +1,668 @@
+/******************************************************************************
+ * lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that
+ * provides LZSS compression
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <lzsscomprs.h>
+
+SWORD_NAMESPACE_START
+
+/******************************************************************************
+ * LZSSCompress Statics
+ */
+
+// m_ring_buffer is a text buffer. It contains "nodes" of
+// uncompressed text that can be indexed by position. That is,
+// a substring of the ring buffer can be indexed by a position
+// and a length. When decoding, the compressed text may contain
+// a position in the ring buffer and a count of the number of
+// bytes from the ring buffer that are to be moved into the
+// uncompressed buffer.
+//
+// This ring buffer is not maintained as part of the compressed
+// text. Instead, it is reconstructed dynamically. That is,
+// it starts out empty and gets built as the text is decompressed.
+//
+// The ring buffer contain N bytes, with an additional F - 1 bytes
+// to facilitate string comparison.
+
+unsigned char LZSSCompress::m_ring_buffer[N + F - 1];
+
+// m_match_position and m_match_length are set by InsertNode().
+//
+// These variables indicate the position in the ring buffer
+// and the number of characters at that position that match
+// a given string.
+
+short int LZSSCompress::m_match_position;
+short int LZSSCompress::m_match_length;
+
+// m_lson, m_rson, and m_dad are the Japanese way of referring to
+// a tree structure. The dad is the parent and it has a right and
+// left son (child).
+//
+// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right
+// and left children of node i.
+//
+// For i = 0 to N-1, m_dad[i] is the parent of node i.
+//
+// For i = 0 to 255, rson[N + i + 1] is the root of the tree for
+// strings that begin with the character i. Note that this requires
+// one byte characters.
+//
+// These nodes store values of 0...(N-1). Memory requirements
+// can be reduces by using 2-byte integers instead of full 4-byte
+// integers (for 32-bit applications). Therefore, these are
+// defined as "short ints."
+
+short int LZSSCompress::m_lson[N + 1];
+short int LZSSCompress::m_rson[N + 257];
+short int LZSSCompress::m_dad[N + 1];
+
+
+/******************************************************************************
+ * LZSSCompress Constructor - Initializes data for instance of LZSSCompress
+ *
+ */
+
+LZSSCompress::LZSSCompress() : SWCompress() {
+}
+
+
+/******************************************************************************
+ * LZSSCompress Destructor - Cleans up instance of LZSSCompress
+ */
+
+LZSSCompress::~LZSSCompress() {
+}
+
+
+/******************************************************************************
+ * LZSSCompress::InitTree - This function initializes the tree nodes to
+ * "empty" states.
+ */
+
+void LZSSCompress::InitTree(void) {
+ int i;
+
+ // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right
+ // and left children of node i. These nodes need not be
+ // initialized. However, for debugging purposes, it is nice to
+ // have them initialized. Since this is only used for compression
+ // (not decompression), I don't mind spending the time to do it.
+ //
+ // For the same range of i, m_dad[i] is the parent of node i.
+ // These are initialized to a known value that can represent
+ // a "not used" state.
+
+ for (i = 0; i < N; i++) {
+ m_lson[i] = NOT_USED;
+ m_rson[i] = NOT_USED;
+ m_dad[i] = NOT_USED;
+ }
+
+ // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
+ // for strings that begin with the character i. This is why
+ // the right child array is larger than the left child array.
+ // These are also initialzied to a "not used" state.
+ //
+ // Note that there are 256 of these, one for each of the possible
+ // 256 characters.
+
+ for (i = N + 1; i <= (N + 256); i++) {
+ m_rson[i] = NOT_USED;
+ }
+}
+
+
+/******************************************************************************
+ * LZSSCompress::InsertNode - This function inserts a string from the ring
+ * buffer into one of the trees. It loads the
+ * match position and length member variables
+ * for the longest match.
+ *
+ * The string to be inserted is identified by
+ * the parameter Pos, A full F bytes are
+ * inserted. So,
+ * m_ring_buffer[Pos ... Pos+F-1]
+ * are inserted.
+ *
+ * If the matched length is exactly F, then an
+ * old node is removed in favor of the new one
+ * (because the old one will be deleted
+ * sooner).
+ *
+ * Note that Pos plays a dual role. It is
+ * used as both a position in the ring buffer
+ * and also as a tree node.
+ * m_ring_buffer[Pos] defines a character that
+ * is used to identify a tree node.
+ *
+ * ENT: pos - position in the buffer
+ */
+
+void LZSSCompress::InsertNode(short int Pos)
+{
+ short int i;
+ short int p;
+ int cmp;
+ unsigned char * key;
+
+/*
+ ASSERT(Pos >= 0);
+ ASSERT(Pos < N);
+*/
+
+ cmp = 1;
+ key = &(m_ring_buffer[Pos]);
+
+ // The last 256 entries in m_rson contain the root nodes for
+ // strings that begin with a letter. Get an index for the
+ // first letter in this string.
+
+ p = (short int) (N + 1 + key[0]);
+
+ // Set the left and right tree nodes for this position to "not
+ // used."
+
+ m_lson[Pos] = NOT_USED;
+ m_rson[Pos] = NOT_USED;
+
+ // Haven't matched anything yet.
+
+ m_match_length = 0;
+
+ for ( ; ; ) {
+ if (cmp >= 0) {
+ if (m_rson[p] != NOT_USED) {
+ p = m_rson[p];
+ }
+ else {
+ m_rson[p] = Pos;
+ m_dad[Pos] = p;
+ return;
+ }
+ }
+ else {
+ if (m_lson[p] != NOT_USED) {
+ p = m_lson[p];
+ }
+ else {
+ m_lson[p] = Pos;
+ m_dad[Pos] = p;
+ return;
+ }
+ }
+
+ // Should we go to the right or the left to look for the
+ // next match?
+
+ for (i = 1; i < F; i++) {
+ cmp = key[i] - m_ring_buffer[p + i];
+ if (cmp != 0)
+ break;
+ }
+
+ if (i > m_match_length) {
+ m_match_position = p;
+ m_match_length = i;
+
+ if (i >= F)
+ break;
+ }
+ }
+
+ m_dad[Pos] = m_dad[p];
+ m_lson[Pos] = m_lson[p];
+ m_rson[Pos] = m_rson[p];
+
+ m_dad[ m_lson[p] ] = Pos;
+ m_dad[ m_rson[p] ] = Pos;
+
+ if (m_rson[ m_dad[p] ] == p) {
+ m_rson[ m_dad[p] ] = Pos;
+ }
+ else {
+ m_lson[ m_dad[p] ] = Pos;
+ }
+
+ // Remove "p"
+
+ m_dad[p] = NOT_USED;
+}
+
+
+/******************************************************************************
+ * LZSSCompress::DeleteNode - This function removes the node "Node" from the
+ * tree.
+ *
+ * ENT: node - node to be removed
+ */
+
+void LZSSCompress::DeleteNode(short int Node)
+{
+ short int q;
+
+/*
+ ASSERT(Node >= 0);
+ ASSERT(Node < (N+1));
+*/
+
+ if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do
+ return;
+ }
+
+ if (m_rson[Node] == NOT_USED) {
+ q = m_lson[Node];
+ }
+ else if (m_lson[Node] == NOT_USED) {
+ q = m_rson[Node];
+ }
+ else {
+ q = m_lson[Node];
+ if (m_rson[q] != NOT_USED) {
+ do {
+ q = m_rson[q];
+ } while (m_rson[q] != NOT_USED);
+
+ m_rson[ m_dad[q] ] = m_lson[q];
+ m_dad[ m_lson[q] ] = m_dad[q];
+ m_lson[q] = m_lson[Node];
+ m_dad[ m_lson[Node] ] = q;
+ }
+
+ m_rson[q] = m_rson[Node];
+ m_dad[ m_rson[Node] ] = q;
+ }
+
+ m_dad[q] = m_dad[Node];
+
+ if (m_rson[ m_dad[Node] ] == Node) {
+ m_rson[ m_dad[Node] ] = q;
+ }
+ else {
+ m_lson[ m_dad[Node] ] = q;
+ }
+
+ m_dad[Node] = NOT_USED;
+}
+
+
+/******************************************************************************
+ * LZSSCompress::Encode - This function "encodes" the input stream into the
+ * output stream.
+ * The GetChars() and SendChars() functions are
+ * used to separate this method from the actual
+ * i/o.
+ * NOTE: must set zlen for parent class to know length of
+ * compressed buffer.
+ */
+
+void LZSSCompress::Encode(void)
+{
+ short int i; // an iterator
+ short int r; // node number in the binary tree
+ short int s; // position in the ring buffer
+ unsigned short int len; // len of initial string
+ short int last_match_length; // length of last match
+ short int code_buf_pos; // position in the output buffer
+ unsigned char code_buf[17]; // the output buffer
+ unsigned char mask; // bit mask for byte 0 of out buf
+ unsigned char c; // character read from string
+
+ // Start with a clean tree.
+
+ InitTree();
+ direct = 0; // set direction needed by parent [Get|Send]Chars()
+
+ // code_buf[0] works as eight flags. A "1" represents that the
+ // unit is an unencoded letter (1 byte), and a "0" represents
+ // that the next unit is a <position,length> pair (2 bytes).
+ //
+ // code_buf[1..16] stores eight units of code. Since the best
+ // we can do is store eight <position,length> pairs, at most 16
+ // bytes are needed to store this.
+ //
+ // This is why the maximum size of the code buffer is 17 bytes.
+
+ code_buf[0] = 0;
+ code_buf_pos = 1;
+
+ // Mask iterates over the 8 bits in the code buffer. The first
+ // character ends up being stored in the low bit.
+ //
+ // bit 8 7 6 5 4 3 2 1
+ // | |
+ // | first sequence in code buffer
+ // |
+ // last sequence in code buffer
+
+ mask = 1;
+
+ s = 0;
+ r = (short int) N - (short int) F;
+
+ // Initialize the ring buffer with spaces...
+
+ // Note that the last F bytes of the ring buffer are not filled.
+ // This is because those F bytes will be filled in immediately
+ // with bytes from the input stream.
+
+ memset(m_ring_buffer, ' ', N - F);
+
+ // Read F bytes into the last F bytes of the ring buffer.
+ //
+ // This function loads the buffer with X characters and returns
+ // the actual amount loaded.
+
+ len = GetChars((char *) &(m_ring_buffer[r]), F);
+
+ // Make sure there is something to be compressed.
+
+ if (len == 0)
+ return;
+
+ // Insert the F strings, each of which begins with one or more
+ // 'space' characters. Note the order in which these strings
+ // are inserted. This way, degenerate trees will be less likely
+ // to occur.
+
+ for (i = 1; i <= F; i++) {
+ InsertNode((short int) (r - i));
+ }
+
+ // Finally, insert the whole string just read. The
+ // member variables match_length and match_position are set.
+
+ InsertNode(r);
+
+ // Now that we're preloaded, continue till done.
+
+ do {
+
+ // m_match_length may be spuriously long near the end of
+ // text.
+
+ if (m_match_length > len) {
+ m_match_length = len;
+ }
+
+ // Is it cheaper to store this as a single character? If so,
+ // make it so.
+
+ if (m_match_length < THRESHOLD) {
+ // Send one character. Remember that code_buf[0] is the
+ // set of flags for the next eight items.
+
+ m_match_length = 1;
+ code_buf[0] |= mask;
+ code_buf[code_buf_pos++] = m_ring_buffer[r];
+ }
+
+ // Otherwise, we do indeed have a string that can be stored
+ // compressed to save space.
+
+ else {
+ // The next 16 bits need to contain the position (12 bits)
+ // and the length (4 bits).
+
+ code_buf[code_buf_pos++] = (unsigned char) m_match_position;
+ code_buf[code_buf_pos++] = (unsigned char) (
+ ((m_match_position >> 4) & 0xf0) |
+ (m_match_length - THRESHOLD) );
+ }
+
+ // Shift the mask one bit to the left so that it will be ready
+ // to store the new bit.
+
+ mask = (unsigned char) (mask << 1);
+
+ // If the mask is now 0, then we know that we have a full set
+ // of flags and items in the code buffer. These need to be
+ // output.
+
+ if (!mask) {
+ // code_buf is the buffer of characters to be output.
+ // code_buf_pos is the number of characters it contains.
+
+ SendChars((char *) code_buf, code_buf_pos);
+
+ // Reset for next buffer...
+
+ code_buf[0] = 0;
+ code_buf_pos = 1;
+ mask = 1;
+ }
+
+ last_match_length = m_match_length;
+
+ // Delete old strings and read new bytes...
+
+ for (i = 0; i < last_match_length; i++) {
+ // Get next character...
+
+ if (GetChars((char *) &c, 1) != 1)
+ break;
+
+ // Delete "old strings"
+
+ DeleteNode(s);
+
+ // Put this character into the ring buffer.
+ //
+ // The original comment here says "If the position is near
+ // the end of the buffer, extend the buffer to make
+ // string comparison easier."
+ //
+ // That's a little misleading, because the "end" of the
+ // buffer is really what we consider to be the "beginning"
+ // of the buffer, that is, positions 0 through F.
+ //
+ // The idea is that the front end of the buffer is duplicated
+ // into the back end so that when you're looking at characters
+ // at the back end of the buffer, you can index ahead (beyond
+ // the normal end of the buffer) and see the characters
+ // that are at the front end of the buffer wihtout having
+ // to adjust the index.
+ //
+ // That is...
+ //
+ // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234
+ // | | |
+ // position 0 end of buffer |
+ // |
+ // duplicate of front of buffer
+
+ m_ring_buffer[s] = c;
+
+ if (s < F - 1) {
+ m_ring_buffer[s + N] = c;
+ }
+
+ // Increment the position, and wrap around when we're at
+ // the end. Note that this relies on N being a power of 2.
+
+ s = (short int) ( (s + 1) & (N - 1) );
+ r = (short int) ( (r + 1) & (N - 1) );
+
+ // Register the string that is found in
+ // m_ring_buffer[r..r+F-1].
+
+ InsertNode(r);
+ }
+
+ // If we didn't quit because we hit the last_match_length,
+ // then we must have quit because we ran out of characters
+ // to process.
+
+ while (i++ < last_match_length) {
+ DeleteNode(s);
+
+ s = (short int) ( (s + 1) & (N - 1) );
+ r = (short int) ( (r + 1) & (N - 1) );
+
+ // Note that len hitting 0 is the key that causes the
+ // do...while() to terminate. This is the only place
+ // within the loop that len is modified.
+ //
+ // Its original value is F (or a number less than F for
+ // short strings).
+
+ if (--len) {
+ InsertNode(r); /* buffer may not be empty. */
+ }
+ }
+
+ // End of do...while() loop. Continue processing until there
+ // are no more characters to be compressed. The variable
+ // "len" is used to signal this condition.
+ } while (len > 0);
+
+ // There could still be something in the output buffer. Send it
+ // now.
+
+ if (code_buf_pos > 1) {
+ // code_buf is the encoded string to send.
+ // code_buf_ptr is the number of characters.
+
+ SendChars((char *) code_buf, code_buf_pos);
+ }
+
+
+ // must set zlen for parent class to know length of compressed buffer
+ zlen = zpos;
+}
+
+
+/******************************************************************************
+ * LZSSCompress::Decode - This function "decodes" the input stream into the
+ * output stream.
+ * The GetChars() and SendChars() functions are
+ * used to separate this method from the actual
+ * i/o.
+ */
+
+void LZSSCompress::Decode(void)
+{
+ int k;
+ int r; // node number
+ unsigned char c[F]; // an array of chars
+ unsigned char flags; // 8 bits of flags
+ int flag_count; // which flag we're on
+ short int pos; // position in the ring buffer
+ short int len; // number of chars in ring buffer
+ unsigned long totalLen = 0;
+
+ direct = 1; // set direction needed by parent [Get|Send]Chars()
+
+ // Initialize the ring buffer with a common string.
+ //
+ // Note that the last F bytes of the ring buffer are not filled.
+
+ memset(m_ring_buffer, ' ', N - F);
+
+ r = N - F;
+
+ flags = (char) 0;
+ flag_count = 0;
+
+ for ( ; ; ) {
+
+ // If there are more bits of interest in this flag, then
+ // shift that next interesting bit into the 1's position.
+ //
+ // If this flag has been exhausted, the next byte must
+ // be a flag.
+
+ if (flag_count > 0) {
+ flags = (unsigned char) (flags >> 1);
+ flag_count--;
+ }
+ else {
+ // Next byte must be a flag.
+
+ if (GetChars((char *) &flags, 1) != 1)
+ break;
+
+ // Set the flag counter. While at first it might appear
+ // that this should be an 8 since there are 8 bits in the
+ // flag, it should really be a 7 because the shift must
+ // be performed 7 times in order to see all 8 bits.
+
+ flag_count = 7;
+ }
+
+ // If the low order bit of the flag is now set, then we know
+ // that the next byte is a single, unencoded character.
+
+ if (flags & 1) {
+ if (GetChars((char *) c, 1) != 1)
+ break;
+
+ if (SendChars((char *) c, 1) != 1) {
+ totalLen++;
+ break;
+ }
+
+ // Add to buffer, and increment to next spot. Wrap at end.
+
+ m_ring_buffer[r] = c[0];
+ r = (short int) ( (r + 1) & (N - 1) );
+ }
+
+ // Otherwise, we know that the next two bytes are a
+ // <position,length> pair. The position is in 12 bits and
+ // the length is in 4 bits.
+
+ else {
+ // Original code:
+ // if ((i = getc(infile)) == EOF)
+ // break;
+ // if ((j = getc(infile)) == EOF)
+ // break;
+ // i |= ((j & 0xf0) << 4);
+ // j = (j & 0x0f) + THRESHOLD;
+ //
+ // I've modified this to only make one input call, and
+ // have changed the variable names to something more
+ // obvious.
+
+ if (GetChars((char *) c, 2) != 2)
+ break;
+
+ // Convert these two characters into the position and
+ // length. Note that the length is always at least
+ // THRESHOLD, which is why we're able to get a length
+ // of 18 out of only 4 bits.
+
+ pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
+
+ len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
+
+ // There are now "len" characters at position "pos" in
+ // the ring buffer that can be pulled out. Note that
+ // len is never more than F.
+
+ for (k = 0; k < len; k++) {
+ c[k] = m_ring_buffer[(pos + k) & (N - 1)];
+
+ // Add to buffer, and increment to next spot. Wrap at end.
+
+ m_ring_buffer[r] = c[k];
+ r = (short int) ( (r + 1) & (N - 1) );
+ }
+
+ // Add the "len" :characters to the output stream.
+
+ if (SendChars((char *) c, len) != (unsigned int)len) {
+ totalLen += len;
+ break;
+ }
+ }
+ }
+ slen = totalLen;
+}
+
+SWORD_NAMESPACE_END