diff options
Diffstat (limited to 'src/modules/common/lzsscomprs.cpp')
-rw-r--r-- | src/modules/common/lzsscomprs.cpp | 668 |
1 files changed, 0 insertions, 668 deletions
diff --git a/src/modules/common/lzsscomprs.cpp b/src/modules/common/lzsscomprs.cpp deleted file mode 100644 index bd8f768..0000000 --- a/src/modules/common/lzsscomprs.cpp +++ /dev/null @@ -1,668 +0,0 @@ -/****************************************************************************** - * lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that - * provides LZSS compression - */ - -#include <stdlib.h> -#include <string.h> -#include <lzsscomprs.h> - -SWORD_NAMESPACE_START - -/****************************************************************************** - * LZSSCompress Statics - */ - -// m_ring_buffer is a text buffer. It contains "nodes" of -// uncompressed text that can be indexed by position. That is, -// a substring of the ring buffer can be indexed by a position -// and a length. When decoding, the compressed text may contain -// a position in the ring buffer and a count of the number of -// bytes from the ring buffer that are to be moved into the -// uncompressed buffer. -// -// This ring buffer is not maintained as part of the compressed -// text. Instead, it is reconstructed dynamically. That is, -// it starts out empty and gets built as the text is decompressed. -// -// The ring buffer contain N bytes, with an additional F - 1 bytes -// to facilitate string comparison. - -unsigned char LZSSCompress::m_ring_buffer[N + F - 1]; - -// m_match_position and m_match_length are set by InsertNode(). -// -// These variables indicate the position in the ring buffer -// and the number of characters at that position that match -// a given string. - -short int LZSSCompress::m_match_position; -short int LZSSCompress::m_match_length; - -// m_lson, m_rson, and m_dad are the Japanese way of referring to -// a tree structure. The dad is the parent and it has a right and -// left son (child). -// -// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right -// and left children of node i. -// -// For i = 0 to N-1, m_dad[i] is the parent of node i. -// -// For i = 0 to 255, rson[N + i + 1] is the root of the tree for -// strings that begin with the character i. Note that this requires -// one byte characters. -// -// These nodes store values of 0...(N-1). Memory requirements -// can be reduces by using 2-byte integers instead of full 4-byte -// integers (for 32-bit applications). Therefore, these are -// defined as "short ints." - -short int LZSSCompress::m_lson[N + 1]; -short int LZSSCompress::m_rson[N + 257]; -short int LZSSCompress::m_dad[N + 1]; - - -/****************************************************************************** - * LZSSCompress Constructor - Initializes data for instance of LZSSCompress - * - */ - -LZSSCompress::LZSSCompress() : SWCompress() { -} - - -/****************************************************************************** - * LZSSCompress Destructor - Cleans up instance of LZSSCompress - */ - -LZSSCompress::~LZSSCompress() { -} - - -/****************************************************************************** - * LZSSCompress::InitTree - This function initializes the tree nodes to - * "empty" states. - */ - -void LZSSCompress::InitTree(void) { - int i; - - // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right - // and left children of node i. These nodes need not be - // initialized. However, for debugging purposes, it is nice to - // have them initialized. Since this is only used for compression - // (not decompression), I don't mind spending the time to do it. - // - // For the same range of i, m_dad[i] is the parent of node i. - // These are initialized to a known value that can represent - // a "not used" state. - - for (i = 0; i < N; i++) { - m_lson[i] = NOT_USED; - m_rson[i] = NOT_USED; - m_dad[i] = NOT_USED; - } - - // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree - // for strings that begin with the character i. This is why - // the right child array is larger than the left child array. - // These are also initialzied to a "not used" state. - // - // Note that there are 256 of these, one for each of the possible - // 256 characters. - - for (i = N + 1; i <= (N + 256); i++) { - m_rson[i] = NOT_USED; - } -} - - -/****************************************************************************** - * LZSSCompress::InsertNode - This function inserts a string from the ring - * buffer into one of the trees. It loads the - * match position and length member variables - * for the longest match. - * - * The string to be inserted is identified by - * the parameter Pos, A full F bytes are - * inserted. So, - * m_ring_buffer[Pos ... Pos+F-1] - * are inserted. - * - * If the matched length is exactly F, then an - * old node is removed in favor of the new one - * (because the old one will be deleted - * sooner). - * - * Note that Pos plays a dual role. It is - * used as both a position in the ring buffer - * and also as a tree node. - * m_ring_buffer[Pos] defines a character that - * is used to identify a tree node. - * - * ENT: pos - position in the buffer - */ - -void LZSSCompress::InsertNode(short int Pos) -{ - short int i; - short int p; - int cmp; - unsigned char * key; - -/* - ASSERT(Pos >= 0); - ASSERT(Pos < N); -*/ - - cmp = 1; - key = &(m_ring_buffer[Pos]); - - // The last 256 entries in m_rson contain the root nodes for - // strings that begin with a letter. Get an index for the - // first letter in this string. - - p = (short int) (N + 1 + key[0]); - - // Set the left and right tree nodes for this position to "not - // used." - - m_lson[Pos] = NOT_USED; - m_rson[Pos] = NOT_USED; - - // Haven't matched anything yet. - - m_match_length = 0; - - for ( ; ; ) { - if (cmp >= 0) { - if (m_rson[p] != NOT_USED) { - p = m_rson[p]; - } - else { - m_rson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - else { - if (m_lson[p] != NOT_USED) { - p = m_lson[p]; - } - else { - m_lson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - - // Should we go to the right or the left to look for the - // next match? - - for (i = 1; i < F; i++) { - cmp = key[i] - m_ring_buffer[p + i]; - if (cmp != 0) - break; - } - - if (i > m_match_length) { - m_match_position = p; - m_match_length = i; - - if (i >= F) - break; - } - } - - m_dad[Pos] = m_dad[p]; - m_lson[Pos] = m_lson[p]; - m_rson[Pos] = m_rson[p]; - - m_dad[ m_lson[p] ] = Pos; - m_dad[ m_rson[p] ] = Pos; - - if (m_rson[ m_dad[p] ] == p) { - m_rson[ m_dad[p] ] = Pos; - } - else { - m_lson[ m_dad[p] ] = Pos; - } - - // Remove "p" - - m_dad[p] = NOT_USED; -} - - -/****************************************************************************** - * LZSSCompress::DeleteNode - This function removes the node "Node" from the - * tree. - * - * ENT: node - node to be removed - */ - -void LZSSCompress::DeleteNode(short int Node) -{ - short int q; - -/* - ASSERT(Node >= 0); - ASSERT(Node < (N+1)); -*/ - - if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do - return; - } - - if (m_rson[Node] == NOT_USED) { - q = m_lson[Node]; - } - else if (m_lson[Node] == NOT_USED) { - q = m_rson[Node]; - } - else { - q = m_lson[Node]; - if (m_rson[q] != NOT_USED) { - do { - q = m_rson[q]; - } while (m_rson[q] != NOT_USED); - - m_rson[ m_dad[q] ] = m_lson[q]; - m_dad[ m_lson[q] ] = m_dad[q]; - m_lson[q] = m_lson[Node]; - m_dad[ m_lson[Node] ] = q; - } - - m_rson[q] = m_rson[Node]; - m_dad[ m_rson[Node] ] = q; - } - - m_dad[q] = m_dad[Node]; - - if (m_rson[ m_dad[Node] ] == Node) { - m_rson[ m_dad[Node] ] = q; - } - else { - m_lson[ m_dad[Node] ] = q; - } - - m_dad[Node] = NOT_USED; -} - - -/****************************************************************************** - * LZSSCompress::Encode - This function "encodes" the input stream into the - * output stream. - * The GetChars() and SendChars() functions are - * used to separate this method from the actual - * i/o. - * NOTE: must set zlen for parent class to know length of - * compressed buffer. - */ - -void LZSSCompress::Encode(void) -{ - short int i; // an iterator - short int r; // node number in the binary tree - short int s; // position in the ring buffer - unsigned short int len; // len of initial string - short int last_match_length; // length of last match - short int code_buf_pos; // position in the output buffer - unsigned char code_buf[17]; // the output buffer - unsigned char mask; // bit mask for byte 0 of out buf - unsigned char c; // character read from string - - // Start with a clean tree. - - InitTree(); - direct = 0; // set direction needed by parent [Get|Send]Chars() - - // code_buf[0] works as eight flags. A "1" represents that the - // unit is an unencoded letter (1 byte), and a "0" represents - // that the next unit is a <position,length> pair (2 bytes). - // - // code_buf[1..16] stores eight units of code. Since the best - // we can do is store eight <position,length> pairs, at most 16 - // bytes are needed to store this. - // - // This is why the maximum size of the code buffer is 17 bytes. - - code_buf[0] = 0; - code_buf_pos = 1; - - // Mask iterates over the 8 bits in the code buffer. The first - // character ends up being stored in the low bit. - // - // bit 8 7 6 5 4 3 2 1 - // | | - // | first sequence in code buffer - // | - // last sequence in code buffer - - mask = 1; - - s = 0; - r = (short int) N - (short int) F; - - // Initialize the ring buffer with spaces... - - // Note that the last F bytes of the ring buffer are not filled. - // This is because those F bytes will be filled in immediately - // with bytes from the input stream. - - memset(m_ring_buffer, ' ', N - F); - - // Read F bytes into the last F bytes of the ring buffer. - // - // This function loads the buffer with X characters and returns - // the actual amount loaded. - - len = GetChars((char *) &(m_ring_buffer[r]), F); - - // Make sure there is something to be compressed. - - if (len == 0) - return; - - // Insert the F strings, each of which begins with one or more - // 'space' characters. Note the order in which these strings - // are inserted. This way, degenerate trees will be less likely - // to occur. - - for (i = 1; i <= F; i++) { - InsertNode((short int) (r - i)); - } - - // Finally, insert the whole string just read. The - // member variables match_length and match_position are set. - - InsertNode(r); - - // Now that we're preloaded, continue till done. - - do { - - // m_match_length may be spuriously long near the end of - // text. - - if (m_match_length > len) { - m_match_length = len; - } - - // Is it cheaper to store this as a single character? If so, - // make it so. - - if (m_match_length < THRESHOLD) { - // Send one character. Remember that code_buf[0] is the - // set of flags for the next eight items. - - m_match_length = 1; - code_buf[0] |= mask; - code_buf[code_buf_pos++] = m_ring_buffer[r]; - } - - // Otherwise, we do indeed have a string that can be stored - // compressed to save space. - - else { - // The next 16 bits need to contain the position (12 bits) - // and the length (4 bits). - - code_buf[code_buf_pos++] = (unsigned char) m_match_position; - code_buf[code_buf_pos++] = (unsigned char) ( - ((m_match_position >> 4) & 0xf0) | - (m_match_length - THRESHOLD) ); - } - - // Shift the mask one bit to the left so that it will be ready - // to store the new bit. - - mask = (unsigned char) (mask << 1); - - // If the mask is now 0, then we know that we have a full set - // of flags and items in the code buffer. These need to be - // output. - - if (!mask) { - // code_buf is the buffer of characters to be output. - // code_buf_pos is the number of characters it contains. - - SendChars((char *) code_buf, code_buf_pos); - - // Reset for next buffer... - - code_buf[0] = 0; - code_buf_pos = 1; - mask = 1; - } - - last_match_length = m_match_length; - - // Delete old strings and read new bytes... - - for (i = 0; i < last_match_length; i++) { - // Get next character... - - if (GetChars((char *) &c, 1) != 1) - break; - - // Delete "old strings" - - DeleteNode(s); - - // Put this character into the ring buffer. - // - // The original comment here says "If the position is near - // the end of the buffer, extend the buffer to make - // string comparison easier." - // - // That's a little misleading, because the "end" of the - // buffer is really what we consider to be the "beginning" - // of the buffer, that is, positions 0 through F. - // - // The idea is that the front end of the buffer is duplicated - // into the back end so that when you're looking at characters - // at the back end of the buffer, you can index ahead (beyond - // the normal end of the buffer) and see the characters - // that are at the front end of the buffer wihtout having - // to adjust the index. - // - // That is... - // - // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234 - // | | | - // position 0 end of buffer | - // | - // duplicate of front of buffer - - m_ring_buffer[s] = c; - - if (s < F - 1) { - m_ring_buffer[s + N] = c; - } - - // Increment the position, and wrap around when we're at - // the end. Note that this relies on N being a power of 2. - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Register the string that is found in - // m_ring_buffer[r..r+F-1]. - - InsertNode(r); - } - - // If we didn't quit because we hit the last_match_length, - // then we must have quit because we ran out of characters - // to process. - - while (i++ < last_match_length) { - DeleteNode(s); - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Note that len hitting 0 is the key that causes the - // do...while() to terminate. This is the only place - // within the loop that len is modified. - // - // Its original value is F (or a number less than F for - // short strings). - - if (--len) { - InsertNode(r); /* buffer may not be empty. */ - } - } - - // End of do...while() loop. Continue processing until there - // are no more characters to be compressed. The variable - // "len" is used to signal this condition. - } while (len > 0); - - // There could still be something in the output buffer. Send it - // now. - - if (code_buf_pos > 1) { - // code_buf is the encoded string to send. - // code_buf_ptr is the number of characters. - - SendChars((char *) code_buf, code_buf_pos); - } - - - // must set zlen for parent class to know length of compressed buffer - zlen = zpos; -} - - -/****************************************************************************** - * LZSSCompress::Decode - This function "decodes" the input stream into the - * output stream. - * The GetChars() and SendChars() functions are - * used to separate this method from the actual - * i/o. - */ - -void LZSSCompress::Decode(void) -{ - int k; - int r; // node number - unsigned char c[F]; // an array of chars - unsigned char flags; // 8 bits of flags - int flag_count; // which flag we're on - short int pos; // position in the ring buffer - short int len; // number of chars in ring buffer - unsigned long totalLen = 0; - - direct = 1; // set direction needed by parent [Get|Send]Chars() - - // Initialize the ring buffer with a common string. - // - // Note that the last F bytes of the ring buffer are not filled. - - memset(m_ring_buffer, ' ', N - F); - - r = N - F; - - flags = (char) 0; - flag_count = 0; - - for ( ; ; ) { - - // If there are more bits of interest in this flag, then - // shift that next interesting bit into the 1's position. - // - // If this flag has been exhausted, the next byte must - // be a flag. - - if (flag_count > 0) { - flags = (unsigned char) (flags >> 1); - flag_count--; - } - else { - // Next byte must be a flag. - - if (GetChars((char *) &flags, 1) != 1) - break; - - // Set the flag counter. While at first it might appear - // that this should be an 8 since there are 8 bits in the - // flag, it should really be a 7 because the shift must - // be performed 7 times in order to see all 8 bits. - - flag_count = 7; - } - - // If the low order bit of the flag is now set, then we know - // that the next byte is a single, unencoded character. - - if (flags & 1) { - if (GetChars((char *) c, 1) != 1) - break; - - if (SendChars((char *) c, 1) != 1) { - totalLen++; - break; - } - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[0]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Otherwise, we know that the next two bytes are a - // <position,length> pair. The position is in 12 bits and - // the length is in 4 bits. - - else { - // Original code: - // if ((i = getc(infile)) == EOF) - // break; - // if ((j = getc(infile)) == EOF) - // break; - // i |= ((j & 0xf0) << 4); - // j = (j & 0x0f) + THRESHOLD; - // - // I've modified this to only make one input call, and - // have changed the variable names to something more - // obvious. - - if (GetChars((char *) c, 2) != 2) - break; - - // Convert these two characters into the position and - // length. Note that the length is always at least - // THRESHOLD, which is why we're able to get a length - // of 18 out of only 4 bits. - - pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) ); - - len = (short int) ( (c[1] & 0x0f) + THRESHOLD ); - - // There are now "len" characters at position "pos" in - // the ring buffer that can be pulled out. Note that - // len is never more than F. - - for (k = 0; k < len; k++) { - c[k] = m_ring_buffer[(pos + k) & (N - 1)]; - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[k]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Add the "len" :characters to the output stream. - - if (SendChars((char *) c, len) != (unsigned int)len) { - totalLen += len; - break; - } - } - } - slen = totalLen; -} - -SWORD_NAMESPACE_END |