From 7a00574163029c0c2b649878c95d5acbd083564a Mon Sep 17 00:00:00 2001 From: "Roberto C. Sanchez" Date: Mon, 12 May 2014 08:21:30 -0400 Subject: Imported Upstream version 1.7.2+dfsg --- src/modules/common/Makefile.am | 4 +- src/modules/common/bz2comprs.cpp | 181 ++++++++ src/modules/common/compress.cpp.txt | 767 ---------------------------------- src/modules/common/entriesblk.cpp | 9 +- src/modules/common/lzsscomprs.cpp | 120 ++++-- src/modules/common/lzsscomprs.txt | 802 ++++++++++++++++++++++++++++++++++++ src/modules/common/rawstr.cpp | 22 +- src/modules/common/rawstr4.cpp | 24 +- src/modules/common/rawverse.cpp | 13 +- src/modules/common/rawverse4.cpp | 17 +- src/modules/common/sapphire.cpp | 48 ++- src/modules/common/swcipher.cpp | 8 +- src/modules/common/swcomprs.cpp | 7 +- src/modules/common/swcomprs.doc | 802 ------------------------------------ src/modules/common/xzcomprs.cpp | 181 ++++++++ src/modules/common/zipcomprs.cpp | 8 +- src/modules/common/zstr.cpp | 27 +- src/modules/common/zverse.cpp | 38 +- 18 files changed, 1386 insertions(+), 1692 deletions(-) create mode 100644 src/modules/common/bz2comprs.cpp delete mode 100644 src/modules/common/compress.cpp.txt create mode 100644 src/modules/common/lzsscomprs.txt delete mode 100644 src/modules/common/swcomprs.doc create mode 100644 src/modules/common/xzcomprs.cpp (limited to 'src/modules/common') diff --git a/src/modules/common/Makefile.am b/src/modules/common/Makefile.am index 34a14bf..90a3f98 100644 --- a/src/modules/common/Makefile.am +++ b/src/modules/common/Makefile.am @@ -6,7 +6,9 @@ libsword_la_SOURCES += $(commondir)/swcomprs.cpp libsword_la_SOURCES += $(commondir)/lzsscomprs.cpp if HAVE_LIBZ -SWZLIB = $(commondir)/zipcomprs.cpp +SWZLIB = $(commondir)/zipcomprs.cpp +SWZLIB += $(commondir)/bz2comprs.cpp +SWZLIB += $(commondir)/xzcomprs.cpp else SWZLIB = endif diff --git a/src/modules/common/bz2comprs.cpp b/src/modules/common/bz2comprs.cpp new file mode 100644 index 0000000..16f6d11 --- /dev/null +++ b/src/modules/common/bz2comprs.cpp @@ -0,0 +1,181 @@ +/****************************************************************************** + * + * bz2comprs.cpp - Bzip2Compress, a driver class that provides bzip2 + * compression (Burrows–Wheeler with Huffman coding) + * + * $Id: bz2comprs.cpp 2858 2013-07-08 03:08:10Z chrislit $ + * + * Copyright 2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + + +#include +#include +#include +#include +#include + +SWORD_NAMESPACE_START + +/****************************************************************************** + * Bzip2Compress Constructor - Initializes data for instance of Bzip2Compress + * + */ + +Bzip2Compress::Bzip2Compress() : SWCompress() { +} + + +/****************************************************************************** + * Bzip2Compress Destructor - Cleans up instance of Bzip2Compress + */ + +Bzip2Compress::~Bzip2Compress() { +} + + +/****************************************************************************** + * Bzip2Compress::Encode - This function "encodes" the input stream into the + * output stream. + * The GetChars() and SendChars() functions are + * used to separate this method from the actual + * i/o. + * NOTE: must set zlen for parent class to know length of + * compressed buffer. + */ + +void Bzip2Compress::Encode(void) +{ +/* +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least 0.1% larger than + sourceLen plus 12 bytes. Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + direct = 0; // set direction needed by parent [Get|Send]Chars() + + // get buffer + char chunk[1024]; + char *buf = (char *)calloc(1, 1024); + char *chunkbuf = buf; + unsigned long chunklen; + unsigned long len = 0; + while((chunklen = GetChars(chunk, 1023))) { + memcpy(chunkbuf, chunk, chunklen); + len += chunklen; + if (chunklen < 1023) + break; + else buf = (char *)realloc(buf, len + 1024); + chunkbuf = buf+len; + } + + + zlen = (long) (len*1.001)+15; + char *zbuf = new char[zlen+1]; + if (len) + { + //printf("Doing compress\n"); + if (compress((Bytef*)zbuf, &zlen, (const Bytef*)buf, len) != Z_OK) + { + printf("ERROR in compression\n"); + } + else { + SendChars(zbuf, zlen); + } + } + else + { + fprintf(stderr, "ERROR: no buffer to compress\n"); + } + delete [] zbuf; + free (buf); +} + + +/****************************************************************************** + * Bzip2Compress::Decode - This function "decodes" the input stream into the + * output stream. + * The GetChars() and SendChars() functions are + * used to separate this method from the actual + * i/o. + */ + +void Bzip2Compress::Decode(void) +{ +/* +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. (The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the compressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted. +*/ + + // get buffer + char chunk[1024]; + char *zbuf = (char *)calloc(1, 1024); + char *chunkbuf = zbuf; + int chunklen; + unsigned long zlen = 0; + while((chunklen = GetChars(chunk, 1023))) { + memcpy(chunkbuf, chunk, chunklen); + zlen += chunklen; + if (chunklen < 1023) + break; + else zbuf = (char *)realloc(zbuf, zlen + 1024); + chunkbuf = zbuf + zlen; + } + + //printf("Decoding complength{%ld} uncomp{%ld}\n", zlen, blen); + if (zlen) { + unsigned long blen = zlen*20; // trust compression is less than 1000% + char *buf = new char[blen]; + //printf("Doing decompress {%s}\n", zbuf); + slen = 0; + switch (uncompress((Bytef*)buf, &blen, (Bytef*)zbuf, zlen)){ + case Z_OK: SendChars(buf, blen); slen = blen; break; + case Z_MEM_ERROR: fprintf(stderr, "ERROR: not enough memory during decompression.\n"); break; + case Z_BUF_ERROR: fprintf(stderr, "ERROR: not enough room in the out buffer during decompression.\n"); break; + case Z_DATA_ERROR: fprintf(stderr, "ERROR: corrupt data during decompression.\n"); break; + default: fprintf(stderr, "ERROR: an unknown error occured during decompression.\n"); break; + } + delete [] buf; + } + else { + fprintf(stderr, "ERROR: no buffer to decompress!\n"); + } + //printf("Finished decoding\n"); + free (zbuf); +} + +SWORD_NAMESPACE_END diff --git a/src/modules/common/compress.cpp.txt b/src/modules/common/compress.cpp.txt deleted file mode 100644 index 5031adb..0000000 --- a/src/modules/common/compress.cpp.txt +++ /dev/null @@ -1,767 +0,0 @@ -Compression Info, 10-11-95 -Jeff Wheeler - -Source of Algorithm -------------------- - -The compression algorithms used here are based upon the algorithms developed and published by Haruhiko Okumura in a paper entitled "Data Compression Algorithms of LARC and LHarc." This paper discusses three compression algorithms, LSZZ, LZARI, and LZHUF. LZSS is described as the "first" of these, and is described as providing moderate compression with good speed. LZARI is described as an improved LZSS, a combination of the LZSS algorithm with adaptive arithmetic compression. It is described as being slower than LZSS but with better compression. LZHUF (the basis of the common LHA compression program) was included in the paper, however, a free usage license was not included. - -The following are copies of the statements included at the beginning of each source code listing that was supplied in the working paper. - - LZSS, dated 4/6/89, marked as "Use, distribute and - modify this program freely." - - LZARI, dated 4/7/89, marked as "Use, distribute and - modify this program freely." - - LZHUF, dated 11/20/88, written by Haruyasu Yoshizaki, - translated by Haruhiko Okumura on 4/7/89. Not - expressly marked as redistributable or modifiable. - -Since both LZSS and LZARI are marked as "use, distribute and modify freely" we have felt at liberty basing our compression algorithm on either of these. - -Selection of Algorithm ----------------------- - -Working samples of three possible compression algorithms are supplied in Okumura's paper. Which should be used? - -LZSS is the fastest at decompression, but does not generated as small a compressed file as the other methods. The other two methods provided, perhaps, a 15% improvement in compression. Or, put another way, on a 100K file, LZSS might compress it to 50K while the others might approach 40-45K. For STEP purposes, it was decided that decoding speed was of more importance than tighter compression. For these reasons, the first compression algorithm implemented is the LZSS algorithm. - -About LZSS Encoding -------------------- - -(adapted from Haruhiko Okumura's paper) - -This scheme was proposed by Ziv and Lempel [1]. A slightly modified version is described by Storer and Szymanski [2]. An implementation using a binary tree has been proposed by Bell [3]. - -The algorithm is quite simple. -1. Keep a ring buffer which initially contains all space characters. -2. Read several letters from the file to the buffer. -3. Search the buffer for the longest string that matches the letters just read, and send its length and position into the buffer. - -If the ring buffer is 4096 bytes, the position can be stored in 12 bits. If the length is represented in 4 bits, the pair is two bytes long. If the longest match is no more than two characters, then just one character is sent without encoding. The process starts again with the next character. An extra bit is sent each time to tell the decoder whether the next item is a character of a pair. - -[1] J. Ziv and A. Lempel, IEEE Transactions IT-23, 337-343 (1977). -[2] J. A. Storer and T. G. Szymanski, J. ACM, 29, 928-951 (1982). -[3] T.C. Gell, IEEE Transactions COM-34, 1176-1182 (1986). - -void InitTree( // no return value - void); // no parameters - -void InsertNode( // no return value - short int Pos); // position in the buffer - -void DeleteNode( // no return value - short int Node); // node to be removed - -void Encode( // no return value - void); // no parameters - -void Decode( // no return value - void); // no parameters - -// The following are constant sizes used by the compression algorithm. -// -// N - This is the size of the ring buffer. It is set -// to 4K. It is important to note that a position -// within the ring buffer requires 12 bits. -// -// F - This is the maximum length of a character sequence -// that can be taken from the ring buffer. It is set -// to 18. Note that a length must be 3 before it is -// worthwhile to store a position/length pair, so the -// length can be encoded in only 4 bits. Or, put yet -// another way, it is not necessary to encode a length -// of 0-18, it is necessary to encode a length of -// 3-18, which requires 4 bits. -// -// THRESHOLD - It takes 2 bytes to store an offset and -// a length. If a character sequence only -// requires 1 or 2 characters to store -// uncompressed, then it is better to store -// it uncompressed than as an offset into -// the ring buffer. -// -// Note that the 12 bits used to store the position and the 4 bits -// used to store the length equal a total of 16 bits, or 2 bytes. - -#define N 4096 -#define F 18 -#define THRESHOLD 3 -#define NOT_USED N - -// m_ring_buffer is a text buffer. It contains "nodes" of -// uncompressed text that can be indexed by position. That is, -// a substring of the ring buffer can be indexed by a position -// and a length. When decoding, the compressed text may contain -// a position in the ring buffer and a count of the number of -// bytes from the ring buffer that are to be moved into the -// uncompressed buffer. -// -// This ring buffer is not maintained as part of the compressed -// text. Instead, it is reconstructed dynamically. That is, -// it starts out empty and gets built as the text is decompressed. -// -// The ring buffer contain N bytes, with an additional F - 1 bytes -// to facilitate string comparison. - -unsigned char m_ring_buffer[N + F - 1]; - -// m_match_position and m_match_length are set by InsertNode(). -// -// These variables indicate the position in the ring buffer -// and the number of characters at that position that match -// a given string. - -short int m_match_position; -short int m_match_length; - -// m_lson, m_rson, and m_dad are the Japanese way of referring to -// a tree structure. The dad is the parent and it has a right and -// left son (child). -// -// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right -// and left children of node i. -// -// For i = 0 to N-1, m_dad[i] is the parent of node i. -// -// For i = 0 to 255, rson[N + i + 1] is the root of the tree for -// strings that begin with the character i. Note that this requires -// one byte characters. -// -// These nodes store values of 0...(N-1). Memory requirements -// can be reduces by using 2-byte integers instead of full 4-byte -// integers (for 32-bit applications). Therefore, these are -// defined as "short ints." - -short int m_lson[N + 1]; -short int m_rson[N + 257]; -short int m_dad[N + 1]; - -/* - ------------------------------------------------------------------------- - cLZSS::InitTree - - This function initializes the tree nodes to "empty" states. - ------------------------------------------------------------------------- -*/ - -void cLZSS::InitTree( // no return value - void) // no parameters - throw() // exception list - - { - int i; - - // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right - // and left children of node i. These nodes need not be - // initialized. However, for debugging purposes, it is nice to - // have them initialized. Since this is only used for compression - // (not decompression), I don't mind spending the time to do it. - // - // For the same range of i, m_dad[i] is the parent of node i. - // These are initialized to a known value that can represent - // a "not used" state. - - for (i = 0; i < N; i++) - { - m_lson[i] = NOT_USED; - m_rson[i] = NOT_USED; - m_dad[i] = NOT_USED; - } - - // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree - // for strings that begin with the character i. This is why - // the right child array is larger than the left child array. - // These are also initialzied to a "not used" state. - // - // Note that there are 256 of these, one for each of the possible - // 256 characters. - - for (i = N + 1; i <= (N + 256); i++) - { - m_rson[i] = NOT_USED; - } - - // Done. - } - -/* - ------------------------------------------------------------------------- - cLZSS::InsertNode - - This function inserts a string from the ring buffer into one of - the trees. It loads the match position and length member variables - for the longest match. - - The string to be inserted is identified by the parameter Pos, - A full F bytes are inserted. So, m_ring_buffer[Pos ... Pos+F-1] - are inserted. - - If the matched length is exactly F, then an old node is removed - in favor of the new one (because the old one will be deleted - sooner). - - Note that Pos plays a dual role. It is used as both a position - in the ring buffer and also as a tree node. m_ring_buffer[Pos] - defines a character that is used to identify a tree node. - ------------------------------------------------------------------------- -*/ - -void cLZSS::InsertNode( // no return value - short int Pos) // position in the buffer - throw() // exception list - - { - short int i; - short int p; - int cmp; - unsigned char * key; - - ASSERT(Pos >= 0); - ASSERT(Pos < N); - - cmp = 1; - key = &(m_ring_buffer[Pos]); - - // The last 256 entries in m_rson contain the root nodes for - // strings that begin with a letter. Get an index for the - // first letter in this string. - - p = (short int) (N + 1 + key[0]); - - // Set the left and right tree nodes for this position to "not - // used." - - m_lson[Pos] = NOT_USED; - m_rson[Pos] = NOT_USED; - - // Haven't matched anything yet. - - m_match_length = 0; - - for ( ; ; ) - { - if (cmp >= 0) - { - if (m_rson[p] != NOT_USED) - { - p = m_rson[p]; - } - else - { - m_rson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - else - { - if (m_lson[p] != NOT_USED) - { - p = m_lson[p]; - } - else - { - m_lson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - - // Should we go to the right or the left to look for the - // next match? - - for (i = 1; i < F; i++) - { - cmp = key[i] - m_ring_buffer[p + i]; - if (cmp != 0) - break; - } - - if (i > m_match_length) - { - m_match_position = p; - m_match_length = i; - - if (i >= F) - break; - } - } - - m_dad[Pos] = m_dad[p]; - m_lson[Pos] = m_lson[p]; - m_rson[Pos] = m_rson[p]; - - m_dad[ m_lson[p] ] = Pos; - m_dad[ m_rson[p] ] = Pos; - - if (m_rson[ m_dad[p] ] == p) - { - m_rson[ m_dad[p] ] = Pos; - } - else - { - m_lson[ m_dad[p] ] = Pos; - } - - // Remove "p" - - m_dad[p] = NOT_USED; - } - -/* - ------------------------------------------------------------------------- - cLZSS::DeleteNode - - This function removes the node "Node" from the tree. - ------------------------------------------------------------------------- -*/ - -void cLZSS::DeleteNode( // no return value - short int Node) // node to be removed - throw() // exception list - - { - short int q; - - ASSERT(Node >= 0); - ASSERT(Node < (N+1)); - - if (m_dad[Node] == NOT_USED) - { - // not in tree, nothing to do - return; - } - - if (m_rson[Node] == NOT_USED) - { - q = m_lson[Node]; - } - else if (m_lson[Node] == NOT_USED) - { - q = m_rson[Node]; - } - else - { - q = m_lson[Node]; - if (m_rson[q] != NOT_USED) - { - do - { - q = m_rson[q]; - } - while (m_rson[q] != NOT_USED); - - m_rson[ m_dad[q] ] = m_lson[q]; - m_dad[ m_lson[q] ] = m_dad[q]; - m_lson[q] = m_lson[Node]; - m_dad[ m_lson[Node] ] = q; - } - - m_rson[q] = m_rson[Node]; - m_dad[ m_rson[Node] ] = q; - } - - m_dad[q] = m_dad[Node]; - - if (m_rson[ m_dad[Node] ] == Node) - { - m_rson[ m_dad[Node] ] = q; - } - else - { - m_lson[ m_dad[Node] ] = q; - } - - m_dad[Node] = NOT_USED; - } - -/* - ------------------------------------------------------------------------- - cLZSS::Encode - - This function "encodes" the input stream into the output stream. - The GetChars() and SendChars() functions are used to separate - this method from the actual i/o. - ------------------------------------------------------------------------- -*/ - -void cLZSS::Encode( // no return value - void) // no parameters - - { - short int i; // an iterator - short int r; // node number in the binary tree - short int s; // position in the ring buffer - unsigned short int len; // len of initial string - short int last_match_length; // length of last match - short int code_buf_pos; // position in the output buffer - unsigned char code_buf[17]; // the output buffer - unsigned char mask; // bit mask for byte 0 of out buf - unsigned char c; // character read from string - - // Start with a clean tree. - - InitTree(); - - // code_buf[0] works as eight flags. A "1" represents that the - // unit is an unencoded letter (1 byte), and a "0" represents - // that the next unit is a pair (2 bytes). - // - // code_buf[1..16] stores eight units of code. Since the best - // we can do is store eight pairs, at most 16 - // bytes are needed to store this. - // - // This is why the maximum size of the code buffer is 17 bytes. - - code_buf[0] = 0; - code_buf_pos = 1; - - // Mask iterates over the 8 bits in the code buffer. The first - // character ends up being stored in the low bit. - // - // bit 8 7 6 5 4 3 2 1 - // | | - // | first sequence in code buffer - // | - // last sequence in code buffer - - mask = 1; - - s = 0; - r = (short int) N - (short int) F; - - // Initialize the ring buffer with spaces... - - // Note that the last F bytes of the ring buffer are not filled. - // This is because those F bytes will be filled in immediately - // with bytes from the input stream. - - memset(m_ring_buffer, ' ', N - F); - - // Read F bytes into the last F bytes of the ring buffer. - // - // This function loads the buffer with X characters and returns - // the actual amount loaded. - - len = GetChars(&(m_ring_buffer[r]), F); - - // Make sure there is something to be compressed. - - if (len == 0) - return; - - // Insert the F strings, each of which begins with one or more - // 'space' characters. Note the order in which these strings - // are inserted. This way, degenerate trees will be less likely - // to occur. - - for (i = 1; i <= F; i++) - { - InsertNode((short int) (r - i)); - } - - // Finally, insert the whole string just read. The - // member variables match_length and match_position are set. - - InsertNode(r); - - // Now that we're preloaded, continue till done. - - do - { - - // m_match_length may be spuriously long near the end of - // text. - - if (m_match_length > len) - { - m_match_length = len; - } - - // Is it cheaper to store this as a single character? If so, - // make it so. - - if (m_match_length < THRESHOLD) - { - // Send one character. Remember that code_buf[0] is the - // set of flags for the next eight items. - - m_match_length = 1; - code_buf[0] |= mask; - code_buf[code_buf_pos++] = m_ring_buffer[r]; - } - - // Otherwise, we do indeed have a string that can be stored - // compressed to save space. - - else - { - // The next 16 bits need to contain the position (12 bits) - // and the length (4 bits). - - code_buf[code_buf_pos++] = (unsigned char) m_match_position; - code_buf[code_buf_pos++] = (unsigned char) ( - ((m_match_position >> 4) & 0xf0) | - (m_match_length - THRESHOLD) ); - } - - // Shift the mask one bit to the left so that it will be ready - // to store the new bit. - - mask = (unsigned char) (mask << 1); - - // If the mask is now 0, then we know that we have a full set - // of flags and items in the code buffer. These need to be - // output. - - if (mask == 0) - { - // code_buf is the buffer of characters to be output. - // code_buf_pos is the number of characters it contains. - - SendChars(code_buf, code_buf_pos); - - // Reset for next buffer... - - code_buf[0] = 0; - code_buf_pos = 1; - mask = 1; - } - - last_match_length = m_match_length; - - // Delete old strings and read new bytes... - - for (i = 0; i < last_match_length; i++) - { - - // Get next character... - - if (GetChars(&c, 1) != 1) - break; - - // Delete "old strings" - - DeleteNode(s); - - // Put this character into the ring buffer. - // - // The original comment here says "If the position is near - // the end of the buffer, extend the buffer to make - // string comparison easier." - // - // That's a little misleading, because the "end" of the - // buffer is really what we consider to be the "beginning" - // of the buffer, that is, positions 0 through F. - // - // The idea is that the front end of the buffer is duplicated - // into the back end so that when you're looking at characters - // at the back end of the buffer, you can index ahead (beyond - // the normal end of the buffer) and see the characters - // that are at the front end of the buffer wihtout having - // to adjust the index. - // - // That is... - // - // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234 - // | | | - // position 0 end of buffer | - // | - // duplicate of front of buffer - - m_ring_buffer[s] = c; - - if (s < F - 1) - { - m_ring_buffer[s + N] = c; - } - - // Increment the position, and wrap around when we're at - // the end. Note that this relies on N being a power of 2. - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Register the string that is found in - // m_ring_buffer[r..r+F-1]. - - InsertNode(r); - } - - // If we didn't quit because we hit the last_match_length, - // then we must have quit because we ran out of characters - // to process. - - while (i++ < last_match_length) - { - DeleteNode(s); - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Note that len hitting 0 is the key that causes the - // do...while() to terminate. This is the only place - // within the loop that len is modified. - // - // Its original value is F (or a number less than F for - // short strings). - - if (--len) - { - InsertNode(r); /* buffer may not be empty. */ - } - } - - // End of do...while() loop. Continue processing until there - // are no more characters to be compressed. The variable - // "len" is used to signal this condition. - } - while (len > 0); - - // There could still be something in the output buffer. Send it - // now. - - if (code_buf_pos > 1) - { - // code_buf is the encoded string to send. - // code_buf_ptr is the number of characters. - - SendChars(code_buf, code_buf_pos); - } - - // Done! - } - -/* - ------------------------------------------------------------------------- - cLZSS::Decode - - This function "decodes" the input stream into the output stream. - The GetChars() and SendChars() functions are used to separate - this method from the actual i/o. - ------------------------------------------------------------------------- -*/ - -void cLZSS::Decode( // no return value - void) // no parameters - - { - int k; - int r; // node number - unsigned char c[F]; // an array of chars - unsigned char flags; // 8 bits of flags - int flag_count; // which flag we're on - short int pos; // position in the ring buffer - short int len; // number of chars in ring buffer - - // Initialize the ring buffer with a common string. - // - // Note that the last F bytes of the ring buffer are not filled. - - memset(m_ring_buffer, ' ', N - F); - - r = N - F; - - flags = (char) 0; - flag_count = 0; - - for ( ; ; ) - { - - // If there are more bits of interest in this flag, then - // shift that next interesting bit into the 1's position. - // - // If this flag has been exhausted, the next byte must - // be a flag. - - if (flag_count > 0) - { - flags = (unsigned char) (flags >> 1); - flag_count--; - } - else - { - // Next byte must be a flag. - - if (GetChars(&flags, 1) != 1) - break; - - // Set the flag counter. While at first it might appear - // that this should be an 8 since there are 8 bits in the - // flag, it should really be a 7 because the shift must - // be performed 7 times in order to see all 8 bits. - - flag_count = 7; - } - - // If the low order bit of the flag is now set, then we know - // that the next byte is a single, unencoded character. - - if (flags & 1) - { - if (GetChars(c, 1) != 1) - break; - - if (SendChars(c, 1) != 1) - break; - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[0]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Otherwise, we know that the next two bytes are a - // pair. The position is in 12 bits and - // the length is in 4 bits. - - else - { - // Original code: - // if ((i = getc(infile)) == EOF) - // break; - // if ((j = getc(infile)) == EOF) - // break; - // i |= ((j & 0xf0) << 4); - // j = (j & 0x0f) + THRESHOLD; - // - // I've modified this to only make one input call, and - // have changed the variable names to something more - // obvious. - - if (GetChars(c, 2) != 2) - break; - - // Convert these two characters into the position and - // length. Note that the length is always at least - // THRESHOLD, which is why we're able to get a length - // of 18 out of only 4 bits. - - pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) ); - - len = (short int) ( (c[1] & 0x0f) + THRESHOLD ); - - // There are now "len" characters at position "pos" in - // the ring buffer that can be pulled out. Note that - // len is never more than F. - - for (k = 0; k < len; k++) - { - c[k] = m_ring_buffer[(pos + k) & (N - 1)]; - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[k]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Add the "len" characters to the output stream. - - if (SendChars(c, len) != len) - break; - } - } - } - diff --git a/src/modules/common/entriesblk.cpp b/src/modules/common/entriesblk.cpp index 6e4c9aa..4872d28 100644 --- a/src/modules/common/entriesblk.cpp +++ b/src/modules/common/entriesblk.cpp @@ -1,5 +1,10 @@ -/* - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) +/****************************************************************************** + * + * entriesblk.cpp - EntriesBlock facilitates compressed lex/dict modules + * + * $Id: entriesblk.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 diff --git a/src/modules/common/lzsscomprs.cpp b/src/modules/common/lzsscomprs.cpp index 624942d..ef1bc8c 100644 --- a/src/modules/common/lzsscomprs.cpp +++ b/src/modules/common/lzsscomprs.cpp @@ -1,9 +1,11 @@ /****************************************************************************** - * lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that - * provides LZSS compression * + * lzssomprs.cpp - LZSSCompress: a driver class that provides LZSS + * compression + * + * $Id: lzsscomprs.cpp 2935 2013-08-02 11:06:30Z scribe $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 1996-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -23,8 +25,52 @@ #include #include +// The following are constant sizes used by the compression algorithm. +// +// N - This is the size of the ring buffer. It is set +// to 4K. It is important to note that a position +// within the ring buffer requires 12 bits. +// +// F - This is the maximum length of a character sequence +// that can be taken from the ring buffer. It is set +// to 18. Note that a length must be 3 before it is +// worthwhile to store a position/length pair, so the +// length can be encoded in only 4 bits. Or, put yet +// another way, it is not necessary to encode a length +// of 0-18, it is necessary to encode a length of +// 3-18, which requires 4 bits. +// +// THRESHOLD - It takes 2 bytes to store an offset and +// a length. If a character sequence only +// requires 1 or 2 characters to store +// uncompressed, then it is better to store +// it uncompressed than as an offset into +// the ring buffer. +// +// Note that the 12 bits used to store the position and the 4 bits +// used to store the length equal a total of 16 bits, or 2 bytes. + +#define N 4096 +#define F 18 +#define THRESHOLD 3 +#define NOT_USED N + + SWORD_NAMESPACE_START +class LZSSCompress::Private { +public: + static unsigned char m_ring_buffer[N + F - 1]; + static short int m_match_position; + static short int m_match_length; + static short int m_lson[N + 1]; + static short int m_rson[N + 257]; + static short int m_dad[N + 1]; + void InitTree(); + void InsertNode(short int Pos); + void DeleteNode(short int Node); +}; + /****************************************************************************** * LZSSCompress Statics */ @@ -44,7 +90,7 @@ SWORD_NAMESPACE_START // The ring buffer contain N bytes, with an additional F - 1 bytes // to facilitate string comparison. -unsigned char LZSSCompress::m_ring_buffer[N + F - 1]; +unsigned char LZSSCompress::Private::m_ring_buffer[N + F - 1]; // m_match_position and m_match_length are set by InsertNode(). // @@ -52,8 +98,8 @@ unsigned char LZSSCompress::m_ring_buffer[N + F - 1]; // and the number of characters at that position that match // a given string. -short int LZSSCompress::m_match_position; -short int LZSSCompress::m_match_length; +short int LZSSCompress::Private::m_match_position; +short int LZSSCompress::Private::m_match_length; // m_lson, m_rson, and m_dad are the Japanese way of referring to // a tree structure. The dad is the parent and it has a right and @@ -73,9 +119,9 @@ short int LZSSCompress::m_match_length; // integers (for 32-bit applications). Therefore, these are // defined as "short ints." -short int LZSSCompress::m_lson[N + 1]; -short int LZSSCompress::m_rson[N + 257]; -short int LZSSCompress::m_dad[N + 1]; +short int LZSSCompress::Private::m_lson[N + 1]; +short int LZSSCompress::Private::m_rson[N + 257]; +short int LZSSCompress::Private::m_dad[N + 1]; /****************************************************************************** @@ -84,6 +130,7 @@ short int LZSSCompress::m_dad[N + 1]; */ LZSSCompress::LZSSCompress() : SWCompress() { + p = new Private(); } @@ -92,6 +139,7 @@ LZSSCompress::LZSSCompress() : SWCompress() { */ LZSSCompress::~LZSSCompress() { + delete p; } @@ -100,7 +148,7 @@ LZSSCompress::~LZSSCompress() { * "empty" states. */ -void LZSSCompress::InitTree(void) { +void LZSSCompress::Private::InitTree(void) { int i; // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right @@ -159,7 +207,7 @@ void LZSSCompress::InitTree(void) { * ENT: pos - position in the buffer */ -void LZSSCompress::InsertNode(short int Pos) +void LZSSCompress::Private::InsertNode(short int Pos) { short int i; short int p; @@ -257,7 +305,7 @@ void LZSSCompress::InsertNode(short int Pos) * ENT: node - node to be removed */ -void LZSSCompress::DeleteNode(short int Node) +void LZSSCompress::Private::DeleteNode(short int Node) { short int q; @@ -330,7 +378,7 @@ void LZSSCompress::Encode(void) // Start with a clean tree. - InitTree(); + p->InitTree(); direct = 0; // set direction needed by parent [Get|Send]Chars() // code_buf[0] works as eight flags. A "1" represents that the @@ -366,14 +414,14 @@ void LZSSCompress::Encode(void) // This is because those F bytes will be filled in immediately // with bytes from the input stream. - memset(m_ring_buffer, ' ', N - F); + memset(p->m_ring_buffer, ' ', N - F); // Read F bytes into the last F bytes of the ring buffer. // // This function loads the buffer with X characters and returns // the actual amount loaded. - len = GetChars((char *) &(m_ring_buffer[r]), F); + len = GetChars((char *) &(p->m_ring_buffer[r]), F); // Make sure there is something to be compressed. @@ -386,13 +434,13 @@ void LZSSCompress::Encode(void) // to occur. for (i = 1; i <= F; i++) { - InsertNode((short int) (r - i)); + p->InsertNode((short int) (r - i)); } // Finally, insert the whole string just read. The // member variables match_length and match_position are set. - InsertNode(r); + p->InsertNode(r); // Now that we're preloaded, continue till done. @@ -401,20 +449,20 @@ void LZSSCompress::Encode(void) // m_match_length may be spuriously long near the end of // text. - if (m_match_length > len) { - m_match_length = len; + if (p->m_match_length > len) { + p->m_match_length = len; } // Is it cheaper to store this as a single character? If so, // make it so. - if (m_match_length < THRESHOLD) { + if (p->m_match_length < THRESHOLD) { // Send one character. Remember that code_buf[0] is the // set of flags for the next eight items. - m_match_length = 1; + p->m_match_length = 1; code_buf[0] |= mask; - code_buf[code_buf_pos++] = m_ring_buffer[r]; + code_buf[code_buf_pos++] = p->m_ring_buffer[r]; } // Otherwise, we do indeed have a string that can be stored @@ -424,10 +472,10 @@ void LZSSCompress::Encode(void) // The next 16 bits need to contain the position (12 bits) // and the length (4 bits). - code_buf[code_buf_pos++] = (unsigned char) m_match_position; + code_buf[code_buf_pos++] = (unsigned char) p->m_match_position; code_buf[code_buf_pos++] = (unsigned char) ( - ((m_match_position >> 4) & 0xf0) | - (m_match_length - THRESHOLD) ); + ((p->m_match_position >> 4) & 0xf0) | + (p->m_match_length - THRESHOLD) ); } // Shift the mask one bit to the left so that it will be ready @@ -452,7 +500,7 @@ void LZSSCompress::Encode(void) mask = 1; } - last_match_length = m_match_length; + last_match_length = p->m_match_length; // Delete old strings and read new bytes... @@ -464,7 +512,7 @@ void LZSSCompress::Encode(void) // Delete "old strings" - DeleteNode(s); + p->DeleteNode(s); // Put this character into the ring buffer. // @@ -491,10 +539,10 @@ void LZSSCompress::Encode(void) // | // duplicate of front of buffer - m_ring_buffer[s] = c; + p->m_ring_buffer[s] = c; if (s < F - 1) { - m_ring_buffer[s + N] = c; + p->m_ring_buffer[s + N] = c; } // Increment the position, and wrap around when we're at @@ -506,7 +554,7 @@ void LZSSCompress::Encode(void) // Register the string that is found in // m_ring_buffer[r..r+F-1]. - InsertNode(r); + p->InsertNode(r); } // If we didn't quit because we hit the last_match_length, @@ -514,7 +562,7 @@ void LZSSCompress::Encode(void) // to process. while (i++ < last_match_length) { - DeleteNode(s); + p->DeleteNode(s); s = (short int) ( (s + 1) & (N - 1) ); r = (short int) ( (r + 1) & (N - 1) ); @@ -527,7 +575,7 @@ void LZSSCompress::Encode(void) // short strings). if (--len) { - InsertNode(r); /* buffer may not be empty. */ + p->InsertNode(r); /* buffer may not be empty. */ } } @@ -577,7 +625,7 @@ void LZSSCompress::Decode(void) // // Note that the last F bytes of the ring buffer are not filled. - memset(m_ring_buffer, ' ', N - F); + memset(p->m_ring_buffer, ' ', N - F); r = N - F; @@ -624,7 +672,7 @@ void LZSSCompress::Decode(void) // Add to buffer, and increment to next spot. Wrap at end. - m_ring_buffer[r] = c[0]; + p->m_ring_buffer[r] = c[0]; r = (short int) ( (r + 1) & (N - 1) ); } @@ -662,11 +710,11 @@ void LZSSCompress::Decode(void) // len is never more than F. for (k = 0; k < len; k++) { - c[k] = m_ring_buffer[(pos + k) & (N - 1)]; + c[k] = p->m_ring_buffer[(pos + k) & (N - 1)]; // Add to buffer, and increment to next spot. Wrap at end. - m_ring_buffer[r] = c[k]; + p->m_ring_buffer[r] = c[k]; r = (short int) ( (r + 1) & (N - 1) ); } diff --git a/src/modules/common/lzsscomprs.txt b/src/modules/common/lzsscomprs.txt new file mode 100644 index 0000000..b6817f2 --- /dev/null +++ b/src/modules/common/lzsscomprs.txt @@ -0,0 +1,802 @@ +The following is the original information send from Parson's Technologies via +Craig Rairden. +_______________________________________________________________________________ +Compression Info, 10-11-95 +Jeff Wheeler + +Source of Algorithm +------------------- + +The compression algorithms used here are based upon the algorithms developed +and published by Haruhiko Okumura in a paper entitled "Data Compression +Algorithms of LARC and LHarc." This paper discusses three compression +algorithms, LSZZ, LZARI, and LZHUF. LZSS is described as the "first" of +these, and is described as providing moderate compression with good speed. +LZARI is described as an improved LZSS, a combination of the LZSS algorithm +with adaptive arithmetic compression. It is described as being slower than +LZSS but with better compression. LZHUF (the basis of the common LHA +compression program) was included in the paper, however, a free usage license +was not included. + +The following are copies of the statements included at the beginning of each +source code listing that was supplied in the working paper. + + LZSS, dated 4/6/89, marked as "Use, distribute and + modify this program freely." + + LZARI, dated 4/7/89, marked as "Use, distribute and + modify this program freely." + + LZHUF, dated 11/20/88, written by Haruyasu Yoshizaki, + translated by Haruhiko Okumura on 4/7/89. Not + expressly marked as redistributable or modifiable. + +Since both LZSS and LZARI are marked as "use, distribute and modify freely" we +have felt at liberty basing our compression algorithm on either of these. + +Selection of Algorithm +---------------------- + +Working samples of three possible compression algorithms are supplied in +Okumura's paper. Which should be used? + +LZSS is the fastest at decompression, but does not generated as small a +compressed file as the other methods. The other two methods provided, perhaps, +a 15% improvement in compression. Or, put another way, on a 100K file, LZSS +might compress it to 50K while the others might approach 40-45K. For STEP +purposes, it was decided that decoding speed was of more importance than +tighter compression. For these reasons, the first compression algorithm +implemented is the LZSS algorithm. + +About LZSS Encoding +------------------- + +(adapted from Haruhiko Okumura's paper) + +This scheme was proposed by Ziv and Lempel [1]. A slightly modified version +is described by Storer and Szymanski [2]. An implementation using a binary +tree has been proposed by Bell [3]. + +The algorithm is quite simple. +1. Keep a ring buffer which initially contains all space characters. +2. Read several letters from the file to the buffer. +3. Search the buffer for the longest string that matches the letters just + read, and send its length and position into the buffer. + +If the ring buffer is 4096 bytes, the position can be stored in 12 bits. If the +length is represented in 4 bits, the pair is two bytes +long. If the longest match is no more than two characters, then just one +character is sent without encoding. The process starts again with the next +character. An extra bit is sent each time to tell the decoder whether the +next item is a character of a pair. + +[1] J. Ziv and A. Lempel, IEEE Transactions IT-23, 337-343 (1977). +[2] J. A. Storer and T. G. Szymanski, J. ACM, 29, 928-951 (1982). +[3] T.C. Gell, IEEE Transactions COM-34, 1176-1182 (1986). + +class SWCompress { +public: +void InitTree( // no return value + void); // no parameters + +void InsertNode( // no return value + short int Pos); // position in the buffer + +void DeleteNode( // no return value + short int Node); // node to be removed + +void Encode( // no return value + void); // no parameters + +void Decode( // no return value + void); // no parameters +}; + +// The following are constant sizes used by the compression algorithm. +// +// N - This is the size of the ring buffer. It is set +// to 4K. It is important to note that a position +// within the ring buffer requires 12 bits. +// +// F - This is the maximum length of a character sequence +// that can be taken from the ring buffer. It is set +// to 18. Note that a length must be 3 before it is +// worthwhile to store a position/length pair, so the +// length can be encoded in only 4 bits. Or, put yet +// another way, it is not necessary to encode a length +// of 0-18, it is necessary to encode a length of +// 3-18, which requires 4 bits. +// +// THRESHOLD - It takes 2 bytes to store an offset and +// a length. If a character sequence only +// requires 1 or 2 characters to store +// uncompressed, then it is better to store +// it uncompressed than as an offset into +// the ring buffer. +// +// Note that the 12 bits used to store the position and the 4 bits +// used to store the length equal a total of 16 bits, or 2 bytes. + +#define N 4096 +#define F 18 +#define THRESHOLD 3 +#define NOT_USED N + +// m_ring_buffer is a text buffer. It contains "nodes" of +// uncompressed text that can be indexed by position. That is, +// a substring of the ring buffer can be indexed by a position +// and a length. When decoding, the compressed text may contain +// a position in the ring buffer and a count of the number of +// bytes from the ring buffer that are to be moved into the +// uncompressed buffer. +// +// This ring buffer is not maintained as part of the compressed +// text. Instead, it is reconstructed dynamically. That is, +// it starts out empty and gets built as the text is decompressed. +// +// The ring buffer contain N bytes, with an additional F - 1 bytes +// to facilitate string comparison. + +unsigned char m_ring_buffer[N + F - 1]; + +// m_match_position and m_match_length are set by InsertNode(). +// +// These variables indicate the position in the ring buffer +// and the number of characters at that position that match +// a given string. + +short int m_match_position; +short int m_match_length; + +// m_lson, m_rson, and m_dad are the Japanese way of referring to +// a tree structure. The dad is the parent and it has a right and +// left son (child). +// +// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right +// and left children of node i. +// +// For i = 0 to N-1, m_dad[i] is the parent of node i. +// +// For i = 0 to 255, rson[N + i + 1] is the root of the tree for +// strings that begin with the character i. Note that this requires +// one byte characters. +// +// These nodes store values of 0...(N-1). Memory requirements +// can be reduces by using 2-byte integers instead of full 4-byte +// integers (for 32-bit applications). Therefore, these are +// defined as "short ints." + +short int m_lson[N + 1]; +short int m_rson[N + 257]; +short int m_dad[N + 1]; + + + + +/* + ------------------------------------------------------------------------- + cLZSS::InitTree + + This function initializes the tree nodes to "empty" states. + ------------------------------------------------------------------------- +*/ + +void cLZSS::InitTree( // no return value + void) // no parameters + throw() // exception list + + { + int i; + + // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right + // and left children of node i. These nodes need not be + // initialized. However, for debugging purposes, it is nice to + // have them initialized. Since this is only used for compression + // (not decompression), I don't mind spending the time to do it. + // + // For the same range of i, m_dad[i] is the parent of node i. + // These are initialized to a known value that can represent + // a "not used" state. + + for (i = 0; i < N; i++) + { + m_lson[i] = NOT_USED; + m_rson[i] = NOT_USED; + m_dad[i] = NOT_USED; + } + + // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree + // for strings that begin with the character i. This is why + // the right child array is larger than the left child array. + // These are also initialzied to a "not used" state. + // + // Note that there are 256 of these, one for each of the possible + // 256 characters. + + for (i = N + 1; i <= (N + 256); i++) + { + m_rson[i] = NOT_USED; + } + + // Done. + } + +/* + ------------------------------------------------------------------------- + cLZSS::InsertNode + + This function inserts a string from the ring buffer into one of + the trees. It loads the match position and length member variables + for the longest match. + + The string to be inserted is identified by the parameter Pos, + A full F bytes are inserted. So, m_ring_buffer[Pos ... Pos+F-1] + are inserted. + + If the matched length is exactly F, then an old node is removed + in favor of the new one (because the old one will be deleted + sooner). + + Note that Pos plays a dual role. It is used as both a position + in the ring buffer and also as a tree node. m_ring_buffer[Pos] + defines a character that is used to identify a tree node. + ------------------------------------------------------------------------- +*/ + +void cLZSS::InsertNode( // no return value + short int Pos) // position in the buffer + throw() // exception list + + { + short int i; + short int p; + int cmp; + unsigned char * key; + + ASSERT(Pos >= 0); + ASSERT(Pos < N); + + cmp = 1; + key = &(m_ring_buffer[Pos]); + + // The last 256 entries in m_rson contain the root nodes for + // strings that begin with a letter. Get an index for the + // first letter in this string. + + p = (short int) (N + 1 + key[0]); + + // Set the left and right tree nodes for this position to "not + // used." + + m_lson[Pos] = NOT_USED; + m_rson[Pos] = NOT_USED; + + // Haven't matched anything yet. + + m_match_length = 0; + + for ( ; ; ) + { + if (cmp >= 0) + { + if (m_rson[p] != NOT_USED) + { + p = m_rson[p]; + } + else + { + m_rson[p] = Pos; + m_dad[Pos] = p; + return; + } + } + else + { + if (m_lson[p] != NOT_USED) + { + p = m_lson[p]; + } + else + { + m_lson[p] = Pos; + m_dad[Pos] = p; + return; + } + } + + // Should we go to the right or the left to look for the + // next match? + + for (i = 1; i < F; i++) + { + cmp = key[i] - m_ring_buffer[p + i]; + if (cmp != 0) + break; + } + + if (i > m_match_length) + { + m_match_position = p; + m_match_length = i; + + if (i >= F) + break; + } + } + + m_dad[Pos] = m_dad[p]; + m_lson[Pos] = m_lson[p]; + m_rson[Pos] = m_rson[p]; + + m_dad[ m_lson[p] ] = Pos; + m_dad[ m_rson[p] ] = Pos; + + if (m_rson[ m_dad[p] ] == p) + { + m_rson[ m_dad[p] ] = Pos; + } + else + { + m_lson[ m_dad[p] ] = Pos; + } + + // Remove "p" + + m_dad[p] = NOT_USED; + } + +/* + ------------------------------------------------------------------------- + cLZSS::DeleteNode + + This function removes the node "Node" from the tree. + ------------------------------------------------------------------------- +*/ + +void cLZSS::DeleteNode( // no return value + short int Node) // node to be removed + throw() // exception list + + { + short int q; + + ASSERT(Node >= 0); + ASSERT(Node < (N+1)); + + if (m_dad[Node] == NOT_USED) + { + // not in tree, nothing to do + return; + } + + if (m_rson[Node] == NOT_USED) + { + q = m_lson[Node]; + } + else if (m_lson[Node] == NOT_USED) + { + q = m_rson[Node]; + } + else + { + q = m_lson[Node]; + if (m_rson[q] != NOT_USED) + { + do + { + q = m_rson[q]; + } + while (m_rson[q] != NOT_USED); + + m_rson[ m_dad[q] ] = m_lson[q]; + m_dad[ m_lson[q] ] = m_dad[q]; + m_lson[q] = m_lson[Node]; + m_dad[ m_lson[Node] ] = q; + } + + m_rson[q] = m_rson[Node]; + m_dad[ m_rson[Node] ] = q; + } + + m_dad[q] = m_dad[Node]; + + if (m_rson[ m_dad[Node] ] == Node) + { + m_rson[ m_dad[Node] ] = q; + } + else + { + m_lson[ m_dad[Node] ] = q; + } + + m_dad[Node] = NOT_USED; + } + +/* + ------------------------------------------------------------------------- + cLZSS::Encode + + This function "encodes" the input stream into the output stream. + The GetChars() and SendChars() functions are used to separate + this method from the actual i/o. + ------------------------------------------------------------------------- +*/ + +void cLZSS::Encode( // no return value + void) // no parameters + + { + short int i; // an iterator + short int r; // node number in the binary tree + short int s; // position in the ring buffer + unsigned short int len; // len of initial string + short int last_match_length; // length of last match + short int code_buf_pos; // position in the output buffer + unsigned char code_buf[17]; // the output buffer + unsigned char mask; // bit mask for byte 0 of out buf + unsigned char c; // character read from string + + // Start with a clean tree. + + InitTree(); + + // code_buf[0] works as eight flags. A "1" represents that the + // unit is an unencoded letter (1 byte), and a "0" represents + // that the next unit is a pair (2 bytes). + // + // code_buf[1..16] stores eight units of code. Since the best + // we can do is store eight pairs, at most 16 + // bytes are needed to store this. + // + // This is why the maximum size of the code buffer is 17 bytes. + + code_buf[0] = 0; + code_buf_pos = 1; + + // Mask iterates over the 8 bits in the code buffer. The first + // character ends up being stored in the low bit. + // + // bit 8 7 6 5 4 3 2 1 + // | | + // | first sequence in code buffer + // | + // last sequence in code buffer + + mask = 1; + + s = 0; + r = (short int) N - (short int) F; + + // Initialize the ring buffer with spaces... + + // Note that the last F bytes of the ring buffer are not filled. + // This is because those F bytes will be filled in immediately + // with bytes from the input stream. + + memset(m_ring_buffer, ' ', N - F); + + // Read F bytes into the last F bytes of the ring buffer. + // + // This function loads the buffer with X characters and returns + // the actual amount loaded. + + len = GetChars(&(m_ring_buffer[r]), F); + + // Make sure there is something to be compressed. + + if (len == 0) + return; + + // Insert the F strings, each of which begins with one or more + // 'space' characters. Note the order in which these strings + // are inserted. This way, degenerate trees will be less likely + // to occur. + + for (i = 1; i <= F; i++) + { + InsertNode((short int) (r - i)); + } + + // Finally, insert the whole string just read. The + // member variables match_length and match_position are set. + + InsertNode(r); + + // Now that we're preloaded, continue till done. + + do + { + + // m_match_length may be spuriously long near the end of + // text. + + if (m_match_length > len) + { + m_match_length = len; + } + + // Is it cheaper to store this as a single character? If so, + // make it so. + + if (m_match_length < THRESHOLD) + { + // Send one character. Remember that code_buf[0] is the + // set of flags for the next eight items. + + m_match_length = 1; + code_buf[0] |= mask; + code_buf[code_buf_pos++] = m_ring_buffer[r]; + } + + // Otherwise, we do indeed have a string that can be stored + // compressed to save space. + + else + { + // The next 16 bits need to contain the position (12 bits) + // and the length (4 bits). + + code_buf[code_buf_pos++] = (unsigned char) m_match_position; + code_buf[code_buf_pos++] = (unsigned char) ( + ((m_match_position >> 4) & 0xf0) | + (m_match_length - THRESHOLD) ); + } + + // Shift the mask one bit to the left so that it will be ready + // to store the new bit. + + mask = (unsigned char) (mask << 1); + + // If the mask is now 0, then we know that we have a full set + // of flags and items in the code buffer. These need to be + // output. + + if (mask == 0) + { + // code_buf is the buffer of characters to be output. + // code_buf_pos is the number of characters it contains. + + SendChars(code_buf, code_buf_pos); + + // Reset for next buffer... + + code_buf[0] = 0; + code_buf_pos = 1; + mask = 1; + } + + last_match_length = m_match_length; + + // Delete old strings and read new bytes... + + for (i = 0; i < last_match_length; i++) + { + + // Get next character... + + if (GetChars(&c, 1) != 1) + break; + + // Delete "old strings" + + DeleteNode(s); + + // Put this character into the ring buffer. + // + // The original comment here says "If the position is near + // the end of the buffer, extend the buffer to make + // string comparison easier." + // + // That's a little misleading, because the "end" of the + // buffer is really what we consider to be the "beginning" + // of the buffer, that is, positions 0 through F. + // + // The idea is that the front end of the buffer is duplicated + // into the back end so that when you're looking at characters + // at the back end of the buffer, you can index ahead (beyond + // the normal end of the buffer) and see the characters + // that are at the front end of the buffer wihtout having + // to adjust the index. + // + // That is... + // + // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234 + // | | | + // position 0 end of buffer | + // | + // duplicate of front of buffer + + m_ring_buffer[s] = c; + + if (s < F - 1) + { + m_ring_buffer[s + N] = c; + } + + // Increment the position, and wrap around when we're at + // the end. Note that this relies on N being a power of 2. + + s = (short int) ( (s + 1) & (N - 1) ); + r = (short int) ( (r + 1) & (N - 1) ); + + // Register the string that is found in + // m_ring_buffer[r..r+F-1]. + + InsertNode(r); + } + + // If we didn't quit because we hit the last_match_length, + // then we must have quit because we ran out of characters + // to process. + + while (i++ < last_match_length) + { + DeleteNode(s); + + s = (short int) ( (s + 1) & (N - 1) ); + r = (short int) ( (r + 1) & (N - 1) ); + + // Note that len hitting 0 is the key that causes the + // do...while() to terminate. This is the only place + // within the loop that len is modified. + // + // Its original value is F (or a number less than F for + // short strings). + + if (--len) + { + InsertNode(r); /* buffer may not be empty. */ + } + } + + // End of do...while() loop. Continue processing until there + // are no more characters to be compressed. The variable + // "len" is used to signal this condition. + } + while (len > 0); + + // There could still be something in the output buffer. Send it + // now. + + if (code_buf_pos > 1) + { + // code_buf is the encoded string to send. + // code_buf_ptr is the number of characters. + + SendChars(code_buf, code_buf_pos); + } + + // Done! + } + +/* + ------------------------------------------------------------------------- + cLZSS::Decode + + This function "decodes" the input stream into the output stream. + The GetChars() and SendChars() functions are used to separate + this method from the actual i/o. + ------------------------------------------------------------------------- +*/ + +void cLZSS::Decode( // no return value + void) // no parameters + + { + int k; + int r; // node number + unsigned char c[F]; // an array of chars + unsigned char flags; // 8 bits of flags + int flag_count; // which flag we're on + short int pos; // position in the ring buffer + short int len; // number of chars in ring buffer + + // Initialize the ring buffer with a common string. + // + // Note that the last F bytes of the ring buffer are not filled. + + memset(m_ring_buffer, ' ', N - F); + + r = N - F; + + flags = (char) 0; + flag_count = 0; + + for ( ; ; ) + { + + // If there are more bits of interest in this flag, then + // shift that next interesting bit into the 1's position. + // + // If this flag has been exhausted, the next byte must + // be a flag. + + if (flag_count > 0) + { + flags = (unsigned char) (flags >> 1); + flag_count--; + } + else + { + // Next byte must be a flag. + + if (GetChars(&flags, 1) != 1) + break; + + // Set the flag counter. While at first it might appear + // that this should be an 8 since there are 8 bits in the + // flag, it should really be a 7 because the shift must + // be performed 7 times in order to see all 8 bits. + + flag_count = 7; + } + + // If the low order bit of the flag is now set, then we know + // that the next byte is a single, unencoded character. + + if (flags & 1) + { + if (GetChars(c, 1) != 1) + break; + + if (SendChars(c, 1) != 1) + break; + + // Add to buffer, and increment to next spot. Wrap at end. + + m_ring_buffer[r] = c[0]; + r = (short int) ( (r + 1) & (N - 1) ); + } + + // Otherwise, we know that the next two bytes are a + // pair. The position is in 12 bits and + // the length is in 4 bits. + + else + { + // Original code: + // if ((i = getc(infile)) == EOF) + // break; + // if ((j = getc(infile)) == EOF) + // break; + // i |= ((j & 0xf0) << 4); + // j = (j & 0x0f) + THRESHOLD; + // + // I've modified this to only make one input call, and + // have changed the variable names to something more + // obvious. + + if (GetChars(c, 2) != 2) + break; + + // Convert these two characters into the position and + // length. Note that the length is always at least + // THRESHOLD, which is why we're able to get a length + // of 18 out of only 4 bits. + + pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) ); + + len = (short int) ( (c[1] & 0x0f) + THRESHOLD ); + + // There are now "len" characters at position "pos" in + // the ring buffer that can be pulled out. Note that + // len is never more than F. + + for (k = 0; k < len; k++) + { + c[k] = m_ring_buffer[(pos + k) & (N - 1)]; + + // Add to buffer, and increment to next spot. Wrap at end. + + m_ring_buffer[r] = c[k]; + r = (short int) ( (r + 1) & (N - 1) ); + } + + // Add the "len" characters to the output stream. + + if (SendChars(c, len) != len) + break; + } + } + } + diff --git a/src/modules/common/rawstr.cpp b/src/modules/common/rawstr.cpp index 6f17628..788ab6e 100644 --- a/src/modules/common/rawstr.cpp +++ b/src/modules/common/rawstr.cpp @@ -1,11 +1,13 @@ /****************************************************************************** - * rawstr.cpp - code for class 'RawStr'- a module that reads raw text - * files: ot and nt using indexs ??.bks ??.cps ??.vss - * and provides lookup and parsing functions based on - * class StrKey * + * rawstr.cpp - code for class 'RawStr'- a module that reads raw text + * files: ot and nt using indexs ??.bks ??.cps ??.vss + * and provides lookup and parsing functions based on + * class StrKey * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * $Id: rawstr.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 1998-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -54,7 +56,7 @@ const int RawStr::IDXENTRYSIZE = 6; * (e.g. 'modules/texts/rawtext/webster/') */ -RawStr::RawStr(const char *ipath, int fileMode) +RawStr::RawStr(const char *ipath, int fileMode, bool caseSensitive) : caseSensitive(caseSensitive) { SWBuf buf; @@ -121,7 +123,7 @@ void RawStr::getIDXBufDat(long ioffset, char **buf) const datfd->read(*buf, size); } (*buf)[size] = 0; - toupperstr_utf8(*buf, size*2); + if (!caseSensitive) toupperstr_utf8(*buf, size*2); } else { *buf = (*buf) ? (char *)realloc(*buf, 1) : (char *)malloc(1); @@ -181,7 +183,7 @@ signed char RawStr::findOffset(const char *ikey, __u32 *start, __u16 *size, long headoff = 0; stdstr(&key, ikey, 3); - toupperstr_utf8(key, strlen(key)*3); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*3); int keylen = strlen(key); bool substr = false; @@ -302,7 +304,7 @@ signed char RawStr::findOffset(const char *ikey, __u32 *start, __u16 *size, long * */ -void RawStr::readText(__u32 istart, __u16 *isize, char **idxbuf, SWBuf &buf) +void RawStr::readText(__u32 istart, __u16 *isize, char **idxbuf, SWBuf &buf) const { unsigned int ch; char *idxbuflocal = 0; @@ -380,7 +382,7 @@ void RawStr::doSetText(const char *ikey, const char *buf, long len) char errorStatus = findOffset(ikey, &start, &size, 0, &idxoff); stdstr(&key, ikey, 2); - toupperstr_utf8(key, strlen(key)*2); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*2); len = (len < 0) ? strlen(buf) : len; diff --git a/src/modules/common/rawstr4.cpp b/src/modules/common/rawstr4.cpp index 003b2fe..e2ce899 100644 --- a/src/modules/common/rawstr4.cpp +++ b/src/modules/common/rawstr4.cpp @@ -1,11 +1,13 @@ /****************************************************************************** - * rawstr.cpp - code for class 'RawStr'- a module that reads raw text - * files: ot and nt using indexs ??.bks ??.cps ??.vss - * and provides lookup and parsing functions based on - * class StrKey * + * rawstr4.cpp - code for class 'RawStr'- a module that reads raw text + * files: ot and nt using indexs ??.bks ??.cps ??.vss + * and provides lookup and parsing functions based on + * class StrKey * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * $Id: rawstr4.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -53,7 +55,7 @@ const int RawStr4::IDXENTRYSIZE = 8; * (e.g. 'modules/texts/rawtext/webster/') */ -RawStr4::RawStr4(const char *ipath, int fileMode) +RawStr4::RawStr4(const char *ipath, int fileMode, bool caseSensitive) : caseSensitive(caseSensitive) { SWBuf buf; @@ -121,7 +123,7 @@ void RawStr4::getIDXBufDat(long ioffset, char **buf) const datfd->read(*buf, size); } (*buf)[size] = 0; - toupperstr_utf8(*buf, size*2); + if (!caseSensitive) toupperstr_utf8(*buf, size*2); } else { *buf = (*buf) ? (char *)realloc(*buf, 1) : (char *)malloc(1); @@ -157,7 +159,7 @@ void RawStr4::getIDXBuf(long ioffset, char **buf) const } *targetbuf = 0; trybuf = 0; - toupperstr_utf8(targetbuf); + if (!caseSensitive) toupperstr_utf8(targetbuf); */ } } @@ -191,7 +193,7 @@ signed char RawStr4::findOffset(const char *ikey, __u32 *start, __u32 *size, lon headoff = 0; stdstr(&key, ikey, 3); - toupperstr_utf8(key, strlen(key)*3); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*3); int keylen = strlen(key); bool substr = false; @@ -311,7 +313,7 @@ signed char RawStr4::findOffset(const char *ikey, __u32 *start, __u32 *size, lon * */ -void RawStr4::readText(__u32 istart, __u32 *isize, char **idxbuf, SWBuf &buf) +void RawStr4::readText(__u32 istart, __u32 *isize, char **idxbuf, SWBuf &buf) const { unsigned int ch; char *idxbuflocal = 0; @@ -388,7 +390,7 @@ void RawStr4::doSetText(const char *ikey, const char *buf, long len) { char errorStatus = findOffset(ikey, &start, &size, 0, &idxoff); stdstr(&key, ikey, 3); - toupperstr_utf8(key, strlen(key)*3); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*3); len = (len < 0) ? strlen(buf) : len; getIDXBufDat(start, &dbKey); diff --git a/src/modules/common/rawverse.cpp b/src/modules/common/rawverse.cpp index 99beaa2..5527d38 100644 --- a/src/modules/common/rawverse.cpp +++ b/src/modules/common/rawverse.cpp @@ -1,11 +1,12 @@ /****************************************************************************** - * rawverse.cpp - code for class 'RawVerse'- a module that reads raw text + * + * rawverse.cpp - code for class 'RawVerse'- a module that reads raw text * files: ot and nt using indexs ??.bks ??.cps ??.vss * and provides lookup and parsing functions based on * class VerseKey * * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 1997-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -149,7 +150,7 @@ void RawVerse::findOffset(char testmt, long idxoff, long *start, unsigned short * */ -void RawVerse::readText(char testmt, long start, unsigned short size, SWBuf &buf) { +void RawVerse::readText(char testmt, long start, unsigned short size, SWBuf &buf) const { buf = ""; buf.setFillByte(0); buf.setSize(size + 1); @@ -278,15 +279,15 @@ char RawVerse::createModule(const char *ipath, const char *v11n) VerseKey vk; vk.setVersificationSystem(v11n); - vk.Headings(1); + vk.setIntros(1); __s32 offset = 0; __u16 size = 0; offset = archtosword32(offset); size = archtosword16(size); - for (vk = TOP; !vk.Error(); vk++) { - if (vk.Testament() < 2) { + for (vk = TOP; !vk.popError(); vk++) { + if (vk.getTestament() < 2) { fd->write(&offset, 4); fd->write(&size, 2); } diff --git a/src/modules/common/rawverse4.cpp b/src/modules/common/rawverse4.cpp index ee0b207..b87ea0d 100644 --- a/src/modules/common/rawverse4.cpp +++ b/src/modules/common/rawverse4.cpp @@ -1,11 +1,14 @@ /****************************************************************************** - * rawverse.cpp - code for class 'RawVerse4'- a module that reads raw text - * files: ot and nt using indexs ??.bks ??.cps ??.vss + * + * rawverse4.cpp - code for class 'RawVerse4'- a module that reads raw + * text files: + * ot and nt using indexs ??.bks ??.cps ??.vss * and provides lookup and parsing functions based on * class VerseKey * + * $Id: rawverse4.cpp 2833 2013-06-29 06:40:28Z chrislit $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 2007-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -149,7 +152,7 @@ void RawVerse4::findOffset(char testmt, long idxoff, long *start, unsigned long * */ -void RawVerse4::readText(char testmt, long start, unsigned long size, SWBuf &buf) { +void RawVerse4::readText(char testmt, long start, unsigned long size, SWBuf &buf) const { buf = ""; buf.setFillByte(0); buf.setSize(size + 1); @@ -278,14 +281,14 @@ char RawVerse4::createModule(const char *ipath, const char *v11n) VerseKey vk; vk.setVersificationSystem(v11n); - vk.Headings(1); + vk.setIntros(1); __u32 offset = 0; __u32 size = 0; offset = archtosword32(offset); size = archtosword32(size); - for (vk = TOP; !vk.Error(); vk++) { - if (vk.Testament() < 2) { + for (vk = TOP; !vk.popError(); vk++) { + if (vk.getTestament() < 2) { fd->write(&offset, 4); fd->write(&size, 4); } diff --git a/src/modules/common/sapphire.cpp b/src/modules/common/sapphire.cpp index 7b141d2..8cc3e15 100644 --- a/src/modules/common/sapphire.cpp +++ b/src/modules/common/sapphire.cpp @@ -1,15 +1,39 @@ -/* sapphire.cpp -- the Saphire II stream cipher class. - Dedicated to the Public Domain the author and inventor: - (Michael Paul Johnson). This code comes with no warranty. - Use it at your own risk. - Ported from the Pascal implementation of the Sapphire Stream - Cipher 9 December 1994. - Added hash pre- and post-processing 27 December 1994. - Modified initialization to make index variables key dependent, - made the output function more resistant to cryptanalysis, - and renamed to Sapphire II 2 January 1995 -*/ - +/****************************************************************************** + * + * sapphire.cpp - the Saphire II stream cipher class + * + * $Id: sapphire.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/****************************************************************************** + * + * Original license notice & credits: + * Dedicated to the Public Domain the author and inventor: + * (Michael Paul Johnson). This code comes with no warranty. + * Use it at your own risk. + * Ported from the Pascal implementation of the Sapphire Stream + * Cipher 9 December 1994. + * Added hash pre- and post-processing 27 December 1994. + * Modified initialization to make index variables key dependent, + * made the output function more resistant to cryptanalysis, + * and renamed to Sapphire II 2 January 1995 + * + */ #include diff --git a/src/modules/common/swcipher.cpp b/src/modules/common/swcipher.cpp index 5ab91ea..16279dc 100644 --- a/src/modules/common/swcipher.cpp +++ b/src/modules/common/swcipher.cpp @@ -1,9 +1,11 @@ /****************************************************************************** - * swcipher.cpp - code for class 'SWCipher'- a driver class that provides - * cipher utilities. * + * swcipher.cpp - code for class 'SWCipher'- a driver class that + * provides cipher utilities * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * $Id: swcipher.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 diff --git a/src/modules/common/swcomprs.cpp b/src/modules/common/swcomprs.cpp index 378c8b1..9df8e7d 100644 --- a/src/modules/common/swcomprs.cpp +++ b/src/modules/common/swcomprs.cpp @@ -1,9 +1,10 @@ /****************************************************************************** - * swcomprs.cpp - code for class 'SWCompress'- a driver class that provides - * compression utilities. * + * swcomprs.cpp - a driver class that provides compression utilities * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * $Id: swcomprs.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 1996-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 diff --git a/src/modules/common/swcomprs.doc b/src/modules/common/swcomprs.doc deleted file mode 100644 index b6817f2..0000000 --- a/src/modules/common/swcomprs.doc +++ /dev/null @@ -1,802 +0,0 @@ -The following is the original information send from Parson's Technologies via -Craig Rairden. -_______________________________________________________________________________ -Compression Info, 10-11-95 -Jeff Wheeler - -Source of Algorithm -------------------- - -The compression algorithms used here are based upon the algorithms developed -and published by Haruhiko Okumura in a paper entitled "Data Compression -Algorithms of LARC and LHarc." This paper discusses three compression -algorithms, LSZZ, LZARI, and LZHUF. LZSS is described as the "first" of -these, and is described as providing moderate compression with good speed. -LZARI is described as an improved LZSS, a combination of the LZSS algorithm -with adaptive arithmetic compression. It is described as being slower than -LZSS but with better compression. LZHUF (the basis of the common LHA -compression program) was included in the paper, however, a free usage license -was not included. - -The following are copies of the statements included at the beginning of each -source code listing that was supplied in the working paper. - - LZSS, dated 4/6/89, marked as "Use, distribute and - modify this program freely." - - LZARI, dated 4/7/89, marked as "Use, distribute and - modify this program freely." - - LZHUF, dated 11/20/88, written by Haruyasu Yoshizaki, - translated by Haruhiko Okumura on 4/7/89. Not - expressly marked as redistributable or modifiable. - -Since both LZSS and LZARI are marked as "use, distribute and modify freely" we -have felt at liberty basing our compression algorithm on either of these. - -Selection of Algorithm ----------------------- - -Working samples of three possible compression algorithms are supplied in -Okumura's paper. Which should be used? - -LZSS is the fastest at decompression, but does not generated as small a -compressed file as the other methods. The other two methods provided, perhaps, -a 15% improvement in compression. Or, put another way, on a 100K file, LZSS -might compress it to 50K while the others might approach 40-45K. For STEP -purposes, it was decided that decoding speed was of more importance than -tighter compression. For these reasons, the first compression algorithm -implemented is the LZSS algorithm. - -About LZSS Encoding -------------------- - -(adapted from Haruhiko Okumura's paper) - -This scheme was proposed by Ziv and Lempel [1]. A slightly modified version -is described by Storer and Szymanski [2]. An implementation using a binary -tree has been proposed by Bell [3]. - -The algorithm is quite simple. -1. Keep a ring buffer which initially contains all space characters. -2. Read several letters from the file to the buffer. -3. Search the buffer for the longest string that matches the letters just - read, and send its length and position into the buffer. - -If the ring buffer is 4096 bytes, the position can be stored in 12 bits. If the -length is represented in 4 bits, the pair is two bytes -long. If the longest match is no more than two characters, then just one -character is sent without encoding. The process starts again with the next -character. An extra bit is sent each time to tell the decoder whether the -next item is a character of a pair. - -[1] J. Ziv and A. Lempel, IEEE Transactions IT-23, 337-343 (1977). -[2] J. A. Storer and T. G. Szymanski, J. ACM, 29, 928-951 (1982). -[3] T.C. Gell, IEEE Transactions COM-34, 1176-1182 (1986). - -class SWCompress { -public: -void InitTree( // no return value - void); // no parameters - -void InsertNode( // no return value - short int Pos); // position in the buffer - -void DeleteNode( // no return value - short int Node); // node to be removed - -void Encode( // no return value - void); // no parameters - -void Decode( // no return value - void); // no parameters -}; - -// The following are constant sizes used by the compression algorithm. -// -// N - This is the size of the ring buffer. It is set -// to 4K. It is important to note that a position -// within the ring buffer requires 12 bits. -// -// F - This is the maximum length of a character sequence -// that can be taken from the ring buffer. It is set -// to 18. Note that a length must be 3 before it is -// worthwhile to store a position/length pair, so the -// length can be encoded in only 4 bits. Or, put yet -// another way, it is not necessary to encode a length -// of 0-18, it is necessary to encode a length of -// 3-18, which requires 4 bits. -// -// THRESHOLD - It takes 2 bytes to store an offset and -// a length. If a character sequence only -// requires 1 or 2 characters to store -// uncompressed, then it is better to store -// it uncompressed than as an offset into -// the ring buffer. -// -// Note that the 12 bits used to store the position and the 4 bits -// used to store the length equal a total of 16 bits, or 2 bytes. - -#define N 4096 -#define F 18 -#define THRESHOLD 3 -#define NOT_USED N - -// m_ring_buffer is a text buffer. It contains "nodes" of -// uncompressed text that can be indexed by position. That is, -// a substring of the ring buffer can be indexed by a position -// and a length. When decoding, the compressed text may contain -// a position in the ring buffer and a count of the number of -// bytes from the ring buffer that are to be moved into the -// uncompressed buffer. -// -// This ring buffer is not maintained as part of the compressed -// text. Instead, it is reconstructed dynamically. That is, -// it starts out empty and gets built as the text is decompressed. -// -// The ring buffer contain N bytes, with an additional F - 1 bytes -// to facilitate string comparison. - -unsigned char m_ring_buffer[N + F - 1]; - -// m_match_position and m_match_length are set by InsertNode(). -// -// These variables indicate the position in the ring buffer -// and the number of characters at that position that match -// a given string. - -short int m_match_position; -short int m_match_length; - -// m_lson, m_rson, and m_dad are the Japanese way of referring to -// a tree structure. The dad is the parent and it has a right and -// left son (child). -// -// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right -// and left children of node i. -// -// For i = 0 to N-1, m_dad[i] is the parent of node i. -// -// For i = 0 to 255, rson[N + i + 1] is the root of the tree for -// strings that begin with the character i. Note that this requires -// one byte characters. -// -// These nodes store values of 0...(N-1). Memory requirements -// can be reduces by using 2-byte integers instead of full 4-byte -// integers (for 32-bit applications). Therefore, these are -// defined as "short ints." - -short int m_lson[N + 1]; -short int m_rson[N + 257]; -short int m_dad[N + 1]; - - - - -/* - ------------------------------------------------------------------------- - cLZSS::InitTree - - This function initializes the tree nodes to "empty" states. - ------------------------------------------------------------------------- -*/ - -void cLZSS::InitTree( // no return value - void) // no parameters - throw() // exception list - - { - int i; - - // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right - // and left children of node i. These nodes need not be - // initialized. However, for debugging purposes, it is nice to - // have them initialized. Since this is only used for compression - // (not decompression), I don't mind spending the time to do it. - // - // For the same range of i, m_dad[i] is the parent of node i. - // These are initialized to a known value that can represent - // a "not used" state. - - for (i = 0; i < N; i++) - { - m_lson[i] = NOT_USED; - m_rson[i] = NOT_USED; - m_dad[i] = NOT_USED; - } - - // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree - // for strings that begin with the character i. This is why - // the right child array is larger than the left child array. - // These are also initialzied to a "not used" state. - // - // Note that there are 256 of these, one for each of the possible - // 256 characters. - - for (i = N + 1; i <= (N + 256); i++) - { - m_rson[i] = NOT_USED; - } - - // Done. - } - -/* - ------------------------------------------------------------------------- - cLZSS::InsertNode - - This function inserts a string from the ring buffer into one of - the trees. It loads the match position and length member variables - for the longest match. - - The string to be inserted is identified by the parameter Pos, - A full F bytes are inserted. So, m_ring_buffer[Pos ... Pos+F-1] - are inserted. - - If the matched length is exactly F, then an old node is removed - in favor of the new one (because the old one will be deleted - sooner). - - Note that Pos plays a dual role. It is used as both a position - in the ring buffer and also as a tree node. m_ring_buffer[Pos] - defines a character that is used to identify a tree node. - ------------------------------------------------------------------------- -*/ - -void cLZSS::InsertNode( // no return value - short int Pos) // position in the buffer - throw() // exception list - - { - short int i; - short int p; - int cmp; - unsigned char * key; - - ASSERT(Pos >= 0); - ASSERT(Pos < N); - - cmp = 1; - key = &(m_ring_buffer[Pos]); - - // The last 256 entries in m_rson contain the root nodes for - // strings that begin with a letter. Get an index for the - // first letter in this string. - - p = (short int) (N + 1 + key[0]); - - // Set the left and right tree nodes for this position to "not - // used." - - m_lson[Pos] = NOT_USED; - m_rson[Pos] = NOT_USED; - - // Haven't matched anything yet. - - m_match_length = 0; - - for ( ; ; ) - { - if (cmp >= 0) - { - if (m_rson[p] != NOT_USED) - { - p = m_rson[p]; - } - else - { - m_rson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - else - { - if (m_lson[p] != NOT_USED) - { - p = m_lson[p]; - } - else - { - m_lson[p] = Pos; - m_dad[Pos] = p; - return; - } - } - - // Should we go to the right or the left to look for the - // next match? - - for (i = 1; i < F; i++) - { - cmp = key[i] - m_ring_buffer[p + i]; - if (cmp != 0) - break; - } - - if (i > m_match_length) - { - m_match_position = p; - m_match_length = i; - - if (i >= F) - break; - } - } - - m_dad[Pos] = m_dad[p]; - m_lson[Pos] = m_lson[p]; - m_rson[Pos] = m_rson[p]; - - m_dad[ m_lson[p] ] = Pos; - m_dad[ m_rson[p] ] = Pos; - - if (m_rson[ m_dad[p] ] == p) - { - m_rson[ m_dad[p] ] = Pos; - } - else - { - m_lson[ m_dad[p] ] = Pos; - } - - // Remove "p" - - m_dad[p] = NOT_USED; - } - -/* - ------------------------------------------------------------------------- - cLZSS::DeleteNode - - This function removes the node "Node" from the tree. - ------------------------------------------------------------------------- -*/ - -void cLZSS::DeleteNode( // no return value - short int Node) // node to be removed - throw() // exception list - - { - short int q; - - ASSERT(Node >= 0); - ASSERT(Node < (N+1)); - - if (m_dad[Node] == NOT_USED) - { - // not in tree, nothing to do - return; - } - - if (m_rson[Node] == NOT_USED) - { - q = m_lson[Node]; - } - else if (m_lson[Node] == NOT_USED) - { - q = m_rson[Node]; - } - else - { - q = m_lson[Node]; - if (m_rson[q] != NOT_USED) - { - do - { - q = m_rson[q]; - } - while (m_rson[q] != NOT_USED); - - m_rson[ m_dad[q] ] = m_lson[q]; - m_dad[ m_lson[q] ] = m_dad[q]; - m_lson[q] = m_lson[Node]; - m_dad[ m_lson[Node] ] = q; - } - - m_rson[q] = m_rson[Node]; - m_dad[ m_rson[Node] ] = q; - } - - m_dad[q] = m_dad[Node]; - - if (m_rson[ m_dad[Node] ] == Node) - { - m_rson[ m_dad[Node] ] = q; - } - else - { - m_lson[ m_dad[Node] ] = q; - } - - m_dad[Node] = NOT_USED; - } - -/* - ------------------------------------------------------------------------- - cLZSS::Encode - - This function "encodes" the input stream into the output stream. - The GetChars() and SendChars() functions are used to separate - this method from the actual i/o. - ------------------------------------------------------------------------- -*/ - -void cLZSS::Encode( // no return value - void) // no parameters - - { - short int i; // an iterator - short int r; // node number in the binary tree - short int s; // position in the ring buffer - unsigned short int len; // len of initial string - short int last_match_length; // length of last match - short int code_buf_pos; // position in the output buffer - unsigned char code_buf[17]; // the output buffer - unsigned char mask; // bit mask for byte 0 of out buf - unsigned char c; // character read from string - - // Start with a clean tree. - - InitTree(); - - // code_buf[0] works as eight flags. A "1" represents that the - // unit is an unencoded letter (1 byte), and a "0" represents - // that the next unit is a pair (2 bytes). - // - // code_buf[1..16] stores eight units of code. Since the best - // we can do is store eight pairs, at most 16 - // bytes are needed to store this. - // - // This is why the maximum size of the code buffer is 17 bytes. - - code_buf[0] = 0; - code_buf_pos = 1; - - // Mask iterates over the 8 bits in the code buffer. The first - // character ends up being stored in the low bit. - // - // bit 8 7 6 5 4 3 2 1 - // | | - // | first sequence in code buffer - // | - // last sequence in code buffer - - mask = 1; - - s = 0; - r = (short int) N - (short int) F; - - // Initialize the ring buffer with spaces... - - // Note that the last F bytes of the ring buffer are not filled. - // This is because those F bytes will be filled in immediately - // with bytes from the input stream. - - memset(m_ring_buffer, ' ', N - F); - - // Read F bytes into the last F bytes of the ring buffer. - // - // This function loads the buffer with X characters and returns - // the actual amount loaded. - - len = GetChars(&(m_ring_buffer[r]), F); - - // Make sure there is something to be compressed. - - if (len == 0) - return; - - // Insert the F strings, each of which begins with one or more - // 'space' characters. Note the order in which these strings - // are inserted. This way, degenerate trees will be less likely - // to occur. - - for (i = 1; i <= F; i++) - { - InsertNode((short int) (r - i)); - } - - // Finally, insert the whole string just read. The - // member variables match_length and match_position are set. - - InsertNode(r); - - // Now that we're preloaded, continue till done. - - do - { - - // m_match_length may be spuriously long near the end of - // text. - - if (m_match_length > len) - { - m_match_length = len; - } - - // Is it cheaper to store this as a single character? If so, - // make it so. - - if (m_match_length < THRESHOLD) - { - // Send one character. Remember that code_buf[0] is the - // set of flags for the next eight items. - - m_match_length = 1; - code_buf[0] |= mask; - code_buf[code_buf_pos++] = m_ring_buffer[r]; - } - - // Otherwise, we do indeed have a string that can be stored - // compressed to save space. - - else - { - // The next 16 bits need to contain the position (12 bits) - // and the length (4 bits). - - code_buf[code_buf_pos++] = (unsigned char) m_match_position; - code_buf[code_buf_pos++] = (unsigned char) ( - ((m_match_position >> 4) & 0xf0) | - (m_match_length - THRESHOLD) ); - } - - // Shift the mask one bit to the left so that it will be ready - // to store the new bit. - - mask = (unsigned char) (mask << 1); - - // If the mask is now 0, then we know that we have a full set - // of flags and items in the code buffer. These need to be - // output. - - if (mask == 0) - { - // code_buf is the buffer of characters to be output. - // code_buf_pos is the number of characters it contains. - - SendChars(code_buf, code_buf_pos); - - // Reset for next buffer... - - code_buf[0] = 0; - code_buf_pos = 1; - mask = 1; - } - - last_match_length = m_match_length; - - // Delete old strings and read new bytes... - - for (i = 0; i < last_match_length; i++) - { - - // Get next character... - - if (GetChars(&c, 1) != 1) - break; - - // Delete "old strings" - - DeleteNode(s); - - // Put this character into the ring buffer. - // - // The original comment here says "If the position is near - // the end of the buffer, extend the buffer to make - // string comparison easier." - // - // That's a little misleading, because the "end" of the - // buffer is really what we consider to be the "beginning" - // of the buffer, that is, positions 0 through F. - // - // The idea is that the front end of the buffer is duplicated - // into the back end so that when you're looking at characters - // at the back end of the buffer, you can index ahead (beyond - // the normal end of the buffer) and see the characters - // that are at the front end of the buffer wihtout having - // to adjust the index. - // - // That is... - // - // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234 - // | | | - // position 0 end of buffer | - // | - // duplicate of front of buffer - - m_ring_buffer[s] = c; - - if (s < F - 1) - { - m_ring_buffer[s + N] = c; - } - - // Increment the position, and wrap around when we're at - // the end. Note that this relies on N being a power of 2. - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Register the string that is found in - // m_ring_buffer[r..r+F-1]. - - InsertNode(r); - } - - // If we didn't quit because we hit the last_match_length, - // then we must have quit because we ran out of characters - // to process. - - while (i++ < last_match_length) - { - DeleteNode(s); - - s = (short int) ( (s + 1) & (N - 1) ); - r = (short int) ( (r + 1) & (N - 1) ); - - // Note that len hitting 0 is the key that causes the - // do...while() to terminate. This is the only place - // within the loop that len is modified. - // - // Its original value is F (or a number less than F for - // short strings). - - if (--len) - { - InsertNode(r); /* buffer may not be empty. */ - } - } - - // End of do...while() loop. Continue processing until there - // are no more characters to be compressed. The variable - // "len" is used to signal this condition. - } - while (len > 0); - - // There could still be something in the output buffer. Send it - // now. - - if (code_buf_pos > 1) - { - // code_buf is the encoded string to send. - // code_buf_ptr is the number of characters. - - SendChars(code_buf, code_buf_pos); - } - - // Done! - } - -/* - ------------------------------------------------------------------------- - cLZSS::Decode - - This function "decodes" the input stream into the output stream. - The GetChars() and SendChars() functions are used to separate - this method from the actual i/o. - ------------------------------------------------------------------------- -*/ - -void cLZSS::Decode( // no return value - void) // no parameters - - { - int k; - int r; // node number - unsigned char c[F]; // an array of chars - unsigned char flags; // 8 bits of flags - int flag_count; // which flag we're on - short int pos; // position in the ring buffer - short int len; // number of chars in ring buffer - - // Initialize the ring buffer with a common string. - // - // Note that the last F bytes of the ring buffer are not filled. - - memset(m_ring_buffer, ' ', N - F); - - r = N - F; - - flags = (char) 0; - flag_count = 0; - - for ( ; ; ) - { - - // If there are more bits of interest in this flag, then - // shift that next interesting bit into the 1's position. - // - // If this flag has been exhausted, the next byte must - // be a flag. - - if (flag_count > 0) - { - flags = (unsigned char) (flags >> 1); - flag_count--; - } - else - { - // Next byte must be a flag. - - if (GetChars(&flags, 1) != 1) - break; - - // Set the flag counter. While at first it might appear - // that this should be an 8 since there are 8 bits in the - // flag, it should really be a 7 because the shift must - // be performed 7 times in order to see all 8 bits. - - flag_count = 7; - } - - // If the low order bit of the flag is now set, then we know - // that the next byte is a single, unencoded character. - - if (flags & 1) - { - if (GetChars(c, 1) != 1) - break; - - if (SendChars(c, 1) != 1) - break; - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[0]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Otherwise, we know that the next two bytes are a - // pair. The position is in 12 bits and - // the length is in 4 bits. - - else - { - // Original code: - // if ((i = getc(infile)) == EOF) - // break; - // if ((j = getc(infile)) == EOF) - // break; - // i |= ((j & 0xf0) << 4); - // j = (j & 0x0f) + THRESHOLD; - // - // I've modified this to only make one input call, and - // have changed the variable names to something more - // obvious. - - if (GetChars(c, 2) != 2) - break; - - // Convert these two characters into the position and - // length. Note that the length is always at least - // THRESHOLD, which is why we're able to get a length - // of 18 out of only 4 bits. - - pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) ); - - len = (short int) ( (c[1] & 0x0f) + THRESHOLD ); - - // There are now "len" characters at position "pos" in - // the ring buffer that can be pulled out. Note that - // len is never more than F. - - for (k = 0; k < len; k++) - { - c[k] = m_ring_buffer[(pos + k) & (N - 1)]; - - // Add to buffer, and increment to next spot. Wrap at end. - - m_ring_buffer[r] = c[k]; - r = (short int) ( (r + 1) & (N - 1) ); - } - - // Add the "len" characters to the output stream. - - if (SendChars(c, len) != len) - break; - } - } - } - diff --git a/src/modules/common/xzcomprs.cpp b/src/modules/common/xzcomprs.cpp new file mode 100644 index 0000000..db8a4a8 --- /dev/null +++ b/src/modules/common/xzcomprs.cpp @@ -0,0 +1,181 @@ +/****************************************************************************** + * + * xzcomprs.cpp - XzCompress, a driver class that provides xz (LZMA2) + * compression + * + * $Id: xzcomprs.cpp 2850 2013-07-02 09:57:20Z chrislit $ + * + * Copyright 2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + + +#include +#include +#include +#include +#include + +SWORD_NAMESPACE_START + +/****************************************************************************** + * XzCompress Constructor - Initializes data for instance of XzCompress + * + */ + +XzCompress::XzCompress() : SWCompress() { +} + + +/****************************************************************************** + * XzCompress Destructor - Cleans up instance of XzCompress + */ + +XzCompress::~XzCompress() { +} + + +/****************************************************************************** + * XzCompress::Encode - This function "encodes" the input stream into the + * output stream. + * The GetChars() and SendChars() functions are + * used to separate this method from the actual + * i/o. + * NOTE: must set zlen for parent class to know length of + * compressed buffer. + */ + +void XzCompress::Encode(void) +{ +/* +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least 0.1% larger than + sourceLen plus 12 bytes. Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + direct = 0; // set direction needed by parent [Get|Send]Chars() + + // get buffer + char chunk[1024]; + char *buf = (char *)calloc(1, 1024); + char *chunkbuf = buf; + unsigned long chunklen; + unsigned long len = 0; + while((chunklen = GetChars(chunk, 1023))) { + memcpy(chunkbuf, chunk, chunklen); + len += chunklen; + if (chunklen < 1023) + break; + else buf = (char *)realloc(buf, len + 1024); + chunkbuf = buf+len; + } + + + zlen = (long) (len*1.001)+15; + char *zbuf = new char[zlen+1]; + if (len) + { + //printf("Doing compress\n"); + if (compress((Bytef*)zbuf, &zlen, (const Bytef*)buf, len) != Z_OK) + { + printf("ERROR in compression\n"); + } + else { + SendChars(zbuf, zlen); + } + } + else + { + fprintf(stderr, "ERROR: no buffer to compress\n"); + } + delete [] zbuf; + free (buf); +} + + +/****************************************************************************** + * XzCompress::Decode - This function "decodes" the input stream into the + * output stream. + * The GetChars() and SendChars() functions are + * used to separate this method from the actual + * i/o. + */ + +void XzCompress::Decode(void) +{ +/* +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. (The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the compressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted. +*/ + + // get buffer + char chunk[1024]; + char *zbuf = (char *)calloc(1, 1024); + char *chunkbuf = zbuf; + int chunklen; + unsigned long zlen = 0; + while((chunklen = GetChars(chunk, 1023))) { + memcpy(chunkbuf, chunk, chunklen); + zlen += chunklen; + if (chunklen < 1023) + break; + else zbuf = (char *)realloc(zbuf, zlen + 1024); + chunkbuf = zbuf + zlen; + } + + //printf("Decoding complength{%ld} uncomp{%ld}\n", zlen, blen); + if (zlen) { + unsigned long blen = zlen*20; // trust compression is less than 1000% + char *buf = new char[blen]; + //printf("Doing decompress {%s}\n", zbuf); + slen = 0; + switch (uncompress((Bytef*)buf, &blen, (Bytef*)zbuf, zlen)){ + case Z_OK: SendChars(buf, blen); slen = blen; break; + case Z_MEM_ERROR: fprintf(stderr, "ERROR: not enough memory during decompression.\n"); break; + case Z_BUF_ERROR: fprintf(stderr, "ERROR: not enough room in the out buffer during decompression.\n"); break; + case Z_DATA_ERROR: fprintf(stderr, "ERROR: corrupt data during decompression.\n"); break; + default: fprintf(stderr, "ERROR: an unknown error occured during decompression.\n"); break; + } + delete [] buf; + } + else { + fprintf(stderr, "ERROR: no buffer to decompress!\n"); + } + //printf("Finished decoding\n"); + free (zbuf); +} + +SWORD_NAMESPACE_END diff --git a/src/modules/common/zipcomprs.cpp b/src/modules/common/zipcomprs.cpp index 534d840..3e44abd 100644 --- a/src/modules/common/zipcomprs.cpp +++ b/src/modules/common/zipcomprs.cpp @@ -1,9 +1,11 @@ /****************************************************************************** - * swcomprs.cpp - code for class 'ZipCompress'- a driver class that provides - * compression utilities. - using zlib * + * zipcomprs.cpp - ZipCompress, a driver class that provides zlib + * compression + * + * $Id: zipcomprs.cpp 2833 2013-06-29 06:40:28Z chrislit $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 2000-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 diff --git a/src/modules/common/zstr.cpp b/src/modules/common/zstr.cpp index 5b4da64..a745502 100644 --- a/src/modules/common/zstr.cpp +++ b/src/modules/common/zstr.cpp @@ -1,10 +1,12 @@ /****************************************************************************** - * zstr.cpp - code for class 'zStr'- a module that reads compressed text + * + * zstr.cpp - code for class 'zStr'- a module that reads compressed text * files and provides lookup and parsing functions based on * class StrKey * + * $Id: zstr.cpp 2980 2013-09-14 21:51:47Z scribe $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -54,11 +56,10 @@ const int zStr::ZDXENTRYSIZE = 8; * ENT: ipath - path of the directory where data and index files are located. */ -zStr::zStr(const char *ipath, int fileMode, long blockCount, SWCompress *icomp) +zStr::zStr(const char *ipath, int fileMode, long blockCount, SWCompress *icomp, bool caseSensitive) : caseSensitive(caseSensitive) { SWBuf buf; - nl = '\n'; lastoff = -1; path = 0; stdstr(&path, ipath); @@ -144,7 +145,7 @@ void zStr::getKeyFromDatOffset(long ioffset, char **buf) const datfd->read(*buf, size); } (*buf)[size] = 0; - toupperstr_utf8(*buf, size*2); + if (!caseSensitive) toupperstr_utf8(*buf, size*2); } else { *buf = (*buf) ? (char *)realloc(*buf, 1) : (char *)malloc(1); @@ -201,7 +202,7 @@ signed char zStr::findKeyIndex(const char *ikey, long *idxoff, long away) const if (*ikey) { headoff = 0; stdstr(&key, ikey, 3); - toupperstr_utf8(key, strlen(key)*3); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*3); int keylen = strlen(key); bool substr = false; @@ -321,7 +322,7 @@ signed char zStr::findKeyIndex(const char *ikey, long *idxoff, long away) const * */ -void zStr::getText(long offset, char **idxbuf, char **buf) { +void zStr::getText(long offset, char **idxbuf, char **buf) const { char *ch; char *idxbuflocal = 0; getKeyFromIdxOffset(offset, &idxbuflocal); @@ -386,7 +387,7 @@ void zStr::getText(long offset, char **idxbuf, char **buf) { * file. */ -void zStr::getCompressedText(long block, long entry, char **buf) { +void zStr::getCompressedText(long block, long entry, char **buf) const { __u32 size = 0; @@ -431,12 +432,13 @@ void zStr::getCompressedText(long block, long entry, char **buf) { void zStr::setText(const char *ikey, const char *buf, long len) { + static const char nl[] = {13, 10}; + __u32 start, outstart; __u32 size, outsize; __s32 endoff; long idxoff = 0; __s32 shiftSize; - static const char nl[] = {13, 10}; char *tmpbuf = 0; char *key = 0; char *dbKey = 0; @@ -446,7 +448,7 @@ void zStr::setText(const char *ikey, const char *buf, long len) { len = (len < 0) ? strlen(buf) : len; stdstr(&key, ikey, 3); - toupperstr_utf8(key, strlen(key)*3); + if (!caseSensitive) toupperstr_utf8(key, strlen(key)*3); char notFound = findKeyIndex(ikey, &idxoff, 0); if (!notFound) { @@ -583,7 +585,10 @@ void zStr::linkEntry(const char *destkey, const char *srckey) { } -void zStr::flushCache() { +void zStr::flushCache() const { + + static const char nl[] = {13, 10}; + if (cacheBlock) { if (cacheDirty) { __u32 start = 0; diff --git a/src/modules/common/zverse.cpp b/src/modules/common/zverse.cpp index fa76467..c280d98 100644 --- a/src/modules/common/zverse.cpp +++ b/src/modules/common/zverse.cpp @@ -1,11 +1,13 @@ /****************************************************************************** - * zverse.h - code for class 'zVerse'- a module that reads raw text + * + * zverse.cpp - code for class 'zVerse'- a module that reads raw text * files: ot and nt using indexs ??.bks ??.cps ??.vss * and provides lookup and parsing functions based on * class VerseKey for compressed modules * + * $Id: zverse.cpp 2833 2013-06-29 06:40:28Z chrislit $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 1996-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -155,7 +157,7 @@ void zVerse::findOffset(char testmt, long idxoff, long *start, unsigned short *s // set size to // set *start = *size = *buffnum = 0; - //printf ("Finding offset %ld\n", idxoff); + //fprintf(stderr, "Finding offset %ld\n", idxoff); idxoff *= 10; if (!testmt) { testmt = ((idxfp[0]) ? 1:2); @@ -168,7 +170,7 @@ void zVerse::findOffset(char testmt, long idxoff, long *start, unsigned short *s long newOffset = compfp[testmt-1]->seek(idxoff, SEEK_SET); if (newOffset == idxoff) { if (compfp[testmt-1]->read(&ulBuffNum, 4) != 4) { - printf ("Error reading ulBuffNum\n"); + fprintf(stderr, "Error reading ulBuffNum\n"); return; } } @@ -176,12 +178,12 @@ void zVerse::findOffset(char testmt, long idxoff, long *start, unsigned short *s if (compfp[testmt-1]->read(&ulVerseStart, 4) < 2) { - printf ("Error reading ulVerseStart\n"); + fprintf(stderr, "Error reading ulVerseStart\n"); return; } if (compfp[testmt-1]->read(&usVerseSize, 2) < 2) { - printf ("Error reading usVerseSize\n"); + fprintf(stderr, "Error reading usVerseSize\n"); return; } @@ -202,7 +204,7 @@ void zVerse::findOffset(char testmt, long idxoff, long *start, unsigned short *s * */ -void zVerse::zReadText(char testmt, long start, unsigned short size, unsigned long ulBuffNum, SWBuf &inBuf) { +void zVerse::zReadText(char testmt, long start, unsigned short size, unsigned long ulBuffNum, SWBuf &inBuf) const { __u32 ulCompOffset = 0; // compressed buffer start __u32 ulCompSize = 0; // buffer size compressed __u32 ulUnCompSize = 0; // buffer size uncompressed @@ -217,26 +219,26 @@ void zVerse::zReadText(char testmt, long start, unsigned short size, unsigned lo if (size && !(((long) ulBuffNum == cacheBufIdx) && (testmt == cacheTestament) && (cacheBuf))) { - //printf ("Got buffer number{%ld} versestart{%ld} versesize{%d}\n", ulBuffNum, ulVerseStart, usVerseSize); + //fprintf(stderr, "Got buffer number{%ld} versestart{%ld} versesize{%d}\n", ulBuffNum, ulVerseStart, usVerseSize); if (idxfp[testmt-1]->seek(ulBuffNum*12, SEEK_SET)!=(long) ulBuffNum*12) { - printf ("Error seeking compressed file index\n"); + fprintf(stderr, "Error seeking compressed file index\n"); return; } if (idxfp[testmt-1]->read(&ulCompOffset, 4)<4) { - printf ("Error reading ulCompOffset\n"); + fprintf(stderr, "Error reading ulCompOffset\n"); return; } if (idxfp[testmt-1]->read(&ulCompSize, 4)<4) { - printf ("Error reading ulCompSize\n"); + fprintf(stderr, "Error reading ulCompSize\n"); return; } if (idxfp[testmt-1]->read(&ulUnCompSize, 4)<4) { - printf ("Error reading ulUnCompSize\n"); + fprintf(stderr, "Error reading ulUnCompSize\n"); return; } @@ -246,14 +248,14 @@ void zVerse::zReadText(char testmt, long start, unsigned short size, unsigned lo if (textfp[testmt-1]->seek(ulCompOffset, SEEK_SET)!=(long)ulCompOffset) { - printf ("Error: could not seek to right place in compressed text\n"); + fprintf(stderr, "Error: could not seek to right place in compressed text\n"); return; } SWBuf pcCompText; pcCompText.setSize(ulCompSize+5); if (textfp[testmt-1]->read(pcCompText.getRawData(), ulCompSize)<(long)ulCompSize) { - printf ("Error reading compressed text\n"); + fprintf(stderr, "Error reading compressed text\n"); return; } pcCompText.setSize(ulCompSize); @@ -335,7 +337,7 @@ void zVerse::doSetText(char testmt, long idxoff, const char *buf, long len) { } -void zVerse::flushCache() { +void zVerse::flushCache() const { if (dirtyCache) { __u32 idxoff; __u32 start, outstart; @@ -469,15 +471,15 @@ char zVerse::createModule(const char *ipath, int blockBound, const char *v11n) VerseKey vk; vk.setVersificationSystem(v11n); - vk.Headings(1); + vk.setIntros(true); __s32 offset = 0; __s16 size = 0; offset = archtosword32(offset); size = archtosword16(size); - for (vk = TOP; !vk.Error(); vk++) { - if (vk.Testament() < 2) { + for (vk = TOP; !vk.popError(); vk++) { + if (vk.getTestament() < 2) { fd->write(&offset, 4); //compBufIdxOffset fd->write(&offset, 4); fd->write(&size, 2); -- cgit v1.2.3