From 065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd Mon Sep 17 00:00:00 2001 From: Martin Ebourne Date: Wed, 7 Dec 2005 16:24:49 +0000 Subject: Merged chromi/diffopt at r116 to trunk --- lib/backupclient/BackupStoreConstants.h | 4 +- lib/backupclient/BackupStoreFileDiff.cpp | 128 +++++++++++++++++++++++++------ lib/crypto/RollingChecksum.cpp | 28 ++++++- lib/crypto/RollingChecksum.h | 25 ++++-- 4 files changed, 149 insertions(+), 36 deletions(-) (limited to 'lib') diff --git a/lib/backupclient/BackupStoreConstants.h b/lib/backupclient/BackupStoreConstants.h index 8254e918..907072ea 100755 --- a/lib/backupclient/BackupStoreConstants.h +++ b/lib/backupclient/BackupStoreConstants.h @@ -28,10 +28,10 @@ #define BACKUP_FILE_AVOID_BLOCKS_LESS_THAN 128 // Maximum number of sizes to do an rsync-like scan for -#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 8 +#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 64 // When doing rsync scans, do not scan for blocks smaller than -#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 256 +#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 128 // A limit to stop diffing running out of control: If more than this // times the number of blocks in the original index are found, stop diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp index b72ce328..22395151 100755 --- a/lib/backupclient/BackupStoreFileDiff.cpp +++ b/lib/backupclient/BackupStoreFileDiff.cpp @@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SearchForMatchingBlocks(IOStream &rFile, std::map &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map &rFoundBlocks); +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, +BlocksAvailableEntry *pIndex, std::map &rFoundBlocks); static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map &rFoundBlocks, int64_t SizeOfInputFile); // Avoid running on too long with diffs -static int sMaximumDiffTime = 10; // maximum time to spend diffing +static int sMaximumDiffTime = 600; // maximum time to spend diffing static bool sDiffTimedOut = false; static bool sSetTimerSignelHandler = false; static void TimerSignalHandler(int signal); @@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i // Find the position of the size in the array for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) { - if(i->second > sizeCounts[t]) + // Instead of sorting on the raw count of blocks, + // take the file area covered by this block size. + if(i->second * i->first > sizeCounts[t] * Sizes[t]) { // Then this size belong before this entry -- shuffle them up for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) @@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i static void SearchForMatchingBlocks(IOStream &rFile, std::map &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) { + std::map goodnessOfFit; + // Allocate the hash lookup table BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); @@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map // Check buffer allocation if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) { - // If a buffer got alloocated, it will be cleaned up in the catch block + // If a buffer got allocated, it will be cleaned up in the catch block throw std::bad_alloc(); } @@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map // Then roll, until the file is exhausted int64_t fileBlockNumber = 0; + int64_t fileOffset = 0; int rollOverInitialBytes = 0; while(true) { // Load in another block of data, and record how big it is int bytesInEndings = rFile.Read(endings, Sizes[s]); + int tmp; // Skip any bytes from a previous matched block - while(rollOverInitialBytes > 0 && offset < bytesInEndings) + if(rollOverInitialBytes > 0 && offset < bytesInEndings) { - rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - ++offset; - --rollOverInitialBytes; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + if(goodnessOfFit.count(fileOffset)) + { + tmp = goodnessOfFit[fileOffset]; + } + else + { + tmp = 0; + } + + if(tmp >= Sizes[s]) + { + // Skip over bigger ready-matched blocks completely + rollOverInitialBytes = tmp; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } } while(offset < bytesInEndings) { - // Put is current checksum in hash list? + // Is current checksum in hash list? uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0) + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { - if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) { + goodnessOfFit[fileOffset] = Sizes[s]; + // Block matched, roll the checksum forward to the next block without doing // any more comparisons, because these are pointless (as any more matches will be ignored when // the receipe is generated) and just take up valuable processor time. Edge cases are // especially nasty, using huge amounts of time and memory. int skip = Sizes[s]; - while(offset < bytesInEndings && skip > 0) + if(offset < bytesInEndings && skip > 0) { - rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - ++offset; - --skip; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + skip -= thisRoll; } // Not all the bytes necessary will have been skipped, so get them // skipped after the next block is loaded. @@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map break; } - if(static_cast(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) - || sDiffTimedOut) + if(static_cast(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut) { abortSearch = true; break; @@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map // Roll checksum forward rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - // Increment offset + // Increment offsets ++offset; + ++fileOffset; } if(abortSearch) break; + refresh: // Finished? if(bytesInEndings != Sizes[s]) { @@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map // (Do a copy and paste of 5 lines of code instead of introducing a comparison for // each byte of the file) uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0) + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { - SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks); + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + goodnessOfFit[fileOffset] = Sizes[s]; + } } // finish @@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3 // Created: 14/1/04 // // -------------------------------------------------------------------------- -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map &rFoundBlocks) { // Check parameters @@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha ASSERT(pFirstInHashList != 0); ASSERT(pIndex != 0); + uint16_t Hash = fastSum.GetComponentForHashing(); + uint32_t Checksum = fastSum.GetChecksum(); + + // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. + BlocksAvailableEntry *scan = pFirstInHashList; + bool found=false; + while(scan != 0) + { + if(scan->mWeakChecksum == Checksum) + { + found = true; + break; + } + scan = scan->mpNextInHashList; + } + if(!found) + { + return false; + } + // Calculate the strong MD5 digest for this block MD5Digest strong; // Add the data from the beginnings @@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha strong.Finish(); // Then go through the entries in the hash list, comparing with the strong digest calculated - BlocksAvailableEntry *scan = pFirstInHashList; + scan = pFirstInHashList; //TRACE0("second stage match\n"); while(scan != 0) { @@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha //TRACE0("Match!\n"); // Found! Add to list of found blocks... int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; - int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory + int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory - // Because we search for smallest blocks first, if there's an entry there - // we'll just be replacing it with a larger one, which is great news. + // We do NOT search for smallest blocks first, as this code originally assumed. + // To prevent this from potentially overwriting a better match, the caller must determine + // the relative "goodness" of any existing match and this one, and avoid the call if it + // could be detrimental. rFoundBlocks[fileOffset] = blockIndex; // No point in searching further, report success diff --git a/lib/crypto/RollingChecksum.cpp b/lib/crypto/RollingChecksum.cpp index 75bad7df..a2954e3a 100755 --- a/lib/crypto/RollingChecksum.cpp +++ b/lib/crypto/RollingChecksum.cpp @@ -20,11 +20,11 @@ // Created: 6/12/03 // // -------------------------------------------------------------------------- -RollingChecksum::RollingChecksum(const void *data, unsigned int Length) +RollingChecksum::RollingChecksum(const void * const data, const unsigned int Length) : a(0), b(0) { - uint8_t *block = (uint8_t *)data; + const uint8_t *block = (const uint8_t *)data; for(unsigned int x = Length; x >= 1; --x) { a += (*block); @@ -34,5 +34,29 @@ RollingChecksum::RollingChecksum(const void *data, unsigned int Length) } } +// -------------------------------------------------------------------------- +// +// Function +// Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int) +// Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block, +// and a pointer just after the last byte of the current block and the length of the block and of the skip. +// Created: 7/14/05 +// +// -------------------------------------------------------------------------- +void RollingChecksum::RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip) +{ + // IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely. + unsigned int i; + uint16_t sumBegin=0, j,k; + for(i=0; i < Skip; i++) + { + j = StartOfThisBlock[i]; + k = LastOfNextBlock[i]; + sumBegin += j; + a += (k - j); + b += a; + } + b -= Length * sumBegin; +} diff --git a/lib/crypto/RollingChecksum.h b/lib/crypto/RollingChecksum.h index 99d116b9..be79c36f 100755 --- a/lib/crypto/RollingChecksum.h +++ b/lib/crypto/RollingChecksum.h @@ -24,7 +24,7 @@ class RollingChecksum { public: - RollingChecksum(const void *data, unsigned int Length); + RollingChecksum(const void * const data, const unsigned int Length); // -------------------------------------------------------------------------- // @@ -35,7 +35,7 @@ public: // Created: 6/12/03 // // -------------------------------------------------------------------------- - inline void RollForward(uint8_t StartOfThisBlock, uint8_t LastOfNextBlock, unsigned int Length) + inline void RollForward(const uint8_t StartOfThisBlock, const uint8_t LastOfNextBlock, const unsigned int Length) { // IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely. a -= StartOfThisBlock; @@ -44,6 +44,17 @@ public: b += a; } + // -------------------------------------------------------------------------- + // + // Function + // Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int) + // Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block, + // and a pointer just after the last byte of the current block and the length of the block and of the skip. + // Created: 7/14/05 + // + // -------------------------------------------------------------------------- + void RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip); + // -------------------------------------------------------------------------- // // Function @@ -52,14 +63,14 @@ public: // Created: 6/12/03 // // -------------------------------------------------------------------------- - inline uint32_t GetChecksum() + inline uint32_t GetChecksum() const { return ((uint32_t)a) | (((uint32_t)b) << 16); } // Components, just in case they're handy - inline uint16_t GetComponent1() {return a;} - inline uint16_t GetComponent2() {return b;} + inline uint16_t GetComponent1() const {return a;} + inline uint16_t GetComponent2() const {return b;} // -------------------------------------------------------------------------- // @@ -69,7 +80,7 @@ public: // Created: 6/12/03 // // -------------------------------------------------------------------------- - inline uint16_t GetComponentForHashing() + inline uint16_t GetComponentForHashing() const { return b; } @@ -82,7 +93,7 @@ public: // Created: 14/1/04 // // -------------------------------------------------------------------------- - static inline uint16_t ExtractHashingComponent(uint32_t Checksum) + static inline uint16_t ExtractHashingComponent(const uint32_t Checksum) { return Checksum >> 16; } -- cgit v1.2.3