diff options
author | Martin Ebourne <martin@ebourne.me.uk> | 2005-12-07 16:24:49 +0000 |
---|---|---|
committer | Martin Ebourne <martin@ebourne.me.uk> | 2005-12-07 16:24:49 +0000 |
commit | 065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd (patch) | |
tree | ec09f9aff78d337b64ef4af7c7cba2d23a4ef0e9 /lib/backupclient/BackupStoreFileDiff.cpp | |
parent | 9950474440f85c5a35be8cfe7a85cee884209f20 (diff) |
Merged chromi/diffopt at r116 to trunk
Diffstat (limited to 'lib/backupclient/BackupStoreFileDiff.cpp')
-rwxr-xr-x | lib/backupclient/BackupStoreFileDiff.cpp | 128 |
1 files changed, 103 insertions, 25 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp index b72ce328..22395151 100755 --- a/lib/backupclient/BackupStoreFileDiff.cpp +++ b/lib/backupclient/BackupStoreFileDiff.cpp @@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, +BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); // Avoid running on too long with diffs -static int sMaximumDiffTime = 10; // maximum time to spend diffing +static int sMaximumDiffTime = 600; // maximum time to spend diffing static bool sDiffTimedOut = false; static bool sSetTimerSignelHandler = false; static void TimerSignalHandler(int signal); @@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i // Find the position of the size in the array for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) { - if(i->second > sizeCounts[t]) + // Instead of sorting on the raw count of blocks, + // take the file area covered by this block size. + if(i->second * i->first > sizeCounts[t] * Sizes[t]) { // Then this size belong before this entry -- shuffle them up for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) @@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) { + std::map<int64_t, int32_t> goodnessOfFit; + // Allocate the hash lookup table BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); @@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> // Check buffer allocation if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) { - // If a buffer got alloocated, it will be cleaned up in the catch block + // If a buffer got allocated, it will be cleaned up in the catch block throw std::bad_alloc(); } @@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> // Then roll, until the file is exhausted int64_t fileBlockNumber = 0; + int64_t fileOffset = 0; int rollOverInitialBytes = 0; while(true) { // Load in another block of data, and record how big it is int bytesInEndings = rFile.Read(endings, Sizes[s]); + int tmp; // Skip any bytes from a previous matched block - while(rollOverInitialBytes > 0 && offset < bytesInEndings) + if(rollOverInitialBytes > 0 && offset < bytesInEndings) { - rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - ++offset; - --rollOverInitialBytes; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + if(goodnessOfFit.count(fileOffset)) + { + tmp = goodnessOfFit[fileOffset]; + } + else + { + tmp = 0; + } + + if(tmp >= Sizes[s]) + { + // Skip over bigger ready-matched blocks completely + rollOverInitialBytes = tmp; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } } while(offset < bytesInEndings) { - // Put is current checksum in hash list? + // Is current checksum in hash list? uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0) + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { - if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) { + goodnessOfFit[fileOffset] = Sizes[s]; + // Block matched, roll the checksum forward to the next block without doing // any more comparisons, because these are pointless (as any more matches will be ignored when // the receipe is generated) and just take up valuable processor time. Edge cases are // especially nasty, using huge amounts of time and memory. int skip = Sizes[s]; - while(offset < bytesInEndings && skip > 0) + if(offset < bytesInEndings && skip > 0) { - rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - ++offset; - --skip; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + skip -= thisRoll; } // Not all the bytes necessary will have been skipped, so get them // skipped after the next block is loaded. @@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> break; } - if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) - || sDiffTimedOut) + if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut) { abortSearch = true; break; @@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> // Roll checksum forward rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - // Increment offset + // Increment offsets ++offset; + ++fileOffset; } if(abortSearch) break; + refresh: // Finished? if(bytesInEndings != Sizes[s]) { @@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> // (Do a copy and paste of 5 lines of code instead of introducing a comparison for // each byte of the file) uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0) + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { - SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks); + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + goodnessOfFit[fileOffset] = Sizes[s]; + } } // finish @@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3 // Created: 14/1/04 // // -------------------------------------------------------------------------- -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) { // Check parameters @@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha ASSERT(pFirstInHashList != 0); ASSERT(pIndex != 0); + uint16_t Hash = fastSum.GetComponentForHashing(); + uint32_t Checksum = fastSum.GetChecksum(); + + // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. + BlocksAvailableEntry *scan = pFirstInHashList; + bool found=false; + while(scan != 0) + { + if(scan->mWeakChecksum == Checksum) + { + found = true; + break; + } + scan = scan->mpNextInHashList; + } + if(!found) + { + return false; + } + // Calculate the strong MD5 digest for this block MD5Digest strong; // Add the data from the beginnings @@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha strong.Finish(); // Then go through the entries in the hash list, comparing with the strong digest calculated - BlocksAvailableEntry *scan = pFirstInHashList; + scan = pFirstInHashList; //TRACE0("second stage match\n"); while(scan != 0) { @@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha //TRACE0("Match!\n"); // Found! Add to list of found blocks... int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; - int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory + int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory - // Because we search for smallest blocks first, if there's an entry there - // we'll just be replacing it with a larger one, which is great news. + // We do NOT search for smallest blocks first, as this code originally assumed. + // To prevent this from potentially overwriting a better match, the caller must determine + // the relative "goodness" of any existing match and this one, and avoid the call if it + // could be detrimental. rFoundBlocks[fileOffset] = blockIndex; // No point in searching further, report success |