diff options
author | Reinhard Tartler <siretart@tauware.de> | 2017-06-14 19:53:34 -0400 |
---|---|---|
committer | Reinhard Tartler <siretart@tauware.de> | 2017-06-14 19:55:14 -0400 |
commit | e0c122119afea4c951c0c57144d26a473118c254 (patch) | |
tree | 34a02a56f9b017201dfb721ef678c711351466d6 /lib/backupclient/BackupStoreFileDiff.cpp | |
parent | e0eb815b67734abd09ff41e2271630d4b2a6d760 (diff) |
Fixup botched merge
Diffstat (limited to 'lib/backupclient/BackupStoreFileDiff.cpp')
-rw-r--r-- | lib/backupclient/BackupStoreFileDiff.cpp | 1046 |
1 files changed, 0 insertions, 1046 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp deleted file mode 100644 index 5705c3aa..00000000 --- a/lib/backupclient/BackupStoreFileDiff.cpp +++ /dev/null @@ -1,1046 +0,0 @@ -// -------------------------------------------------------------------------- -// -// File -// Name: BackupStoreFileDiff.cpp -// Purpose: Functions relating to diffing BackupStoreFiles -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- - -#include "Box.h" - -#include <string.h> - -#include <new> -#include <map> - -#ifdef HAVE_TIME_H - #include <time.h> -#elif HAVE_SYS_TIME_H - #include <sys/time.h> -#endif - -#include "BackupStoreConstants.h" -#include "BackupStoreException.h" -#include "BackupStoreFile.h" -#include "BackupStoreFileCryptVar.h" -#include "BackupStoreFileEncodeStream.h" -#include "BackupStoreFileWire.h" -#include "BackupStoreObjectMagic.h" -#include "CommonException.h" -#include "FileStream.h" -#include "MD5Digest.h" -#include "RollingChecksum.h" -#include "Timer.h" - -#include "MemLeakFindOn.h" - -#include <cstring> - -using namespace BackupStoreFileCryptVar; -using namespace BackupStoreFileCreation; - -// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. -// But it's useful for the test. -#ifndef BOX_RELEASE_BUILD - bool BackupStoreFile::TraceDetailsOfDiffProcess = false; -#endif - -static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); -static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); -static void SearchForMatchingBlocks(IOStream &rFile, - std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, - int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], - DiffTimer *pDiffTimer); -static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, -BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); -static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); - -// -------------------------------------------------------------------------- -// -// Function -// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) -// Purpose: Move the file pointer in this stream to just before the block index. -// Assumes that the stream is at the beginning, seekable, and -// reading from the stream is OK. -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- -void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) -{ - // Size of file - int64_t fileSize = rStream.BytesLeftToRead(); - - // Get header - file_StreamFormat hdr; - - // Read the header - if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) - { - // Couldn't read header - THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) - } - - // Check magic number - if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 -#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE - && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 -#endif - ) - { - THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) - } - - // Work out where the index is - int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); - int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); - - // Sanity check - if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat))) - { - THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) - } - - // Seek to that position - rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); - - // Done. Stream now in right position (as long as the file is formatted correctly) -} - - -// -------------------------------------------------------------------------- -// -// Function -// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) -// Purpose: Similar to EncodeFile, but takes the object ID of the file it's -// diffing from, and the index of the blocks in a stream. It'll then -// calculate which blocks can be reused from that old file. -// The timeout is the timeout value for reading the diff block index. -// If pIsCompletelyDifferent != 0, it will be set to true if the -// the two files are completely different (do not share any block), false otherwise. -// -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- -std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff -( - const char *Filename, int64_t ContainerID, - const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, - IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, - int64_t *pModificationTime, bool *pIsCompletelyDifferent) -{ - // Is it a symlink? - { - EMU_STRUCT_STAT st; - if(EMU_LSTAT(Filename, &st) != 0) - { - THROW_EXCEPTION(CommonException, OSFileError) - } - if((st.st_mode & S_IFLNK) == S_IFLNK) - { - // Don't do diffs for symlinks - if(pIsCompletelyDifferent != 0) - { - *pIsCompletelyDifferent = true; - } - return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); - } - } - - // Load in the blocks - BlocksAvailableEntry *pindex = 0; - int64_t blocksInIndex = 0; - bool canDiffFromThis = false; - LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); - // BOX_TRACE("Diff: Blocks in index: " << blocksInIndex); - - if(!canDiffFromThis) - { - // Don't do diffing... - if(pIsCompletelyDifferent != 0) - { - *pIsCompletelyDifferent = true; - } - return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); - } - - // Pointer to recipe we're going to create - BackupStoreFileEncodeStream::Recipe *precipe = 0; - - try - { - // Find which sizes should be scanned - int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; - FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); - - // Flag for reporting to the user - bool completelyDifferent; - - // BLOCK - { - // Search the file to find matching blocks - std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index - int64_t sizeOfInputFile = 0; - // BLOCK - { - FileStream file(Filename); - // Get size of file - sizeOfInputFile = file.BytesLeftToRead(); - // Find all those lovely matching blocks - SearchForMatchingBlocks(file, foundBlocks, pindex, - blocksInIndex, sizesToScan, pDiffTimer); - - // Is it completely different? - completelyDifferent = (foundBlocks.size() == 0); - } - - // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. - precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); - BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions - pindex = 0; // Recipe now has ownership - - // Fill it in - GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); - } - // foundBlocks no longer required - - // Create the stream - std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream); - - // Do the initial setup - ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); - precipe = 0; // Stream has taken ownership of this - - // Tell user about completely different status? - if(pIsCompletelyDifferent != 0) - { - *pIsCompletelyDifferent = completelyDifferent; - } - - // Return the stream for the caller - return stream; - } - catch(...) - { - // cleanup - if(pindex != 0) - { - ::free(pindex); - pindex = 0; - } - if(precipe != 0) - { - delete precipe; - precipe = 0; - } - throw; - } -} - -// -------------------------------------------------------------------------- -// -// Function -// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) -// Purpose: Read in an index, and decrypt, and store in the in memory block format. -// rCanDiffFromThis is set to false if the version of the from file is too old. -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- -static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) -{ - // Reset - rNumBlocksOut = 0; - rCanDiffFromThis = false; - - // Read header - file_BlockIndexHeader hdr; - if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) - { - // Couldn't read header - THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) - } - -#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE - // Check against backwards comptaibility stuff - if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) - { - // Won't diff against old version - - // Absorb rest of stream - char buffer[2048]; - while(rBlockIndex.StreamDataLeft()) - { - rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); - } - - // Tell caller - rCanDiffFromThis = false; - return; - } -#endif - - // Check magic - if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) - { - THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) - } - - // Check that we're not trying to diff against a file which references blocks from another file - if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0) - { - THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) - } - - // Mark as an acceptable diff. - rCanDiffFromThis = true; - - // Get basic information - int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); - uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase); - - //TODO: Verify that these sizes look reasonable - - // Allocate space for the index - BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); - if(pindex == 0) - { - throw std::bad_alloc(); - } - - try - { - for(int64_t b = 0; b < numBlocks; ++b) - { - // Read an entry from the stream - file_BlockIndexEntry entry; - if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) - { - // Couldn't read entry - THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) - } - - // Calculate IV for this entry - uint64_t iv = entryIVBase; - iv += b; - // Network byte order - iv = box_hton64(iv); - sBlowfishDecryptBlockEntry.SetIV(&iv); - - // Decrypt the encrypted section - file_BlockIndexEntryEnc entryEnc; - int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), - entry.mEnEnc, sizeof(entry.mEnEnc)); - if(sectionSize != sizeof(entryEnc)) - { - THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) - } - - // Check that we're not trying to diff against a file which references blocks from another file - if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0) - { - THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) - } - - // Store all the required information - pindex[b].mpNextInHashList = 0; // hash list not set up yet - pindex[b].mSize = ntohl(entryEnc.mSize); - pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); - ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); - } - - // Store index pointer for called - ASSERT(ppIndex != 0); - *ppIndex = pindex; - - // Store number of blocks for caller - rNumBlocksOut = numBlocks; - - } - catch(...) - { - // clean up and send the exception along its way - ::free(pindex); - throw; - } -} - - -// -------------------------------------------------------------------------- -// -// Function -// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) -// Purpose: Finds the most commonly used block sizes in the index -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- -static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) -{ - // Array for lengths - int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; - - // Set arrays to lots of zeros (= unused entries) - for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) - { - Sizes[l] = 0; - sizeCounts[l] = 0; - } - - // Array for collecting sizes - std::map<int32_t, int64_t> foundSizes; - - // Run through blocks and make a count of the entries - for(int64_t b = 0; b < NumBlocks; ++b) - { - // Only if the block size is bigger than the minimum size we'll scan for - if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) - { - // Find entry? - std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); - if(f != foundSizes.end()) - { - // Increment existing entry - foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; - } - else - { - // New entry - foundSizes[pIndex[b].mSize] = 1; - } - } - } - - // Make the block sizes - for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) - { - // Find the position of the size in the array - for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) - { - // Instead of sorting on the raw count of blocks, - // take the file area covered by this block size. - if(i->second * i->first > sizeCounts[t] * Sizes[t]) - { - // Then this size belong before this entry -- shuffle them up - for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) - { - Sizes[s] = Sizes[s-1]; - sizeCounts[s] = sizeCounts[s-1]; - } - - // Insert this size - Sizes[t] = i->first; - sizeCounts[t] = i->second; - - // Shouldn't do any more searching - break; - } - } - } - - // trace the size table in debug builds -#ifndef BOX_RELEASE_BUILD - if(BackupStoreFile::TraceDetailsOfDiffProcess) - { - for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) - { - BOX_TRACE("Diff block size " << t << ": " << - Sizes[t] << " (count = " << - sizeCounts[t] << ")"); - } - } -#endif -} - - - -// -------------------------------------------------------------------------- -// -// Function -// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) -// Purpose: Find the matching blocks within the file. -// Created: 12/1/04 -// -// -------------------------------------------------------------------------- -static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, - BlocksAvailableEntry *pIndex, int64_t NumBlocks, - int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) -{ - Timer maximumDiffingTime(0, "MaximumDiffingTime"); - - if(pDiffTimer && pDiffTimer->IsManaged()) - { - maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), - "MaximumDiffingTime"); - } - - std::map<int64_t, int32_t> goodnessOfFit; - - // Allocate the hash lookup table - BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); - - // Choose a size for the buffer, just a little bit more than the maximum block size - int32_t bufSize = Sizes[0]; - for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) - { - if(Sizes[z] > bufSize) bufSize = Sizes[z]; - } - bufSize += 4; - ASSERT(bufSize > Sizes[0]); - ASSERT(bufSize > 0); - if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) - { - THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) - } - - // TODO: Because we read in the file a scanned block size at a time, - // it is likely to be inefficient. Probably will be much better to - // calculate checksums for all block sizes in a single pass. - - // Allocate the buffers. - uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); - uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); - try - { - // Check buffer allocation - if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) - { - // If a buffer got allocated, it will be cleaned up in the catch block - throw std::bad_alloc(); - } - - // Flag to abort the run, if too many blocks are found -- avoid using - // huge amounts of processor time when files contain many similar blocks. - bool abortSearch = false; - - // Search for each block size in turn - // NOTE: Do the smallest size first, so that the scheme for adding - // entries in the found list works as expected and replaces smallers block - // with larger blocks when it finds matches at the same offset in the file. - for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) - { - ASSERT(Sizes[s] <= bufSize); - BOX_TRACE("Diff pass " << s << ", for block size " << - Sizes[s]); - - // Check we haven't finished - if(Sizes[s] == 0) - { - // empty entry, try next size - continue; - } - - // Set up the hash table entries - SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); - - // Shift file position to beginning - rFile.Seek(0, IOStream::SeekType_Absolute); - - // Read first block - if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) - { - // Size of file too short to match -- do next size - continue; - } - - // Setup block pointers - uint8_t *beginnings = pbuffer0; - uint8_t *endings = pbuffer1; - int offset = 0; - - // Calculate the first checksum, ready for rolling - RollingChecksum rolling(beginnings, Sizes[s]); - - // Then roll, until the file is exhausted - int64_t fileBlockNumber = 0; - int64_t fileOffset = 0; - int rollOverInitialBytes = 0; - while(true) - { - if(maximumDiffingTime.HasExpired()) - { - ASSERT(pDiffTimer != NULL); - BOX_INFO("MaximumDiffingTime reached - " - "suspending file diff"); - abortSearch = true; - break; - } - - if(pDiffTimer) - { - pDiffTimer->DoKeepAlive(); - } - - // Load in another block of data, and record how big it is - int bytesInEndings = rFile.Read(endings, Sizes[s]); - int tmp; - - // Skip any bytes from a previous matched block - if(rollOverInitialBytes > 0 && offset < bytesInEndings) - { - int spaceLeft = bytesInEndings - offset; - int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; - - rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); - - offset += thisRoll; - fileOffset += thisRoll; - rollOverInitialBytes -= thisRoll; - - if(rollOverInitialBytes) - { - goto refresh; - } - } - - if(goodnessOfFit.count(fileOffset)) - { - tmp = goodnessOfFit[fileOffset]; - } - else - { - tmp = 0; - } - - if(tmp >= Sizes[s]) - { - // Skip over bigger ready-matched blocks completely - rollOverInitialBytes = tmp; - int spaceLeft = bytesInEndings - offset; - int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; - - rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); - - offset += thisRoll; - fileOffset += thisRoll; - rollOverInitialBytes -= thisRoll; - - if(rollOverInitialBytes) - { - goto refresh; - } - } - - while(offset < bytesInEndings) - { - // Is current checksum in hash list? - uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) - { - if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) - { - BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); - goodnessOfFit[fileOffset] = Sizes[s]; - - // Block matched, roll the checksum forward to the next block without doing - // any more comparisons, because these are pointless (as any more matches will be ignored when - // the recipe is generated) and just take up valuable processor time. Edge cases are - // especially nasty, using huge amounts of time and memory. - int skip = Sizes[s]; - if(offset < bytesInEndings && skip > 0) - { - int spaceLeft = bytesInEndings - offset; - int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; - - rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); - - offset += thisRoll; - fileOffset += thisRoll; - skip -= thisRoll; - } - // Not all the bytes necessary will have been skipped, so get them - // skipped after the next block is loaded. - rollOverInitialBytes = skip; - - // End this loop, so the final byte isn't used again - break; - } - else - { - BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); - } - - int64_t NumBlocksFound = static_cast<int64_t>( - rFoundBlocks.size()); - int64_t MaxBlocksFound = NumBlocks * - BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; - - if(NumBlocksFound > MaxBlocksFound) - { - abortSearch = true; - break; - } - } - - // Roll checksum forward - rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); - - // Increment offsets - ++offset; - ++fileOffset; - } - - if(abortSearch) break; - - refresh: - // Finished? - if(bytesInEndings != Sizes[s]) - { - // No more data in file -- check the final block - // (Do a copy and paste of 5 lines of code instead of introducing a comparison for - // each byte of the file) - uint16_t hash = rolling.GetComponentForHashing(); - if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) - { - if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) - { - goodnessOfFit[fileOffset] = Sizes[s]; - } - } - - // finish - break; - } - - // Switch buffers, reset offset - beginnings = endings; - endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer - offset = 0; - - // And count the blocks which have been done - ++fileBlockNumber; - } - - if(abortSearch) break; - } - - // Free buffers and hash table - ::free(pbuffer1); - pbuffer1 = 0; - ::free(pbuffer0); - pbuffer0 = 0; - ::free(phashTable); - phashTable = 0; - } - catch(...) - { - // Cleanup and throw - if(pbuffer1 != 0) ::free(pbuffer1); - if(pbuffer0 != 0) ::free(pbuffer0); - if(phashTable != 0) ::free(phashTable); - throw; - } - -#ifndef BOX_RELEASE_BUILD - if(BackupStoreFile::TraceDetailsOfDiffProcess) - { - // Trace out the found blocks in debug mode - BOX_TRACE("Diff: list of found blocks"); - BOX_TRACE("======== ======== ======== ========"); - BOX_TRACE(" Offset BlkIdx Size Movement"); - for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) - { - int64_t orgLoc = 0; - for(int64_t b = 0; b < i->second; ++b) - { - orgLoc += pIndex[b].mSize; - } - BOX_TRACE(std::setw(8) << i->first << " " << - std::setw(8) << i->second << " " << - std::setw(8) << pIndex[i->second].mSize << - " " << - std::setw(8) << (i->first - orgLoc)); - } - BOX_TRACE("======== ======== ======== ========"); - } -#endif -} - - -// -------------------------------------------------------------------------- -// -// Function -// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) -// Purpose: Set up the hash table ready for a scan -// Created: 14/1/04 -// -// -------------------------------------------------------------------------- -static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) -{ - // Set all entries in the hash table to zero - ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); - - // Scan through the blocks, building the hash table - for(int64_t b = 0; b < NumBlocks; ++b) - { - // Only look at the required block size - if(pIndex[b].mSize == BlockSize) - { - // Get the value under which to hash this entry - uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); - - // Already present in table? - if(pHashTable[hash] != 0) - { - //BOX_TRACE("Another hash entry for " << hash << " found"); - // Yes -- need to set the pointer in this entry to the current entry to build the linked list - pIndex[b].mpNextInHashList = pHashTable[hash]; - } - - // Put a pointer to this entry in the hash table - pHashTable[hash] = pIndex + b; - } - } -} - - -// -------------------------------------------------------------------------- -// -// Function -// Name: static bool SecondStageMatch(xxx) -// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. -// Created: 14/1/04 -// -// -------------------------------------------------------------------------- -static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, - int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) -{ - // Check parameters - ASSERT(pBeginnings != 0); - ASSERT(pEndings != 0); - ASSERT(Offset >= 0); - ASSERT(BlockSize > 0); - ASSERT(pFirstInHashList != 0); - ASSERT(pIndex != 0); - -#ifndef BOX_RELEASE_BUILD - uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); -#endif - uint32_t Checksum = fastSum.GetChecksum(); - - // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. - BlocksAvailableEntry *scan = pFirstInHashList; - bool found=false; - while(scan != 0) - { - if(scan->mWeakChecksum == Checksum) - { - found = true; - break; - } - scan = scan->mpNextInHashList; - } - if(!found) - { - return false; - } - - // Calculate the strong MD5 digest for this block - MD5Digest strong; - // Add the data from the beginnings - strong.Add(pBeginnings + Offset, BlockSize - Offset); - // Add any data from the endings - if(Offset > 0) - { - strong.Add(pEndings, Offset); - } - strong.Finish(); - - // Then go through the entries in the hash list, comparing with the strong digest calculated - scan = pFirstInHashList; - //BOX_TRACE("second stage match"); - while(scan != 0) - { - //BOX_TRACE("scan size " << scan->mSize << - // ", block size " << BlockSize << - // ", hash " << Hash); - ASSERT(scan->mSize == BlockSize); - ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); - - // Compare? - if(strong.DigestMatches(scan->mStrongChecksum)) - { - //BOX_TRACE("Match!\n"); - // Found! Add to list of found blocks... - int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; - int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory - - // We do NOT search for smallest blocks first, as this code originally assumed. - // To prevent this from potentially overwriting a better match, the caller must determine - // the relative "goodness" of any existing match and this one, and avoid the call if it - // could be detrimental. - rFoundBlocks[fileOffset] = blockIndex; - - // No point in searching further, report success - return true; - } - - // Next - scan = scan->mpNextInHashList; - } - - // Not matched - return false; -} - - -// -------------------------------------------------------------------------- -// -// Function -// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &) -// Purpose: Fills in the recipe from the found block list -// Created: 15/1/04 -// -// -------------------------------------------------------------------------- -static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, - int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile) -{ - // NOTE: This function could be a lot more sophisiticated. For example, if - // a small block overlaps a big block like this - // **** - // ******************************* - // then the small block will be used, not the big one. But it'd be better to - // just ignore the small block and keep the big one. However, some stats should - // be gathered about real world files before writing complex code which might - // go wrong. - - // Initialise a blank instruction - BackupStoreFileEncodeStream::RecipeInstruction instruction; - #define RESET_INSTRUCTION \ - instruction.mSpaceBefore = 0; \ - instruction.mBlocks = 0; \ - instruction.mpStartBlock = 0; - RESET_INSTRUCTION - - // First, a special case for when there are no found blocks - if(rFoundBlocks.size() == 0) - { - // No blocks, just a load of space - instruction.mSpaceBefore = SizeOfInputFile; - rRecipe.push_back(instruction); - - #ifndef BOX_RELEASE_BUILD - if(BackupStoreFile::TraceDetailsOfDiffProcess) - { - BOX_TRACE("Diff: Default recipe generated, " << - SizeOfInputFile << " bytes of file"); - } - #endif - - // Don't do anything - return; - } - - // Current location - int64_t loc = 0; - - // Then iterate through the list, generating the recipe - std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); - ASSERT(i != rFoundBlocks.end()); // check logic - - // Counting for debug tracing -#ifndef BOX_RELEASE_BUILD - int64_t debug_NewBytesFound = 0; - int64_t debug_OldBlocksUsed = 0; -#endif - - for(; i != rFoundBlocks.end(); ++i) - { - // Remember... map is (position in file) -> (index of block in pIndex) - - if(i->first < loc) - { - // This block overlaps the last one - continue; - } - else if(i->first > loc) - { - // There's a gap between the end of the last thing and this block. - // If there's an instruction waiting, push it onto the list - if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) - { - rRecipe.push_back(instruction); - } - // Start a new instruction, with the gap ready - RESET_INSTRUCTION - instruction.mSpaceBefore = i->first - loc; - // Move location forward to match - loc += instruction.mSpaceBefore; -#ifndef BOX_RELEASE_BUILD - debug_NewBytesFound += instruction.mSpaceBefore; -#endif - } - - // First, does the current instruction need pushing back, because this block is not - // sequential to the last one? - if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) - { - rRecipe.push_back(instruction); - RESET_INSTRUCTION - } - - // Add in this block - if(instruction.mpStartBlock == 0) - { - // This block starts a new instruction - instruction.mpStartBlock = pIndex + i->second; - instruction.mBlocks = 1; - } - else - { - // It continues the previous section of blocks - instruction.mBlocks += 1; - } - -#ifndef BOX_RELEASE_BUILD - debug_OldBlocksUsed++; -#endif - - // Move location forward - loc += pIndex[i->second].mSize; - } - - // Push the last instruction generated - rRecipe.push_back(instruction); - - // Is there any space left at the end which needs sending? - if(loc != SizeOfInputFile) - { - RESET_INSTRUCTION - instruction.mSpaceBefore = SizeOfInputFile - loc; -#ifndef BOX_RELEASE_BUILD - debug_NewBytesFound += instruction.mSpaceBefore; -#endif - rRecipe.push_back(instruction); - } - - // dump out the recipe -#ifndef BOX_RELEASE_BUILD - BOX_TRACE("Diff: " << - debug_NewBytesFound << " new bytes found, " << - debug_OldBlocksUsed << " old blocks used"); - if(BackupStoreFile::TraceDetailsOfDiffProcess) - { - BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); - BOX_TRACE("======== ========= ========"); - BOX_TRACE("Space b4 FirstBlk NumBlks"); - { - for(unsigned int e = 0; e < rRecipe.size(); ++e) - { - char b[64]; -#ifdef WIN32 - sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); -#else - sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); -#endif - BOX_TRACE(std::setw(8) << - rRecipe[e].mSpaceBefore << - " " << - ((rRecipe[e].mpStartBlock == 0)?" -":b) << - " " << std::setw(8) << - rRecipe[e].mBlocks); - } - } - BOX_TRACE("======== ========= ========"); - } -#endif -} |