From 757294d769a7defe6a531a09fba9674cc6b388f7 Mon Sep 17 00:00:00 2001 From: Reinhard Tartler Date: Thu, 27 Oct 2011 19:53:48 -0400 Subject: Import boxbackup_0.11.1~r2837.orig.tar.gz [dgit import orig boxbackup_0.11.1~r2837.orig.tar.gz] --- lib/backupclient/BackupStoreFileDiff.cpp | 1046 ++++++++++++++++++++++++++++++ 1 file changed, 1046 insertions(+) create mode 100644 lib/backupclient/BackupStoreFileDiff.cpp (limited to 'lib/backupclient/BackupStoreFileDiff.cpp') diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp new file mode 100644 index 00000000..5705c3aa --- /dev/null +++ b/lib/backupclient/BackupStoreFileDiff.cpp @@ -0,0 +1,1046 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileDiff.cpp +// Purpose: Functions relating to diffing BackupStoreFiles +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include + +#include +#include + +#ifdef HAVE_TIME_H + #include +#elif HAVE_SYS_TIME_H + #include +#endif + +#include "BackupStoreConstants.h" +#include "BackupStoreException.h" +#include "BackupStoreFile.h" +#include "BackupStoreFileCryptVar.h" +#include "BackupStoreFileEncodeStream.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreObjectMagic.h" +#include "CommonException.h" +#include "FileStream.h" +#include "MD5Digest.h" +#include "RollingChecksum.h" +#include "Timer.h" + +#include "MemLeakFindOn.h" + +#include + +using namespace BackupStoreFileCryptVar; +using namespace BackupStoreFileCreation; + +// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. +// But it's useful for the test. +#ifndef BOX_RELEASE_BUILD + bool BackupStoreFile::TraceDetailsOfDiffProcess = false; +#endif + +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); +static void SearchForMatchingBlocks(IOStream &rFile, + std::map &rFoundBlocks, BlocksAvailableEntry *pIndex, + int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], + DiffTimer *pDiffTimer); +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, +BlocksAvailableEntry *pIndex, std::map &rFoundBlocks); +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map &rFoundBlocks, int64_t SizeOfInputFile); + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) +// Purpose: Move the file pointer in this stream to just before the block index. +// Assumes that the stream is at the beginning, seekable, and +// reading from the stream is OK. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) +{ + // Size of file + int64_t fileSize = rStream.BytesLeftToRead(); + + // Get header + file_StreamFormat hdr; + + // Read the header + if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Check magic number + if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 +#endif + ) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Work out where the index is + int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); + int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); + + // Sanity check + if(blockHeaderPosFromEnd > static_cast(fileSize - sizeof(file_StreamFormat))) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Seek to that position + rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); + + // Done. Stream now in right position (as long as the file is formatted correctly) +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) +// Purpose: Similar to EncodeFile, but takes the object ID of the file it's +// diffing from, and the index of the blocks in a stream. It'll then +// calculate which blocks can be reused from that old file. +// The timeout is the timeout value for reading the diff block index. +// If pIsCompletelyDifferent != 0, it will be set to true if the +// the two files are completely different (do not share any block), false otherwise. +// +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +std::auto_ptr BackupStoreFile::EncodeFileDiff +( + const char *Filename, int64_t ContainerID, + const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, + IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, + int64_t *pModificationTime, bool *pIsCompletelyDifferent) +{ + // Is it a symlink? + { + EMU_STRUCT_STAT st; + if(EMU_LSTAT(Filename, &st) != 0) + { + THROW_EXCEPTION(CommonException, OSFileError) + } + if((st.st_mode & S_IFLNK) == S_IFLNK) + { + // Don't do diffs for symlinks + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + } + + // Load in the blocks + BlocksAvailableEntry *pindex = 0; + int64_t blocksInIndex = 0; + bool canDiffFromThis = false; + LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); + // BOX_TRACE("Diff: Blocks in index: " << blocksInIndex); + + if(!canDiffFromThis) + { + // Don't do diffing... + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + + // Pointer to recipe we're going to create + BackupStoreFileEncodeStream::Recipe *precipe = 0; + + try + { + // Find which sizes should be scanned + int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); + + // Flag for reporting to the user + bool completelyDifferent; + + // BLOCK + { + // Search the file to find matching blocks + std::map foundBlocks; // map of offset in file to index in block index + int64_t sizeOfInputFile = 0; + // BLOCK + { + FileStream file(Filename); + // Get size of file + sizeOfInputFile = file.BytesLeftToRead(); + // Find all those lovely matching blocks + SearchForMatchingBlocks(file, foundBlocks, pindex, + blocksInIndex, sizesToScan, pDiffTimer); + + // Is it completely different? + completelyDifferent = (foundBlocks.size() == 0); + } + + // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. + precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); + BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions + pindex = 0; // Recipe now has ownership + + // Fill it in + GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); + } + // foundBlocks no longer required + + // Create the stream + std::auto_ptr stream(new BackupStoreFileEncodeStream); + + // Do the initial setup + ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); + precipe = 0; // Stream has taken ownership of this + + // Tell user about completely different status? + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = completelyDifferent; + } + + // Return the stream for the caller + return stream; + } + catch(...) + { + // cleanup + if(pindex != 0) + { + ::free(pindex); + pindex = 0; + } + if(precipe != 0) + { + delete precipe; + precipe = 0; + } + throw; + } +} + +// -------------------------------------------------------------------------- +// +// Function +// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) +// Purpose: Read in an index, and decrypt, and store in the in memory block format. +// rCanDiffFromThis is set to false if the version of the from file is too old. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) +{ + // Reset + rNumBlocksOut = 0; + rCanDiffFromThis = false; + + // Read header + file_BlockIndexHeader hdr; + if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + // Check against backwards comptaibility stuff + if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) + { + // Won't diff against old version + + // Absorb rest of stream + char buffer[2048]; + while(rBlockIndex.StreamDataLeft()) + { + rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); + } + + // Tell caller + rCanDiffFromThis = false; + return; + } +#endif + + // Check magic + if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Mark as an acceptable diff. + rCanDiffFromThis = true; + + // Get basic information + int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); + uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase); + + //TODO: Verify that these sizes look reasonable + + // Allocate space for the index + BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); + if(pindex == 0) + { + throw std::bad_alloc(); + } + + try + { + for(int64_t b = 0; b < numBlocks; ++b) + { + // Read an entry from the stream + file_BlockIndexEntry entry; + if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read entry + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Calculate IV for this entry + uint64_t iv = entryIVBase; + iv += b; + // Network byte order + iv = box_hton64(iv); + sBlowfishDecryptBlockEntry.SetIV(&iv); + + // Decrypt the encrypted section + file_BlockIndexEntryEnc entryEnc; + int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), + entry.mEnEnc, sizeof(entry.mEnEnc)); + if(sectionSize != sizeof(entryEnc)) + { + THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Store all the required information + pindex[b].mpNextInHashList = 0; // hash list not set up yet + pindex[b].mSize = ntohl(entryEnc.mSize); + pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); + ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); + } + + // Store index pointer for called + ASSERT(ppIndex != 0); + *ppIndex = pindex; + + // Store number of blocks for caller + rNumBlocksOut = numBlocks; + + } + catch(...) + { + // clean up and send the exception along its way + ::free(pindex); + throw; + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Finds the most commonly used block sizes in the index +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +{ + // Array for lengths + int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + + // Set arrays to lots of zeros (= unused entries) + for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) + { + Sizes[l] = 0; + sizeCounts[l] = 0; + } + + // Array for collecting sizes + std::map foundSizes; + + // Run through blocks and make a count of the entries + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only if the block size is bigger than the minimum size we'll scan for + if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) + { + // Find entry? + std::map::const_iterator f(foundSizes.find(pIndex[b].mSize)); + if(f != foundSizes.end()) + { + // Increment existing entry + foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; + } + else + { + // New entry + foundSizes[pIndex[b].mSize] = 1; + } + } + } + + // Make the block sizes + for(std::map::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) + { + // Find the position of the size in the array + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + // Instead of sorting on the raw count of blocks, + // take the file area covered by this block size. + if(i->second * i->first > sizeCounts[t] * Sizes[t]) + { + // Then this size belong before this entry -- shuffle them up + for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) + { + Sizes[s] = Sizes[s-1]; + sizeCounts[s] = sizeCounts[s-1]; + } + + // Insert this size + Sizes[t] = i->first; + sizeCounts[t] = i->second; + + // Shouldn't do any more searching + break; + } + } + } + + // trace the size table in debug builds +#ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + BOX_TRACE("Diff block size " << t << ": " << + Sizes[t] << " (count = " << + sizeCounts[t] << ")"); + } + } +#endif +} + + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SearchForMatchingBlocks(IOStream &, std::map &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Find the matching blocks within the file. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void SearchForMatchingBlocks(IOStream &rFile, std::map &rFoundBlocks, + BlocksAvailableEntry *pIndex, int64_t NumBlocks, + int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) +{ + Timer maximumDiffingTime(0, "MaximumDiffingTime"); + + if(pDiffTimer && pDiffTimer->IsManaged()) + { + maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), + "MaximumDiffingTime"); + } + + std::map goodnessOfFit; + + // Allocate the hash lookup table + BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); + + // Choose a size for the buffer, just a little bit more than the maximum block size + int32_t bufSize = Sizes[0]; + for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) + { + if(Sizes[z] > bufSize) bufSize = Sizes[z]; + } + bufSize += 4; + ASSERT(bufSize > Sizes[0]); + ASSERT(bufSize > 0); + if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // TODO: Because we read in the file a scanned block size at a time, + // it is likely to be inefficient. Probably will be much better to + // calculate checksums for all block sizes in a single pass. + + // Allocate the buffers. + uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); + uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); + try + { + // Check buffer allocation + if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) + { + // If a buffer got allocated, it will be cleaned up in the catch block + throw std::bad_alloc(); + } + + // Flag to abort the run, if too many blocks are found -- avoid using + // huge amounts of processor time when files contain many similar blocks. + bool abortSearch = false; + + // Search for each block size in turn + // NOTE: Do the smallest size first, so that the scheme for adding + // entries in the found list works as expected and replaces smallers block + // with larger blocks when it finds matches at the same offset in the file. + for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) + { + ASSERT(Sizes[s] <= bufSize); + BOX_TRACE("Diff pass " << s << ", for block size " << + Sizes[s]); + + // Check we haven't finished + if(Sizes[s] == 0) + { + // empty entry, try next size + continue; + } + + // Set up the hash table entries + SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); + + // Shift file position to beginning + rFile.Seek(0, IOStream::SeekType_Absolute); + + // Read first block + if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) + { + // Size of file too short to match -- do next size + continue; + } + + // Setup block pointers + uint8_t *beginnings = pbuffer0; + uint8_t *endings = pbuffer1; + int offset = 0; + + // Calculate the first checksum, ready for rolling + RollingChecksum rolling(beginnings, Sizes[s]); + + // Then roll, until the file is exhausted + int64_t fileBlockNumber = 0; + int64_t fileOffset = 0; + int rollOverInitialBytes = 0; + while(true) + { + if(maximumDiffingTime.HasExpired()) + { + ASSERT(pDiffTimer != NULL); + BOX_INFO("MaximumDiffingTime reached - " + "suspending file diff"); + abortSearch = true; + break; + } + + if(pDiffTimer) + { + pDiffTimer->DoKeepAlive(); + } + + // Load in another block of data, and record how big it is + int bytesInEndings = rFile.Read(endings, Sizes[s]); + int tmp; + + // Skip any bytes from a previous matched block + if(rollOverInitialBytes > 0 && offset < bytesInEndings) + { + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + if(goodnessOfFit.count(fileOffset)) + { + tmp = goodnessOfFit[fileOffset]; + } + else + { + tmp = 0; + } + + if(tmp >= Sizes[s]) + { + // Skip over bigger ready-matched blocks completely + rollOverInitialBytes = tmp; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + while(offset < bytesInEndings) + { + // Is current checksum in hash list? + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) + { + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); + goodnessOfFit[fileOffset] = Sizes[s]; + + // Block matched, roll the checksum forward to the next block without doing + // any more comparisons, because these are pointless (as any more matches will be ignored when + // the recipe is generated) and just take up valuable processor time. Edge cases are + // especially nasty, using huge amounts of time and memory. + int skip = Sizes[s]; + if(offset < bytesInEndings && skip > 0) + { + int spaceLeft = bytesInEndings - offset; + int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + skip -= thisRoll; + } + // Not all the bytes necessary will have been skipped, so get them + // skipped after the next block is loaded. + rollOverInitialBytes = skip; + + // End this loop, so the final byte isn't used again + break; + } + else + { + BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); + } + + int64_t NumBlocksFound = static_cast( + rFoundBlocks.size()); + int64_t MaxBlocksFound = NumBlocks * + BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; + + if(NumBlocksFound > MaxBlocksFound) + { + abortSearch = true; + break; + } + } + + // Roll checksum forward + rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); + + // Increment offsets + ++offset; + ++fileOffset; + } + + if(abortSearch) break; + + refresh: + // Finished? + if(bytesInEndings != Sizes[s]) + { + // No more data in file -- check the final block + // (Do a copy and paste of 5 lines of code instead of introducing a comparison for + // each byte of the file) + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) + { + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + goodnessOfFit[fileOffset] = Sizes[s]; + } + } + + // finish + break; + } + + // Switch buffers, reset offset + beginnings = endings; + endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer + offset = 0; + + // And count the blocks which have been done + ++fileBlockNumber; + } + + if(abortSearch) break; + } + + // Free buffers and hash table + ::free(pbuffer1); + pbuffer1 = 0; + ::free(pbuffer0); + pbuffer0 = 0; + ::free(phashTable); + phashTable = 0; + } + catch(...) + { + // Cleanup and throw + if(pbuffer1 != 0) ::free(pbuffer1); + if(pbuffer0 != 0) ::free(pbuffer0); + if(phashTable != 0) ::free(phashTable); + throw; + } + +#ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + // Trace out the found blocks in debug mode + BOX_TRACE("Diff: list of found blocks"); + BOX_TRACE("======== ======== ======== ========"); + BOX_TRACE(" Offset BlkIdx Size Movement"); + for(std::map::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) + { + int64_t orgLoc = 0; + for(int64_t b = 0; b < i->second; ++b) + { + orgLoc += pIndex[b].mSize; + } + BOX_TRACE(std::setw(8) << i->first << " " << + std::setw(8) << i->second << " " << + std::setw(8) << pIndex[i->second].mSize << + " " << + std::setw(8) << (i->first - orgLoc)); + } + BOX_TRACE("======== ======== ======== ========"); + } +#endif +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) +// Purpose: Set up the hash table ready for a scan +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) +{ + // Set all entries in the hash table to zero + ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); + + // Scan through the blocks, building the hash table + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only look at the required block size + if(pIndex[b].mSize == BlockSize) + { + // Get the value under which to hash this entry + uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); + + // Already present in table? + if(pHashTable[hash] != 0) + { + //BOX_TRACE("Another hash entry for " << hash << " found"); + // Yes -- need to set the pointer in this entry to the current entry to build the linked list + pIndex[b].mpNextInHashList = pHashTable[hash]; + } + + // Put a pointer to this entry in the hash table + pHashTable[hash] = pIndex + b; + } + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static bool SecondStageMatch(xxx) +// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, + int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map &rFoundBlocks) +{ + // Check parameters + ASSERT(pBeginnings != 0); + ASSERT(pEndings != 0); + ASSERT(Offset >= 0); + ASSERT(BlockSize > 0); + ASSERT(pFirstInHashList != 0); + ASSERT(pIndex != 0); + +#ifndef BOX_RELEASE_BUILD + uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); +#endif + uint32_t Checksum = fastSum.GetChecksum(); + + // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. + BlocksAvailableEntry *scan = pFirstInHashList; + bool found=false; + while(scan != 0) + { + if(scan->mWeakChecksum == Checksum) + { + found = true; + break; + } + scan = scan->mpNextInHashList; + } + if(!found) + { + return false; + } + + // Calculate the strong MD5 digest for this block + MD5Digest strong; + // Add the data from the beginnings + strong.Add(pBeginnings + Offset, BlockSize - Offset); + // Add any data from the endings + if(Offset > 0) + { + strong.Add(pEndings, Offset); + } + strong.Finish(); + + // Then go through the entries in the hash list, comparing with the strong digest calculated + scan = pFirstInHashList; + //BOX_TRACE("second stage match"); + while(scan != 0) + { + //BOX_TRACE("scan size " << scan->mSize << + // ", block size " << BlockSize << + // ", hash " << Hash); + ASSERT(scan->mSize == BlockSize); + ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); + + // Compare? + if(strong.DigestMatches(scan->mStrongChecksum)) + { + //BOX_TRACE("Match!\n"); + // Found! Add to list of found blocks... + int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; + int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory + + // We do NOT search for smallest blocks first, as this code originally assumed. + // To prevent this from potentially overwriting a better match, the caller must determine + // the relative "goodness" of any existing match and this one, and avoid the call if it + // could be detrimental. + rFoundBlocks[fileOffset] = blockIndex; + + // No point in searching further, report success + return true; + } + + // Next + scan = scan->mpNextInHashList; + } + + // Not matched + return false; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map &) +// Purpose: Fills in the recipe from the found block list +// Created: 15/1/04 +// +// -------------------------------------------------------------------------- +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, + int64_t NumBlocks, std::map &rFoundBlocks, int64_t SizeOfInputFile) +{ + // NOTE: This function could be a lot more sophisiticated. For example, if + // a small block overlaps a big block like this + // **** + // ******************************* + // then the small block will be used, not the big one. But it'd be better to + // just ignore the small block and keep the big one. However, some stats should + // be gathered about real world files before writing complex code which might + // go wrong. + + // Initialise a blank instruction + BackupStoreFileEncodeStream::RecipeInstruction instruction; + #define RESET_INSTRUCTION \ + instruction.mSpaceBefore = 0; \ + instruction.mBlocks = 0; \ + instruction.mpStartBlock = 0; + RESET_INSTRUCTION + + // First, a special case for when there are no found blocks + if(rFoundBlocks.size() == 0) + { + // No blocks, just a load of space + instruction.mSpaceBefore = SizeOfInputFile; + rRecipe.push_back(instruction); + + #ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + BOX_TRACE("Diff: Default recipe generated, " << + SizeOfInputFile << " bytes of file"); + } + #endif + + // Don't do anything + return; + } + + // Current location + int64_t loc = 0; + + // Then iterate through the list, generating the recipe + std::map::const_iterator i(rFoundBlocks.begin()); + ASSERT(i != rFoundBlocks.end()); // check logic + + // Counting for debug tracing +#ifndef BOX_RELEASE_BUILD + int64_t debug_NewBytesFound = 0; + int64_t debug_OldBlocksUsed = 0; +#endif + + for(; i != rFoundBlocks.end(); ++i) + { + // Remember... map is (position in file) -> (index of block in pIndex) + + if(i->first < loc) + { + // This block overlaps the last one + continue; + } + else if(i->first > loc) + { + // There's a gap between the end of the last thing and this block. + // If there's an instruction waiting, push it onto the list + if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) + { + rRecipe.push_back(instruction); + } + // Start a new instruction, with the gap ready + RESET_INSTRUCTION + instruction.mSpaceBefore = i->first - loc; + // Move location forward to match + loc += instruction.mSpaceBefore; +#ifndef BOX_RELEASE_BUILD + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + } + + // First, does the current instruction need pushing back, because this block is not + // sequential to the last one? + if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) + { + rRecipe.push_back(instruction); + RESET_INSTRUCTION + } + + // Add in this block + if(instruction.mpStartBlock == 0) + { + // This block starts a new instruction + instruction.mpStartBlock = pIndex + i->second; + instruction.mBlocks = 1; + } + else + { + // It continues the previous section of blocks + instruction.mBlocks += 1; + } + +#ifndef BOX_RELEASE_BUILD + debug_OldBlocksUsed++; +#endif + + // Move location forward + loc += pIndex[i->second].mSize; + } + + // Push the last instruction generated + rRecipe.push_back(instruction); + + // Is there any space left at the end which needs sending? + if(loc != SizeOfInputFile) + { + RESET_INSTRUCTION + instruction.mSpaceBefore = SizeOfInputFile - loc; +#ifndef BOX_RELEASE_BUILD + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + rRecipe.push_back(instruction); + } + + // dump out the recipe +#ifndef BOX_RELEASE_BUILD + BOX_TRACE("Diff: " << + debug_NewBytesFound << " new bytes found, " << + debug_OldBlocksUsed << " old blocks used"); + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); + BOX_TRACE("======== ========= ========"); + BOX_TRACE("Space b4 FirstBlk NumBlks"); + { + for(unsigned int e = 0; e < rRecipe.size(); ++e) + { + char b[64]; +#ifdef WIN32 + sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); +#else + sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); +#endif + BOX_TRACE(std::setw(8) << + rRecipe[e].mSpaceBefore << + " " << + ((rRecipe[e].mpStartBlock == 0)?" -":b) << + " " << std::setw(8) << + rRecipe[e].mBlocks); + } + } + BOX_TRACE("======== ========= ========"); + } +#endif +} -- cgit v1.2.3