diff options
Diffstat (limited to 'lib/backupclient/BackupStoreFileDiff.cpp')
-rwxr-xr-x | lib/backupclient/BackupStoreFileDiff.cpp | 973 |
1 files changed, 973 insertions, 0 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp new file mode 100755 index 00000000..b72ce328 --- /dev/null +++ b/lib/backupclient/BackupStoreFileDiff.cpp @@ -0,0 +1,973 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileDiff.cpp +// Purpose: Functions relating to diffing BackupStoreFiles +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include <new> +#include <map> +#include <signal.h> +#include <sys/time.h> + +#include "BackupStoreFile.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreFileCryptVar.h" +#include "BackupStoreObjectMagic.h" +#include "BackupStoreException.h" +#include "BackupStoreFileEncodeStream.h" +#include "BackupStoreConstants.h" +#include "FileStream.h" +#include "RollingChecksum.h" +#include "MD5Digest.h" +#include "CommonException.h" + +#include "MemLeakFindOn.h" + +using namespace BackupStoreFileCryptVar; +using namespace BackupStoreFileCreation; + +// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. +// But it's useful for the test. +#ifndef NDEBUG + bool BackupStoreFile::TraceDetailsOfDiffProcess = false; +#endif + +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); +static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); + +// Avoid running on too long with diffs +static int sMaximumDiffTime = 10; // maximum time to spend diffing +static bool sDiffTimedOut = false; +static bool sSetTimerSignelHandler = false; +static void TimerSignalHandler(int signal); +static void StartDiffTimer(); + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::SetMaximumDiffingTime(int) +// Purpose: Sets the maximum time to spend diffing, in seconds. Time is +// process virutal time. +// Created: 19/3/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::SetMaximumDiffingTime(int Seconds) +{ + sMaximumDiffTime = Seconds; + TRACE1("Set maximum diffing time to %d seconds\n", Seconds); +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) +// Purpose: Move the file pointer in this stream to just before the block index. +// Assumes that the stream is at the beginning, seekable, and +// reading from the stream is OK. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) +{ + // Size of file + int64_t fileSize = rStream.BytesLeftToRead(); + + // Get header + file_StreamFormat hdr; + + // Read the header + if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Check magic number + if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 +#endif + ) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Work out where the index is + int64_t numBlocks = ntoh64(hdr.mNumBlocks); + int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); + + // Sanity check + if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat))) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Seek to that position + rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); + + // Done. Stream now in right position (as long as the file is formatted correctly) +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) +// Purpose: Similar to EncodeFile, but takes the object ID of the file it's +// diffing from, and the index of the blocks in a stream. It'll then +// calculate which blocks can be reused from that old file. +// The timeout is the timeout value for reading the diff block index. +// If pIsCompletelyDifferent != 0, it will be set to true if the +// the two files are completely different (do not share any block), false otherwise. +// +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff(const char *Filename, int64_t ContainerID, + const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, IOStream &rDiffFromBlockIndex, + int Timeout, int64_t *pModificationTime, bool *pIsCompletelyDifferent) +{ + // Is it a symlink? + { + struct stat st; + if(::lstat(Filename, &st) != 0) + { + THROW_EXCEPTION(CommonException, OSFileError) + } + if((st.st_mode & S_IFLNK) == S_IFLNK) + { + // Don't do diffs for symlinks + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + } + + // Load in the blocks + BlocksAvailableEntry *pindex = 0; + int64_t blocksInIndex = 0; + bool canDiffFromThis = false; + LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); + //TRACE1("Diff: Blocks in index: %lld\n", blocksInIndex); + + if(!canDiffFromThis) + { + // Don't do diffing... + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + + // Pointer to recipe we're going to create + BackupStoreFileEncodeStream::Recipe *precipe = 0; + + // Start the timeout timer, so that the operation doesn't continue for ever + StartDiffTimer(); + + try + { + // Find which sizes should be scanned + int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); + + // Flag for reporting to the user + bool completelyDifferent; + + // BLOCK + { + // Search the file to find matching blocks + std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index + int64_t sizeOfInputFile = 0; + // BLOCK + { + FileStream file(Filename); + // Get size of file + sizeOfInputFile = file.BytesLeftToRead(); + // Find all those lovely matching blocks + SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, sizesToScan); + + // Is it completely different? + completelyDifferent = (foundBlocks.size() == 0); + } + + // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. + precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); + BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions + pindex = 0; // Recipe now has ownership + + // Fill it in + GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); + } + // foundBlocks no longer required + + // Create the stream + std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream); + + // Do the initial setup + ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); + precipe = 0; // Stream has taken ownership of this + + // Tell user about completely different status? + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = completelyDifferent; + } + + // Return the stream for the caller + return stream; + } + catch(...) + { + // cleanup + if(pindex != 0) + { + ::free(pindex); + pindex = 0; + } + if(precipe != 0) + { + delete precipe; + precipe = 0; + } + throw; + } +} + +// -------------------------------------------------------------------------- +// +// Function +// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) +// Purpose: Read in an index, and decrypt, and store in the in memory block format. +// rCanDiffFromThis is set to false if the version of the from file is too old. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) +{ + // Reset + rNumBlocksOut = 0; + rCanDiffFromThis = false; + + // Read header + file_BlockIndexHeader hdr; + if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + // Check against backwards comptaibility stuff + if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) + { + // Won't diff against old version + + // Absorb rest of stream + char buffer[2048]; + while(rBlockIndex.StreamDataLeft()) + { + rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); + } + + // Tell caller + rCanDiffFromThis = false; + return; + } +#endif + + // Check magic + if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)ntoh64(hdr.mOtherFileID)) != 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Mark as an acceptable diff. + rCanDiffFromThis = true; + + // Get basic information + int64_t numBlocks = ntoh64(hdr.mNumBlocks); + uint64_t entryIVBase = ntoh64(hdr.mEntryIVBase); + + //TODO: Verify that these sizes look reasonable + + // Allocate space for the index + BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); + if(pindex == 0) + { + throw std::bad_alloc(); + } + + try + { + for(int64_t b = 0; b < numBlocks; ++b) + { + // Read an entry from the stream + file_BlockIndexEntry entry; + if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read entry + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Calculate IV for this entry + uint64_t iv = entryIVBase; + iv += b; + // Network byte order + iv = hton64(iv); + sBlowfishDecryptBlockEntry.SetIV(&iv); + + // Decrypt the encrypted section + file_BlockIndexEntryEnc entryEnc; + int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), + entry.mEnEnc, sizeof(entry.mEnEnc)); + if(sectionSize != sizeof(entryEnc)) + { + THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)ntoh64(entry.mEncodedSize)) <= 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Store all the required information + pindex[b].mpNextInHashList = 0; // hash list not set up yet + pindex[b].mSize = ntohl(entryEnc.mSize); + pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); + ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); + } + + // Store index pointer for called + ASSERT(ppIndex != 0); + *ppIndex = pindex; + + // Store number of blocks for caller + rNumBlocksOut = numBlocks; + + } + catch(...) + { + // clean up and send the exception along its way + ::free(pindex); + throw; + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Finds the most commonly used block sizes in the index +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +{ + // Array for lengths + int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + + // Set arrays to lots of zeros (= unused entries) + for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) + { + Sizes[l] = 0; + sizeCounts[l] = 0; + } + + // Array for collecting sizes + std::map<int32_t, int64_t> foundSizes; + + // Run through blocks and make a count of the entries + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only if the block size is bigger than the minimum size we'll scan for + if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) + { + // Find entry? + std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); + if(f != foundSizes.end()) + { + // Increment existing entry + foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; + } + else + { + // New entry + foundSizes[pIndex[b].mSize] = 1; + } + } + } + + // Make the block sizes + for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) + { + // Find the position of the size in the array + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + if(i->second > sizeCounts[t]) + { + // Then this size belong before this entry -- shuffle them up + for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) + { + Sizes[s] = Sizes[s-1]; + sizeCounts[s] = sizeCounts[s-1]; + } + + // Insert this size + Sizes[t] = i->first; + sizeCounts[t] = i->second; + + // Shouldn't do any more searching + break; + } + } + } + + // trace the size table in debug builds +#ifndef NDEBUG + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]); + } + } +#endif +} + + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Find the matching blocks within the file. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, + BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +{ + // Allocate the hash lookup table + BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); + + // Choose a size for the buffer, just a little bit more than the maximum block size + int32_t bufSize = Sizes[0]; + for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) + { + if(Sizes[z] > bufSize) bufSize = Sizes[z]; + } + bufSize += 4; + ASSERT(bufSize > Sizes[0]); + ASSERT(bufSize > 0); + if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // TODO: Use buffered file class + // Because we read in the file a scanned block size at a time, it is likely to be + // inefficient. Probably will be much better to use a buffering IOStream class which + // reads data in at the size of the filesystem block size. + + // Allocate the buffers. + uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); + uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); + try + { + // Check buffer allocation + if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) + { + // If a buffer got alloocated, it will be cleaned up in the catch block + throw std::bad_alloc(); + } + + // Flag to abort the run, if too many blocks are found -- avoid using + // huge amounts of processor time when files contain many similar blocks. + bool abortSearch = false; + + // Search for each block size in turn + // NOTE: Do the smallest size first, so that the scheme for adding + // entries in the found list works as expected and replaces smallers block + // with larger blocks when it finds matches at the same offset in the file. + for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) + { + ASSERT(Sizes[s] <= bufSize); + //TRACE2("Diff pass %d, for block size %d\n", s, Sizes[s]); + + // Check we haven't finished + if(Sizes[s] == 0) + { + // empty entry, try next size + continue; + } + + // Set up the hash table entries + SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); + + // Shift file position to beginning + rFile.Seek(0, IOStream::SeekType_Absolute); + + // Read first block + if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) + { + // Size of file too short to match -- do next size + continue; + } + + // Setup block pointers + uint8_t *beginnings = pbuffer0; + uint8_t *endings = pbuffer1; + int offset = 0; + + // Calculate the first checksum, ready for rolling + RollingChecksum rolling(beginnings, Sizes[s]); + + // Then roll, until the file is exhausted + int64_t fileBlockNumber = 0; + int rollOverInitialBytes = 0; + while(true) + { + // Load in another block of data, and record how big it is + int bytesInEndings = rFile.Read(endings, Sizes[s]); + + // Skip any bytes from a previous matched block + while(rollOverInitialBytes > 0 && offset < bytesInEndings) + { + rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); + ++offset; + --rollOverInitialBytes; + } + + while(offset < bytesInEndings) + { + // Put is current checksum in hash list? + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0) + { + if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + // Block matched, roll the checksum forward to the next block without doing + // any more comparisons, because these are pointless (as any more matches will be ignored when + // the receipe is generated) and just take up valuable processor time. Edge cases are + // especially nasty, using huge amounts of time and memory. + int skip = Sizes[s]; + while(offset < bytesInEndings && skip > 0) + { + rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); + ++offset; + --skip; + } + // Not all the bytes necessary will have been skipped, so get them + // skipped after the next block is loaded. + rollOverInitialBytes = skip; + + // End this loop, so the final byte isn't used again + break; + } + + if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) + || sDiffTimedOut) + { + abortSearch = true; + break; + } + } + + // Roll checksum forward + rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); + + // Increment offset + ++offset; + } + + if(abortSearch) break; + + // Finished? + if(bytesInEndings != Sizes[s]) + { + // No more data in file -- check the final block + // (Do a copy and paste of 5 lines of code instead of introducing a comparison for + // each byte of the file) + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0) + { + SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks); + } + + // finish + break; + } + + // Switch buffers, reset offset + beginnings = endings; + endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer + offset = 0; + + // And count the blocks which have been done + ++fileBlockNumber; + } + + if(abortSearch) break; + } + + // Free buffers and hash table + ::free(pbuffer1); + pbuffer1 = 0; + ::free(pbuffer0); + pbuffer0 = 0; + ::free(phashTable); + phashTable = 0; + } + catch(...) + { + // Cleanup and throw + if(pbuffer1 != 0) ::free(pbuffer1); + if(pbuffer0 != 0) ::free(pbuffer0); + if(phashTable != 0) ::free(phashTable); + throw; + } + +#ifndef NDEBUG + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + // Trace out the found blocks in debug mode + TRACE0("Diff: list of found blocks\n======== ======== ======== ========\n Offset BlkIdx Size Movement\n"); + for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) + { + int64_t orgLoc = 0; + for(int64_t b = 0; b < i->second; ++b) + { + orgLoc += pIndex[b].mSize; + } + TRACE4("%8lld %8lld %8lld %8lld\n", i->first, i->second, + (int64_t)(pIndex[i->second].mSize), i->first - orgLoc); + } + TRACE0("======== ======== ======== ========\n"); + } +#endif +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) +// Purpose: Set up the hash table ready for a scan +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) +{ + // Set all entries in the hash table to zero + ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); + + // Scan through the blocks, building the hash table + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only look at the required block size + if(pIndex[b].mSize == BlockSize) + { + // Get the value under which to hash this entry + uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); + + // Already present in table? + if(pHashTable[hash] != 0) + { + //TRACE1("Another hash entry for %d found\n", hash); + // Yes -- need to set the pointer in this entry to the current entry to build the linked list + pIndex[b].mpNextInHashList = pHashTable[hash]; + } + + // Put a pointer to this entry in the hash table + pHashTable[hash] = pIndex + b; + } + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static bool SecondStageMatch(xxx) +// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, + int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) +{ + // Check parameters + ASSERT(pBeginnings != 0); + ASSERT(pEndings != 0); + ASSERT(Offset >= 0); + ASSERT(BlockSize > 0); + ASSERT(pFirstInHashList != 0); + ASSERT(pIndex != 0); + + // Calculate the strong MD5 digest for this block + MD5Digest strong; + // Add the data from the beginnings + strong.Add(pBeginnings + Offset, BlockSize - Offset); + // Add any data from the endings + if(Offset > 0) + { + strong.Add(pEndings, Offset); + } + strong.Finish(); + + // Then go through the entries in the hash list, comparing with the strong digest calculated + BlocksAvailableEntry *scan = pFirstInHashList; + //TRACE0("second stage match\n"); + while(scan != 0) + { + //TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash); + ASSERT(scan->mSize == BlockSize); + ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == Hash); + + // Compare? + if(strong.DigestMatches(scan->mStrongChecksum)) + { + //TRACE0("Match!\n"); + // Found! Add to list of found blocks... + int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; + int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory + + // Because we search for smallest blocks first, if there's an entry there + // we'll just be replacing it with a larger one, which is great news. + rFoundBlocks[fileOffset] = blockIndex; + + // No point in searching further, report success + return true; + } + + // Next + scan = scan->mpNextInHashList; + } + + // Not matched + return false; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &) +// Purpose: Fills in the recipe from the found block list +// Created: 15/1/04 +// +// -------------------------------------------------------------------------- +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, + int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile) +{ + // NOTE: This function could be a lot more sophisiticated. For example, if + // a small block overlaps a big block like this + // **** + // ******************************* + // then the small block will be used, not the big one. But it'd be better to + // just ignore the small block and keep the big one. However, some stats should + // be gathered about real world files before writing complex code which might + // go wrong. + + // Initialise a blank instruction + BackupStoreFileEncodeStream::RecipeInstruction instruction; + #define RESET_INSTRUCTION \ + instruction.mSpaceBefore = 0; \ + instruction.mBlocks = 0; \ + instruction.mpStartBlock = 0; + RESET_INSTRUCTION + + // First, a special case for when there are no found blocks + if(rFoundBlocks.size() == 0) + { + // No blocks, just a load of space + instruction.mSpaceBefore = SizeOfInputFile; + rRecipe.push_back(instruction); + + #ifndef NDEBUG + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); + } + #endif + + // Don't do anything + return; + } + + // Current location + int64_t loc = 0; + + // Then iterate through the list, generating the recipe + std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); + ASSERT(i != rFoundBlocks.end()); // check logic + + // Counting for debug tracing +#ifndef NDEBUG + int64_t debug_NewBytesFound = 0; + int64_t debug_OldBlocksUsed = 0; +#endif + + for(; i != rFoundBlocks.end(); ++i) + { + // Remember... map is (position in file) -> (index of block in pIndex) + + if(i->first < loc) + { + // This block overlaps the last one + continue; + } + else if(i->first > loc) + { + // There's a gap between the end of the last thing and this block. + // If there's an instruction waiting, push it onto the list + if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) + { + rRecipe.push_back(instruction); + } + // Start a new instruction, with the gap ready + RESET_INSTRUCTION + instruction.mSpaceBefore = i->first - loc; + // Move location forward to match + loc += instruction.mSpaceBefore; +#ifndef NDEBUG + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + } + + // First, does the current instruction need pushing back, because this block is not + // sequential to the last one? + if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) + { + rRecipe.push_back(instruction); + RESET_INSTRUCTION + } + + // Add in this block + if(instruction.mpStartBlock == 0) + { + // This block starts a new instruction + instruction.mpStartBlock = pIndex + i->second; + instruction.mBlocks = 1; + } + else + { + // It continues the previous section of blocks + instruction.mBlocks += 1; + } + +#ifndef NDEBUG + debug_OldBlocksUsed++; +#endif + + // Move location forward + loc += pIndex[i->second].mSize; + } + + // Push the last instruction generated + rRecipe.push_back(instruction); + + // Is there any space left at the end which needs sending? + if(loc != SizeOfInputFile) + { + RESET_INSTRUCTION + instruction.mSpaceBefore = SizeOfInputFile - loc; +#ifndef NDEBUG + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + rRecipe.push_back(instruction); + } + + // dump out the recipe +#ifndef NDEBUG + TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed); + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size()); + { + for(unsigned int e = 0; e < rRecipe.size(); ++e) + { + char b[64]; + sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); + TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); + } + } + TRACE0("======== ========= ========\n"); + } +#endif +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static TimerSignalHandler(int) +// Purpose: Signal handler +// Created: 19/3/04 +// +// -------------------------------------------------------------------------- +void TimerSignalHandler(int signal) +{ + sDiffTimedOut = true; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static StartDiffTimer() +// Purpose: Starts the diff timeout timer +// Created: 19/3/04 +// +// -------------------------------------------------------------------------- +void StartDiffTimer() +{ + // Set timer signal handler + if(!sSetTimerSignelHandler) + { + ::signal(SIGVTALRM, TimerSignalHandler); + sSetTimerSignelHandler = true; + } + + struct itimerval timeout; + // Don't want this to repeat + timeout.it_interval.tv_sec = 0; + timeout.it_interval.tv_usec = 0; + // Single timeout after the specified number of seconds + timeout.it_value.tv_sec = sMaximumDiffTime; + timeout.it_value.tv_usec = 0; + // Set timer + if(::setitimer(ITIMER_VIRTUAL, &timeout, NULL) != 0) + { + TRACE0("WARNING: couldn't set diff timeout\n"); + } + + // Unset flag (last thing) + sDiffTimedOut = false; +} + + + |