summaryrefslogtreecommitdiff
path: root/lib/backupclient/BackupStoreFileDiff.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/backupclient/BackupStoreFileDiff.cpp')
-rwxr-xr-xlib/backupclient/BackupStoreFileDiff.cpp973
1 files changed, 973 insertions, 0 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
new file mode 100755
index 00000000..b72ce328
--- /dev/null
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -0,0 +1,973 @@
+// --------------------------------------------------------------------------
+//
+// File
+// Name: BackupStoreFileDiff.cpp
+// Purpose: Functions relating to diffing BackupStoreFiles
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+
+#include "Box.h"
+
+#include <new>
+#include <map>
+#include <signal.h>
+#include <sys/time.h>
+
+#include "BackupStoreFile.h"
+#include "BackupStoreFileWire.h"
+#include "BackupStoreFileCryptVar.h"
+#include "BackupStoreObjectMagic.h"
+#include "BackupStoreException.h"
+#include "BackupStoreFileEncodeStream.h"
+#include "BackupStoreConstants.h"
+#include "FileStream.h"
+#include "RollingChecksum.h"
+#include "MD5Digest.h"
+#include "CommonException.h"
+
+#include "MemLeakFindOn.h"
+
+using namespace BackupStoreFileCryptVar;
+using namespace BackupStoreFileCreation;
+
+// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
+// But it's useful for the test.
+#ifndef NDEBUG
+ bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
+#endif
+
+static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
+static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
+static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
+
+// Avoid running on too long with diffs
+static int sMaximumDiffTime = 10; // maximum time to spend diffing
+static bool sDiffTimedOut = false;
+static bool sSetTimerSignelHandler = false;
+static void TimerSignalHandler(int signal);
+static void StartDiffTimer();
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: BackupStoreFile::SetMaximumDiffingTime(int)
+// Purpose: Sets the maximum time to spend diffing, in seconds. Time is
+// process virutal time.
+// Created: 19/3/04
+//
+// --------------------------------------------------------------------------
+void BackupStoreFile::SetMaximumDiffingTime(int Seconds)
+{
+ sMaximumDiffTime = Seconds;
+ TRACE1("Set maximum diffing time to %d seconds\n", Seconds);
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &)
+// Purpose: Move the file pointer in this stream to just before the block index.
+// Assumes that the stream is at the beginning, seekable, and
+// reading from the stream is OK.
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream)
+{
+ // Size of file
+ int64_t fileSize = rStream.BytesLeftToRead();
+
+ // Get header
+ file_StreamFormat hdr;
+
+ // Read the header
+ if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite))
+ {
+ // Couldn't read header
+ THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
+ }
+
+ // Check magic number
+ if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1
+#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
+ && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0
+#endif
+ )
+ {
+ THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
+ }
+
+ // Work out where the index is
+ int64_t numBlocks = ntoh64(hdr.mNumBlocks);
+ int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader));
+
+ // Sanity check
+ if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat)))
+ {
+ THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
+ }
+
+ // Seek to that position
+ rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End);
+
+ // Done. Stream now in right position (as long as the file is formatted correctly)
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *)
+// Purpose: Similar to EncodeFile, but takes the object ID of the file it's
+// diffing from, and the index of the blocks in a stream. It'll then
+// calculate which blocks can be reused from that old file.
+// The timeout is the timeout value for reading the diff block index.
+// If pIsCompletelyDifferent != 0, it will be set to true if the
+// the two files are completely different (do not share any block), false otherwise.
+//
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff(const char *Filename, int64_t ContainerID,
+ const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, IOStream &rDiffFromBlockIndex,
+ int Timeout, int64_t *pModificationTime, bool *pIsCompletelyDifferent)
+{
+ // Is it a symlink?
+ {
+ struct stat st;
+ if(::lstat(Filename, &st) != 0)
+ {
+ THROW_EXCEPTION(CommonException, OSFileError)
+ }
+ if((st.st_mode & S_IFLNK) == S_IFLNK)
+ {
+ // Don't do diffs for symlinks
+ if(pIsCompletelyDifferent != 0)
+ {
+ *pIsCompletelyDifferent = true;
+ }
+ return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
+ }
+ }
+
+ // Load in the blocks
+ BlocksAvailableEntry *pindex = 0;
+ int64_t blocksInIndex = 0;
+ bool canDiffFromThis = false;
+ LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis);
+ //TRACE1("Diff: Blocks in index: %lld\n", blocksInIndex);
+
+ if(!canDiffFromThis)
+ {
+ // Don't do diffing...
+ if(pIsCompletelyDifferent != 0)
+ {
+ *pIsCompletelyDifferent = true;
+ }
+ return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
+ }
+
+ // Pointer to recipe we're going to create
+ BackupStoreFileEncodeStream::Recipe *precipe = 0;
+
+ // Start the timeout timer, so that the operation doesn't continue for ever
+ StartDiffTimer();
+
+ try
+ {
+ // Find which sizes should be scanned
+ int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+ FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
+
+ // Flag for reporting to the user
+ bool completelyDifferent;
+
+ // BLOCK
+ {
+ // Search the file to find matching blocks
+ std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index
+ int64_t sizeOfInputFile = 0;
+ // BLOCK
+ {
+ FileStream file(Filename);
+ // Get size of file
+ sizeOfInputFile = file.BytesLeftToRead();
+ // Find all those lovely matching blocks
+ SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, sizesToScan);
+
+ // Is it completely different?
+ completelyDifferent = (foundBlocks.size() == 0);
+ }
+
+ // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe.
+ precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID));
+ BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions
+ pindex = 0; // Recipe now has ownership
+
+ // Fill it in
+ GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile);
+ }
+ // foundBlocks no longer required
+
+ // Create the stream
+ std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream);
+
+ // Do the initial setup
+ ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime);
+ precipe = 0; // Stream has taken ownership of this
+
+ // Tell user about completely different status?
+ if(pIsCompletelyDifferent != 0)
+ {
+ *pIsCompletelyDifferent = completelyDifferent;
+ }
+
+ // Return the stream for the caller
+ return stream;
+ }
+ catch(...)
+ {
+ // cleanup
+ if(pindex != 0)
+ {
+ ::free(pindex);
+ pindex = 0;
+ }
+ if(precipe != 0)
+ {
+ delete precipe;
+ precipe = 0;
+ }
+ throw;
+ }
+}
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &)
+// Purpose: Read in an index, and decrypt, and store in the in memory block format.
+// rCanDiffFromThis is set to false if the version of the from file is too old.
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis)
+{
+ // Reset
+ rNumBlocksOut = 0;
+ rCanDiffFromThis = false;
+
+ // Read header
+ file_BlockIndexHeader hdr;
+ if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout))
+ {
+ // Couldn't read header
+ THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
+ }
+
+#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
+ // Check against backwards comptaibility stuff
+ if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0))
+ {
+ // Won't diff against old version
+
+ // Absorb rest of stream
+ char buffer[2048];
+ while(rBlockIndex.StreamDataLeft())
+ {
+ rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */);
+ }
+
+ // Tell caller
+ rCanDiffFromThis = false;
+ return;
+ }
+#endif
+
+ // Check magic
+ if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1))
+ {
+ THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
+ }
+
+ // Check that we're not trying to diff against a file which references blocks from another file
+ if(((int64_t)ntoh64(hdr.mOtherFileID)) != 0)
+ {
+ THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
+ }
+
+ // Mark as an acceptable diff.
+ rCanDiffFromThis = true;
+
+ // Get basic information
+ int64_t numBlocks = ntoh64(hdr.mNumBlocks);
+ uint64_t entryIVBase = ntoh64(hdr.mEntryIVBase);
+
+ //TODO: Verify that these sizes look reasonable
+
+ // Allocate space for the index
+ BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks);
+ if(pindex == 0)
+ {
+ throw std::bad_alloc();
+ }
+
+ try
+ {
+ for(int64_t b = 0; b < numBlocks; ++b)
+ {
+ // Read an entry from the stream
+ file_BlockIndexEntry entry;
+ if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout))
+ {
+ // Couldn't read entry
+ THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
+ }
+
+ // Calculate IV for this entry
+ uint64_t iv = entryIVBase;
+ iv += b;
+ // Network byte order
+ iv = hton64(iv);
+ sBlowfishDecryptBlockEntry.SetIV(&iv);
+
+ // Decrypt the encrypted section
+ file_BlockIndexEntryEnc entryEnc;
+ int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc),
+ entry.mEnEnc, sizeof(entry.mEnEnc));
+ if(sectionSize != sizeof(entryEnc))
+ {
+ THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength)
+ }
+
+ // Check that we're not trying to diff against a file which references blocks from another file
+ if(((int64_t)ntoh64(entry.mEncodedSize)) <= 0)
+ {
+ THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
+ }
+
+ // Store all the required information
+ pindex[b].mpNextInHashList = 0; // hash list not set up yet
+ pindex[b].mSize = ntohl(entryEnc.mSize);
+ pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum);
+ ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum));
+ }
+
+ // Store index pointer for called
+ ASSERT(ppIndex != 0);
+ *ppIndex = pindex;
+
+ // Store number of blocks for caller
+ rNumBlocksOut = numBlocks;
+
+ }
+ catch(...)
+ {
+ // clean up and send the exception along its way
+ ::free(pindex);
+ throw;
+ }
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+// Purpose: Finds the most commonly used block sizes in the index
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+{
+ // Array for lengths
+ int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
+
+ // Set arrays to lots of zeros (= unused entries)
+ for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
+ {
+ Sizes[l] = 0;
+ sizeCounts[l] = 0;
+ }
+
+ // Array for collecting sizes
+ std::map<int32_t, int64_t> foundSizes;
+
+ // Run through blocks and make a count of the entries
+ for(int64_t b = 0; b < NumBlocks; ++b)
+ {
+ // Only if the block size is bigger than the minimum size we'll scan for
+ if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
+ {
+ // Find entry?
+ std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
+ if(f != foundSizes.end())
+ {
+ // Increment existing entry
+ foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
+ }
+ else
+ {
+ // New entry
+ foundSizes[pIndex[b].mSize] = 1;
+ }
+ }
+ }
+
+ // Make the block sizes
+ for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
+ {
+ // Find the position of the size in the array
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
+ {
+ if(i->second > sizeCounts[t])
+ {
+ // Then this size belong before this entry -- shuffle them up
+ for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
+ {
+ Sizes[s] = Sizes[s-1];
+ sizeCounts[s] = sizeCounts[s-1];
+ }
+
+ // Insert this size
+ Sizes[t] = i->first;
+ sizeCounts[t] = i->second;
+
+ // Shouldn't do any more searching
+ break;
+ }
+ }
+ }
+
+ // trace the size table in debug builds
+#ifndef NDEBUG
+ if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ {
+ for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
+ {
+ TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]);
+ }
+ }
+#endif
+}
+
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+// Purpose: Find the matching blocks within the file.
+// Created: 12/1/04
+//
+// --------------------------------------------------------------------------
+static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
+ BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
+{
+ // Allocate the hash lookup table
+ BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
+
+ // Choose a size for the buffer, just a little bit more than the maximum block size
+ int32_t bufSize = Sizes[0];
+ for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
+ {
+ if(Sizes[z] > bufSize) bufSize = Sizes[z];
+ }
+ bufSize += 4;
+ ASSERT(bufSize > Sizes[0]);
+ ASSERT(bufSize > 0);
+ if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
+ {
+ THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
+ }
+
+ // TODO: Use buffered file class
+ // Because we read in the file a scanned block size at a time, it is likely to be
+ // inefficient. Probably will be much better to use a buffering IOStream class which
+ // reads data in at the size of the filesystem block size.
+
+ // Allocate the buffers.
+ uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
+ uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
+ try
+ {
+ // Check buffer allocation
+ if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
+ {
+ // If a buffer got alloocated, it will be cleaned up in the catch block
+ throw std::bad_alloc();
+ }
+
+ // Flag to abort the run, if too many blocks are found -- avoid using
+ // huge amounts of processor time when files contain many similar blocks.
+ bool abortSearch = false;
+
+ // Search for each block size in turn
+ // NOTE: Do the smallest size first, so that the scheme for adding
+ // entries in the found list works as expected and replaces smallers block
+ // with larger blocks when it finds matches at the same offset in the file.
+ for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
+ {
+ ASSERT(Sizes[s] <= bufSize);
+ //TRACE2("Diff pass %d, for block size %d\n", s, Sizes[s]);
+
+ // Check we haven't finished
+ if(Sizes[s] == 0)
+ {
+ // empty entry, try next size
+ continue;
+ }
+
+ // Set up the hash table entries
+ SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
+
+ // Shift file position to beginning
+ rFile.Seek(0, IOStream::SeekType_Absolute);
+
+ // Read first block
+ if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
+ {
+ // Size of file too short to match -- do next size
+ continue;
+ }
+
+ // Setup block pointers
+ uint8_t *beginnings = pbuffer0;
+ uint8_t *endings = pbuffer1;
+ int offset = 0;
+
+ // Calculate the first checksum, ready for rolling
+ RollingChecksum rolling(beginnings, Sizes[s]);
+
+ // Then roll, until the file is exhausted
+ int64_t fileBlockNumber = 0;
+ int rollOverInitialBytes = 0;
+ while(true)
+ {
+ // Load in another block of data, and record how big it is
+ int bytesInEndings = rFile.Read(endings, Sizes[s]);
+
+ // Skip any bytes from a previous matched block
+ while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ {
+ rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
+ ++offset;
+ --rollOverInitialBytes;
+ }
+
+ while(offset < bytesInEndings)
+ {
+ // Put is current checksum in hash list?
+ uint16_t hash = rolling.GetComponentForHashing();
+ if(phashTable[hash] != 0)
+ {
+ if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ {
+ // Block matched, roll the checksum forward to the next block without doing
+ // any more comparisons, because these are pointless (as any more matches will be ignored when
+ // the receipe is generated) and just take up valuable processor time. Edge cases are
+ // especially nasty, using huge amounts of time and memory.
+ int skip = Sizes[s];
+ while(offset < bytesInEndings && skip > 0)
+ {
+ rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
+ ++offset;
+ --skip;
+ }
+ // Not all the bytes necessary will have been skipped, so get them
+ // skipped after the next block is loaded.
+ rollOverInitialBytes = skip;
+
+ // End this loop, so the final byte isn't used again
+ break;
+ }
+
+ if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
+ || sDiffTimedOut)
+ {
+ abortSearch = true;
+ break;
+ }
+ }
+
+ // Roll checksum forward
+ rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
+
+ // Increment offset
+ ++offset;
+ }
+
+ if(abortSearch) break;
+
+ // Finished?
+ if(bytesInEndings != Sizes[s])
+ {
+ // No more data in file -- check the final block
+ // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
+ // each byte of the file)
+ uint16_t hash = rolling.GetComponentForHashing();
+ if(phashTable[hash] != 0)
+ {
+ SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+ }
+
+ // finish
+ break;
+ }
+
+ // Switch buffers, reset offset
+ beginnings = endings;
+ endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
+ offset = 0;
+
+ // And count the blocks which have been done
+ ++fileBlockNumber;
+ }
+
+ if(abortSearch) break;
+ }
+
+ // Free buffers and hash table
+ ::free(pbuffer1);
+ pbuffer1 = 0;
+ ::free(pbuffer0);
+ pbuffer0 = 0;
+ ::free(phashTable);
+ phashTable = 0;
+ }
+ catch(...)
+ {
+ // Cleanup and throw
+ if(pbuffer1 != 0) ::free(pbuffer1);
+ if(pbuffer0 != 0) ::free(pbuffer0);
+ if(phashTable != 0) ::free(phashTable);
+ throw;
+ }
+
+#ifndef NDEBUG
+ if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ {
+ // Trace out the found blocks in debug mode
+ TRACE0("Diff: list of found blocks\n======== ======== ======== ========\n Offset BlkIdx Size Movement\n");
+ for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
+ {
+ int64_t orgLoc = 0;
+ for(int64_t b = 0; b < i->second; ++b)
+ {
+ orgLoc += pIndex[b].mSize;
+ }
+ TRACE4("%8lld %8lld %8lld %8lld\n", i->first, i->second,
+ (int64_t)(pIndex[i->second].mSize), i->first - orgLoc);
+ }
+ TRACE0("======== ======== ======== ========\n");
+ }
+#endif
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **)
+// Purpose: Set up the hash table ready for a scan
+// Created: 14/1/04
+//
+// --------------------------------------------------------------------------
+static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable)
+{
+ // Set all entries in the hash table to zero
+ ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024)));
+
+ // Scan through the blocks, building the hash table
+ for(int64_t b = 0; b < NumBlocks; ++b)
+ {
+ // Only look at the required block size
+ if(pIndex[b].mSize == BlockSize)
+ {
+ // Get the value under which to hash this entry
+ uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum);
+
+ // Already present in table?
+ if(pHashTable[hash] != 0)
+ {
+ //TRACE1("Another hash entry for %d found\n", hash);
+ // Yes -- need to set the pointer in this entry to the current entry to build the linked list
+ pIndex[b].mpNextInHashList = pHashTable[hash];
+ }
+
+ // Put a pointer to this entry in the hash table
+ pHashTable[hash] = pIndex + b;
+ }
+ }
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static bool SecondStageMatch(xxx)
+// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
+// Created: 14/1/04
+//
+// --------------------------------------------------------------------------
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+ int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
+{
+ // Check parameters
+ ASSERT(pBeginnings != 0);
+ ASSERT(pEndings != 0);
+ ASSERT(Offset >= 0);
+ ASSERT(BlockSize > 0);
+ ASSERT(pFirstInHashList != 0);
+ ASSERT(pIndex != 0);
+
+ // Calculate the strong MD5 digest for this block
+ MD5Digest strong;
+ // Add the data from the beginnings
+ strong.Add(pBeginnings + Offset, BlockSize - Offset);
+ // Add any data from the endings
+ if(Offset > 0)
+ {
+ strong.Add(pEndings, Offset);
+ }
+ strong.Finish();
+
+ // Then go through the entries in the hash list, comparing with the strong digest calculated
+ BlocksAvailableEntry *scan = pFirstInHashList;
+ //TRACE0("second stage match\n");
+ while(scan != 0)
+ {
+ //TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash);
+ ASSERT(scan->mSize == BlockSize);
+ ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == Hash);
+
+ // Compare?
+ if(strong.DigestMatches(scan->mStrongChecksum))
+ {
+ //TRACE0("Match!\n");
+ // Found! Add to list of found blocks...
+ int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
+ int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+
+ // Because we search for smallest blocks first, if there's an entry there
+ // we'll just be replacing it with a larger one, which is great news.
+ rFoundBlocks[fileOffset] = blockIndex;
+
+ // No point in searching further, report success
+ return true;
+ }
+
+ // Next
+ scan = scan->mpNextInHashList;
+ }
+
+ // Not matched
+ return false;
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &)
+// Purpose: Fills in the recipe from the found block list
+// Created: 15/1/04
+//
+// --------------------------------------------------------------------------
+static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex,
+ int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile)
+{
+ // NOTE: This function could be a lot more sophisiticated. For example, if
+ // a small block overlaps a big block like this
+ // ****
+ // *******************************
+ // then the small block will be used, not the big one. But it'd be better to
+ // just ignore the small block and keep the big one. However, some stats should
+ // be gathered about real world files before writing complex code which might
+ // go wrong.
+
+ // Initialise a blank instruction
+ BackupStoreFileEncodeStream::RecipeInstruction instruction;
+ #define RESET_INSTRUCTION \
+ instruction.mSpaceBefore = 0; \
+ instruction.mBlocks = 0; \
+ instruction.mpStartBlock = 0;
+ RESET_INSTRUCTION
+
+ // First, a special case for when there are no found blocks
+ if(rFoundBlocks.size() == 0)
+ {
+ // No blocks, just a load of space
+ instruction.mSpaceBefore = SizeOfInputFile;
+ rRecipe.push_back(instruction);
+
+ #ifndef NDEBUG
+ if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ {
+ TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
+ }
+ #endif
+
+ // Don't do anything
+ return;
+ }
+
+ // Current location
+ int64_t loc = 0;
+
+ // Then iterate through the list, generating the recipe
+ std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin());
+ ASSERT(i != rFoundBlocks.end()); // check logic
+
+ // Counting for debug tracing
+#ifndef NDEBUG
+ int64_t debug_NewBytesFound = 0;
+ int64_t debug_OldBlocksUsed = 0;
+#endif
+
+ for(; i != rFoundBlocks.end(); ++i)
+ {
+ // Remember... map is (position in file) -> (index of block in pIndex)
+
+ if(i->first < loc)
+ {
+ // This block overlaps the last one
+ continue;
+ }
+ else if(i->first > loc)
+ {
+ // There's a gap between the end of the last thing and this block.
+ // If there's an instruction waiting, push it onto the list
+ if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0)
+ {
+ rRecipe.push_back(instruction);
+ }
+ // Start a new instruction, with the gap ready
+ RESET_INSTRUCTION
+ instruction.mSpaceBefore = i->first - loc;
+ // Move location forward to match
+ loc += instruction.mSpaceBefore;
+#ifndef NDEBUG
+ debug_NewBytesFound += instruction.mSpaceBefore;
+#endif
+ }
+
+ // First, does the current instruction need pushing back, because this block is not
+ // sequential to the last one?
+ if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks))
+ {
+ rRecipe.push_back(instruction);
+ RESET_INSTRUCTION
+ }
+
+ // Add in this block
+ if(instruction.mpStartBlock == 0)
+ {
+ // This block starts a new instruction
+ instruction.mpStartBlock = pIndex + i->second;
+ instruction.mBlocks = 1;
+ }
+ else
+ {
+ // It continues the previous section of blocks
+ instruction.mBlocks += 1;
+ }
+
+#ifndef NDEBUG
+ debug_OldBlocksUsed++;
+#endif
+
+ // Move location forward
+ loc += pIndex[i->second].mSize;
+ }
+
+ // Push the last instruction generated
+ rRecipe.push_back(instruction);
+
+ // Is there any space left at the end which needs sending?
+ if(loc != SizeOfInputFile)
+ {
+ RESET_INSTRUCTION
+ instruction.mSpaceBefore = SizeOfInputFile - loc;
+#ifndef NDEBUG
+ debug_NewBytesFound += instruction.mSpaceBefore;
+#endif
+ rRecipe.push_back(instruction);
+ }
+
+ // dump out the recipe
+#ifndef NDEBUG
+ TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed);
+ if(BackupStoreFile::TraceDetailsOfDiffProcess)
+ {
+ TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
+ {
+ for(unsigned int e = 0; e < rRecipe.size(); ++e)
+ {
+ char b[64];
+ sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
+ TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
+ }
+ }
+ TRACE0("======== ========= ========\n");
+ }
+#endif
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static TimerSignalHandler(int)
+// Purpose: Signal handler
+// Created: 19/3/04
+//
+// --------------------------------------------------------------------------
+void TimerSignalHandler(int signal)
+{
+ sDiffTimedOut = true;
+}
+
+
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: static StartDiffTimer()
+// Purpose: Starts the diff timeout timer
+// Created: 19/3/04
+//
+// --------------------------------------------------------------------------
+void StartDiffTimer()
+{
+ // Set timer signal handler
+ if(!sSetTimerSignelHandler)
+ {
+ ::signal(SIGVTALRM, TimerSignalHandler);
+ sSetTimerSignelHandler = true;
+ }
+
+ struct itimerval timeout;
+ // Don't want this to repeat
+ timeout.it_interval.tv_sec = 0;
+ timeout.it_interval.tv_usec = 0;
+ // Single timeout after the specified number of seconds
+ timeout.it_value.tv_sec = sMaximumDiffTime;
+ timeout.it_value.tv_usec = 0;
+ // Set timer
+ if(::setitimer(ITIMER_VIRTUAL, &timeout, NULL) != 0)
+ {
+ TRACE0("WARNING: couldn't set diff timeout\n");
+ }
+
+ // Unset flag (last thing)
+ sDiffTimedOut = false;
+}
+
+
+