summaryrefslogtreecommitdiff
path: root/lib/backupclient/BackupStoreFileDiff.cpp
diff options
context:
space:
mode:
authorReinhard Tartler <siretart@tauware.de>2017-06-14 19:53:34 -0400
committerReinhard Tartler <siretart@tauware.de>2017-06-14 19:55:14 -0400
commite0c122119afea4c951c0c57144d26a473118c254 (patch)
tree34a02a56f9b017201dfb721ef678c711351466d6 /lib/backupclient/BackupStoreFileDiff.cpp
parente0eb815b67734abd09ff41e2271630d4b2a6d760 (diff)
Fixup botched merge
Diffstat (limited to 'lib/backupclient/BackupStoreFileDiff.cpp')
-rw-r--r--lib/backupclient/BackupStoreFileDiff.cpp1046
1 files changed, 0 insertions, 1046 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
deleted file mode 100644
index 5705c3aa..00000000
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ /dev/null
@@ -1,1046 +0,0 @@
-// --------------------------------------------------------------------------
-//
-// File
-// Name: BackupStoreFileDiff.cpp
-// Purpose: Functions relating to diffing BackupStoreFiles
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-
-#include "Box.h"
-
-#include <string.h>
-
-#include <new>
-#include <map>
-
-#ifdef HAVE_TIME_H
- #include <time.h>
-#elif HAVE_SYS_TIME_H
- #include <sys/time.h>
-#endif
-
-#include "BackupStoreConstants.h"
-#include "BackupStoreException.h"
-#include "BackupStoreFile.h"
-#include "BackupStoreFileCryptVar.h"
-#include "BackupStoreFileEncodeStream.h"
-#include "BackupStoreFileWire.h"
-#include "BackupStoreObjectMagic.h"
-#include "CommonException.h"
-#include "FileStream.h"
-#include "MD5Digest.h"
-#include "RollingChecksum.h"
-#include "Timer.h"
-
-#include "MemLeakFindOn.h"
-
-#include <cstring>
-
-using namespace BackupStoreFileCryptVar;
-using namespace BackupStoreFileCreation;
-
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef BOX_RELEASE_BUILD
- bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile,
- std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
- DiffTimer *pDiffTimer);
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &)
-// Purpose: Move the file pointer in this stream to just before the block index.
-// Assumes that the stream is at the beginning, seekable, and
-// reading from the stream is OK.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream)
-{
- // Size of file
- int64_t fileSize = rStream.BytesLeftToRead();
-
- // Get header
- file_StreamFormat hdr;
-
- // Read the header
- if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite))
- {
- // Couldn't read header
- THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
- }
-
- // Check magic number
- if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1
-#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
- && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0
-#endif
- )
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
-
- // Work out where the index is
- int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
- int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader));
-
- // Sanity check
- if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat)))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
-
- // Seek to that position
- rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End);
-
- // Done. Stream now in right position (as long as the file is formatted correctly)
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *)
-// Purpose: Similar to EncodeFile, but takes the object ID of the file it's
-// diffing from, and the index of the blocks in a stream. It'll then
-// calculate which blocks can be reused from that old file.
-// The timeout is the timeout value for reading the diff block index.
-// If pIsCompletelyDifferent != 0, it will be set to true if the
-// the two files are completely different (do not share any block), false otherwise.
-//
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff
-(
- const char *Filename, int64_t ContainerID,
- const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID,
- IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer,
- int64_t *pModificationTime, bool *pIsCompletelyDifferent)
-{
- // Is it a symlink?
- {
- EMU_STRUCT_STAT st;
- if(EMU_LSTAT(Filename, &st) != 0)
- {
- THROW_EXCEPTION(CommonException, OSFileError)
- }
- if((st.st_mode & S_IFLNK) == S_IFLNK)
- {
- // Don't do diffs for symlinks
- if(pIsCompletelyDifferent != 0)
- {
- *pIsCompletelyDifferent = true;
- }
- return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
- }
- }
-
- // Load in the blocks
- BlocksAvailableEntry *pindex = 0;
- int64_t blocksInIndex = 0;
- bool canDiffFromThis = false;
- LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis);
- // BOX_TRACE("Diff: Blocks in index: " << blocksInIndex);
-
- if(!canDiffFromThis)
- {
- // Don't do diffing...
- if(pIsCompletelyDifferent != 0)
- {
- *pIsCompletelyDifferent = true;
- }
- return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
- }
-
- // Pointer to recipe we're going to create
- BackupStoreFileEncodeStream::Recipe *precipe = 0;
-
- try
- {
- // Find which sizes should be scanned
- int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
- FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-
- // Flag for reporting to the user
- bool completelyDifferent;
-
- // BLOCK
- {
- // Search the file to find matching blocks
- std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index
- int64_t sizeOfInputFile = 0;
- // BLOCK
- {
- FileStream file(Filename);
- // Get size of file
- sizeOfInputFile = file.BytesLeftToRead();
- // Find all those lovely matching blocks
- SearchForMatchingBlocks(file, foundBlocks, pindex,
- blocksInIndex, sizesToScan, pDiffTimer);
-
- // Is it completely different?
- completelyDifferent = (foundBlocks.size() == 0);
- }
-
- // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe.
- precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID));
- BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions
- pindex = 0; // Recipe now has ownership
-
- // Fill it in
- GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile);
- }
- // foundBlocks no longer required
-
- // Create the stream
- std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream);
-
- // Do the initial setup
- ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime);
- precipe = 0; // Stream has taken ownership of this
-
- // Tell user about completely different status?
- if(pIsCompletelyDifferent != 0)
- {
- *pIsCompletelyDifferent = completelyDifferent;
- }
-
- // Return the stream for the caller
- return stream;
- }
- catch(...)
- {
- // cleanup
- if(pindex != 0)
- {
- ::free(pindex);
- pindex = 0;
- }
- if(precipe != 0)
- {
- delete precipe;
- precipe = 0;
- }
- throw;
- }
-}
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &)
-// Purpose: Read in an index, and decrypt, and store in the in memory block format.
-// rCanDiffFromThis is set to false if the version of the from file is too old.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis)
-{
- // Reset
- rNumBlocksOut = 0;
- rCanDiffFromThis = false;
-
- // Read header
- file_BlockIndexHeader hdr;
- if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout))
- {
- // Couldn't read header
- THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
- }
-
-#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
- // Check against backwards comptaibility stuff
- if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0))
- {
- // Won't diff against old version
-
- // Absorb rest of stream
- char buffer[2048];
- while(rBlockIndex.StreamDataLeft())
- {
- rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */);
- }
-
- // Tell caller
- rCanDiffFromThis = false;
- return;
- }
-#endif
-
- // Check magic
- if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
-
- // Check that we're not trying to diff against a file which references blocks from another file
- if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0)
- {
- THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
- }
-
- // Mark as an acceptable diff.
- rCanDiffFromThis = true;
-
- // Get basic information
- int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
- uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase);
-
- //TODO: Verify that these sizes look reasonable
-
- // Allocate space for the index
- BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks);
- if(pindex == 0)
- {
- throw std::bad_alloc();
- }
-
- try
- {
- for(int64_t b = 0; b < numBlocks; ++b)
- {
- // Read an entry from the stream
- file_BlockIndexEntry entry;
- if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout))
- {
- // Couldn't read entry
- THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
- }
-
- // Calculate IV for this entry
- uint64_t iv = entryIVBase;
- iv += b;
- // Network byte order
- iv = box_hton64(iv);
- sBlowfishDecryptBlockEntry.SetIV(&iv);
-
- // Decrypt the encrypted section
- file_BlockIndexEntryEnc entryEnc;
- int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc),
- entry.mEnEnc, sizeof(entry.mEnEnc));
- if(sectionSize != sizeof(entryEnc))
- {
- THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength)
- }
-
- // Check that we're not trying to diff against a file which references blocks from another file
- if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0)
- {
- THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
- }
-
- // Store all the required information
- pindex[b].mpNextInHashList = 0; // hash list not set up yet
- pindex[b].mSize = ntohl(entryEnc.mSize);
- pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum);
- ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum));
- }
-
- // Store index pointer for called
- ASSERT(ppIndex != 0);
- *ppIndex = pindex;
-
- // Store number of blocks for caller
- rNumBlocksOut = numBlocks;
-
- }
- catch(...)
- {
- // clean up and send the exception along its way
- ::free(pindex);
- throw;
- }
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Finds the most commonly used block sizes in the index
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-{
- // Array for lengths
- int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
-
- // Set arrays to lots of zeros (= unused entries)
- for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
- {
- Sizes[l] = 0;
- sizeCounts[l] = 0;
- }
-
- // Array for collecting sizes
- std::map<int32_t, int64_t> foundSizes;
-
- // Run through blocks and make a count of the entries
- for(int64_t b = 0; b < NumBlocks; ++b)
- {
- // Only if the block size is bigger than the minimum size we'll scan for
- if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
- {
- // Find entry?
- std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
- if(f != foundSizes.end())
- {
- // Increment existing entry
- foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
- }
- else
- {
- // New entry
- foundSizes[pIndex[b].mSize] = 1;
- }
- }
- }
-
- // Make the block sizes
- for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
- {
- // Find the position of the size in the array
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
- {
- // Instead of sorting on the raw count of blocks,
- // take the file area covered by this block size.
- if(i->second * i->first > sizeCounts[t] * Sizes[t])
- {
- // Then this size belong before this entry -- shuffle them up
- for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
- {
- Sizes[s] = Sizes[s-1];
- sizeCounts[s] = sizeCounts[s-1];
- }
-
- // Insert this size
- Sizes[t] = i->first;
- sizeCounts[t] = i->second;
-
- // Shouldn't do any more searching
- break;
- }
- }
- }
-
- // trace the size table in debug builds
-#ifndef BOX_RELEASE_BUILD
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
- {
- BOX_TRACE("Diff block size " << t << ": " <<
- Sizes[t] << " (count = " <<
- sizeCounts[t] << ")");
- }
- }
-#endif
-}
-
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-// Purpose: Find the matching blocks within the file.
-// Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
- BlocksAvailableEntry *pIndex, int64_t NumBlocks,
- int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
- Timer maximumDiffingTime(0, "MaximumDiffingTime");
-
- if(pDiffTimer && pDiffTimer->IsManaged())
- {
- maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
- "MaximumDiffingTime");
- }
-
- std::map<int64_t, int32_t> goodnessOfFit;
-
- // Allocate the hash lookup table
- BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
-
- // Choose a size for the buffer, just a little bit more than the maximum block size
- int32_t bufSize = Sizes[0];
- for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
- {
- if(Sizes[z] > bufSize) bufSize = Sizes[z];
- }
- bufSize += 4;
- ASSERT(bufSize > Sizes[0]);
- ASSERT(bufSize > 0);
- if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
- {
- THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
- }
-
- // TODO: Because we read in the file a scanned block size at a time,
- // it is likely to be inefficient. Probably will be much better to
- // calculate checksums for all block sizes in a single pass.
-
- // Allocate the buffers.
- uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
- uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
- try
- {
- // Check buffer allocation
- if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
- {
- // If a buffer got allocated, it will be cleaned up in the catch block
- throw std::bad_alloc();
- }
-
- // Flag to abort the run, if too many blocks are found -- avoid using
- // huge amounts of processor time when files contain many similar blocks.
- bool abortSearch = false;
-
- // Search for each block size in turn
- // NOTE: Do the smallest size first, so that the scheme for adding
- // entries in the found list works as expected and replaces smallers block
- // with larger blocks when it finds matches at the same offset in the file.
- for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
- {
- ASSERT(Sizes[s] <= bufSize);
- BOX_TRACE("Diff pass " << s << ", for block size " <<
- Sizes[s]);
-
- // Check we haven't finished
- if(Sizes[s] == 0)
- {
- // empty entry, try next size
- continue;
- }
-
- // Set up the hash table entries
- SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
-
- // Shift file position to beginning
- rFile.Seek(0, IOStream::SeekType_Absolute);
-
- // Read first block
- if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
- {
- // Size of file too short to match -- do next size
- continue;
- }
-
- // Setup block pointers
- uint8_t *beginnings = pbuffer0;
- uint8_t *endings = pbuffer1;
- int offset = 0;
-
- // Calculate the first checksum, ready for rolling
- RollingChecksum rolling(beginnings, Sizes[s]);
-
- // Then roll, until the file is exhausted
- int64_t fileBlockNumber = 0;
- int64_t fileOffset = 0;
- int rollOverInitialBytes = 0;
- while(true)
- {
- if(maximumDiffingTime.HasExpired())
- {
- ASSERT(pDiffTimer != NULL);
- BOX_INFO("MaximumDiffingTime reached - "
- "suspending file diff");
- abortSearch = true;
- break;
- }
-
- if(pDiffTimer)
- {
- pDiffTimer->DoKeepAlive();
- }
-
- // Load in another block of data, and record how big it is
- int bytesInEndings = rFile.Read(endings, Sizes[s]);
- int tmp;
-
- // Skip any bytes from a previous matched block
- if(rollOverInitialBytes > 0 && offset < bytesInEndings)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
- {
- goto refresh;
- }
- }
-
- if(goodnessOfFit.count(fileOffset))
- {
- tmp = goodnessOfFit[fileOffset];
- }
- else
- {
- tmp = 0;
- }
-
- if(tmp >= Sizes[s])
- {
- // Skip over bigger ready-matched blocks completely
- rollOverInitialBytes = tmp;
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- rollOverInitialBytes -= thisRoll;
-
- if(rollOverInitialBytes)
- {
- goto refresh;
- }
- }
-
- while(offset < bytesInEndings)
- {
- // Is current checksum in hash list?
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
- {
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
- {
- BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- goodnessOfFit[fileOffset] = Sizes[s];
-
- // Block matched, roll the checksum forward to the next block without doing
- // any more comparisons, because these are pointless (as any more matches will be ignored when
- // the recipe is generated) and just take up valuable processor time. Edge cases are
- // especially nasty, using huge amounts of time and memory.
- int skip = Sizes[s];
- if(offset < bytesInEndings && skip > 0)
- {
- int spaceLeft = bytesInEndings - offset;
- int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
-
- rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
- offset += thisRoll;
- fileOffset += thisRoll;
- skip -= thisRoll;
- }
- // Not all the bytes necessary will have been skipped, so get them
- // skipped after the next block is loaded.
- rollOverInitialBytes = skip;
-
- // End this loop, so the final byte isn't used again
- break;
- }
- else
- {
- BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
- }
-
- int64_t NumBlocksFound = static_cast<int64_t>(
- rFoundBlocks.size());
- int64_t MaxBlocksFound = NumBlocks *
- BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-
- if(NumBlocksFound > MaxBlocksFound)
- {
- abortSearch = true;
- break;
- }
- }
-
- // Roll checksum forward
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-
- // Increment offsets
- ++offset;
- ++fileOffset;
- }
-
- if(abortSearch) break;
-
- refresh:
- // Finished?
- if(bytesInEndings != Sizes[s])
- {
- // No more data in file -- check the final block
- // (Do a copy and paste of 5 lines of code instead of introducing a comparison for
- // each byte of the file)
- uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
- {
- if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
- {
- goodnessOfFit[fileOffset] = Sizes[s];
- }
- }
-
- // finish
- break;
- }
-
- // Switch buffers, reset offset
- beginnings = endings;
- endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
- offset = 0;
-
- // And count the blocks which have been done
- ++fileBlockNumber;
- }
-
- if(abortSearch) break;
- }
-
- // Free buffers and hash table
- ::free(pbuffer1);
- pbuffer1 = 0;
- ::free(pbuffer0);
- pbuffer0 = 0;
- ::free(phashTable);
- phashTable = 0;
- }
- catch(...)
- {
- // Cleanup and throw
- if(pbuffer1 != 0) ::free(pbuffer1);
- if(pbuffer0 != 0) ::free(pbuffer0);
- if(phashTable != 0) ::free(phashTable);
- throw;
- }
-
-#ifndef BOX_RELEASE_BUILD
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- // Trace out the found blocks in debug mode
- BOX_TRACE("Diff: list of found blocks");
- BOX_TRACE("======== ======== ======== ========");
- BOX_TRACE(" Offset BlkIdx Size Movement");
- for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
- {
- int64_t orgLoc = 0;
- for(int64_t b = 0; b < i->second; ++b)
- {
- orgLoc += pIndex[b].mSize;
- }
- BOX_TRACE(std::setw(8) << i->first << " " <<
- std::setw(8) << i->second << " " <<
- std::setw(8) << pIndex[i->second].mSize <<
- " " <<
- std::setw(8) << (i->first - orgLoc));
- }
- BOX_TRACE("======== ======== ======== ========");
- }
-#endif
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **)
-// Purpose: Set up the hash table ready for a scan
-// Created: 14/1/04
-//
-// --------------------------------------------------------------------------
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable)
-{
- // Set all entries in the hash table to zero
- ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024)));
-
- // Scan through the blocks, building the hash table
- for(int64_t b = 0; b < NumBlocks; ++b)
- {
- // Only look at the required block size
- if(pIndex[b].mSize == BlockSize)
- {
- // Get the value under which to hash this entry
- uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum);
-
- // Already present in table?
- if(pHashTable[hash] != 0)
- {
- //BOX_TRACE("Another hash entry for " << hash << " found");
- // Yes -- need to set the pointer in this entry to the current entry to build the linked list
- pIndex[b].mpNextInHashList = pHashTable[hash];
- }
-
- // Put a pointer to this entry in the hash table
- pHashTable[hash] = pIndex + b;
- }
- }
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static bool SecondStageMatch(xxx)
-// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
-// Created: 14/1/04
-//
-// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
- int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
-{
- // Check parameters
- ASSERT(pBeginnings != 0);
- ASSERT(pEndings != 0);
- ASSERT(Offset >= 0);
- ASSERT(BlockSize > 0);
- ASSERT(pFirstInHashList != 0);
- ASSERT(pIndex != 0);
-
-#ifndef BOX_RELEASE_BUILD
- uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
-#endif
- uint32_t Checksum = fastSum.GetChecksum();
-
- // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
- BlocksAvailableEntry *scan = pFirstInHashList;
- bool found=false;
- while(scan != 0)
- {
- if(scan->mWeakChecksum == Checksum)
- {
- found = true;
- break;
- }
- scan = scan->mpNextInHashList;
- }
- if(!found)
- {
- return false;
- }
-
- // Calculate the strong MD5 digest for this block
- MD5Digest strong;
- // Add the data from the beginnings
- strong.Add(pBeginnings + Offset, BlockSize - Offset);
- // Add any data from the endings
- if(Offset > 0)
- {
- strong.Add(pEndings, Offset);
- }
- strong.Finish();
-
- // Then go through the entries in the hash list, comparing with the strong digest calculated
- scan = pFirstInHashList;
- //BOX_TRACE("second stage match");
- while(scan != 0)
- {
- //BOX_TRACE("scan size " << scan->mSize <<
- // ", block size " << BlockSize <<
- // ", hash " << Hash);
- ASSERT(scan->mSize == BlockSize);
- ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-
- // Compare?
- if(strong.DigestMatches(scan->mStrongChecksum))
- {
- //BOX_TRACE("Match!\n");
- // Found! Add to list of found blocks...
- int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
- int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
-
- // We do NOT search for smallest blocks first, as this code originally assumed.
- // To prevent this from potentially overwriting a better match, the caller must determine
- // the relative "goodness" of any existing match and this one, and avoid the call if it
- // could be detrimental.
- rFoundBlocks[fileOffset] = blockIndex;
-
- // No point in searching further, report success
- return true;
- }
-
- // Next
- scan = scan->mpNextInHashList;
- }
-
- // Not matched
- return false;
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &)
-// Purpose: Fills in the recipe from the found block list
-// Created: 15/1/04
-//
-// --------------------------------------------------------------------------
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex,
- int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile)
-{
- // NOTE: This function could be a lot more sophisiticated. For example, if
- // a small block overlaps a big block like this
- // ****
- // *******************************
- // then the small block will be used, not the big one. But it'd be better to
- // just ignore the small block and keep the big one. However, some stats should
- // be gathered about real world files before writing complex code which might
- // go wrong.
-
- // Initialise a blank instruction
- BackupStoreFileEncodeStream::RecipeInstruction instruction;
- #define RESET_INSTRUCTION \
- instruction.mSpaceBefore = 0; \
- instruction.mBlocks = 0; \
- instruction.mpStartBlock = 0;
- RESET_INSTRUCTION
-
- // First, a special case for when there are no found blocks
- if(rFoundBlocks.size() == 0)
- {
- // No blocks, just a load of space
- instruction.mSpaceBefore = SizeOfInputFile;
- rRecipe.push_back(instruction);
-
- #ifndef BOX_RELEASE_BUILD
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- BOX_TRACE("Diff: Default recipe generated, " <<
- SizeOfInputFile << " bytes of file");
- }
- #endif
-
- // Don't do anything
- return;
- }
-
- // Current location
- int64_t loc = 0;
-
- // Then iterate through the list, generating the recipe
- std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin());
- ASSERT(i != rFoundBlocks.end()); // check logic
-
- // Counting for debug tracing
-#ifndef BOX_RELEASE_BUILD
- int64_t debug_NewBytesFound = 0;
- int64_t debug_OldBlocksUsed = 0;
-#endif
-
- for(; i != rFoundBlocks.end(); ++i)
- {
- // Remember... map is (position in file) -> (index of block in pIndex)
-
- if(i->first < loc)
- {
- // This block overlaps the last one
- continue;
- }
- else if(i->first > loc)
- {
- // There's a gap between the end of the last thing and this block.
- // If there's an instruction waiting, push it onto the list
- if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0)
- {
- rRecipe.push_back(instruction);
- }
- // Start a new instruction, with the gap ready
- RESET_INSTRUCTION
- instruction.mSpaceBefore = i->first - loc;
- // Move location forward to match
- loc += instruction.mSpaceBefore;
-#ifndef BOX_RELEASE_BUILD
- debug_NewBytesFound += instruction.mSpaceBefore;
-#endif
- }
-
- // First, does the current instruction need pushing back, because this block is not
- // sequential to the last one?
- if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks))
- {
- rRecipe.push_back(instruction);
- RESET_INSTRUCTION
- }
-
- // Add in this block
- if(instruction.mpStartBlock == 0)
- {
- // This block starts a new instruction
- instruction.mpStartBlock = pIndex + i->second;
- instruction.mBlocks = 1;
- }
- else
- {
- // It continues the previous section of blocks
- instruction.mBlocks += 1;
- }
-
-#ifndef BOX_RELEASE_BUILD
- debug_OldBlocksUsed++;
-#endif
-
- // Move location forward
- loc += pIndex[i->second].mSize;
- }
-
- // Push the last instruction generated
- rRecipe.push_back(instruction);
-
- // Is there any space left at the end which needs sending?
- if(loc != SizeOfInputFile)
- {
- RESET_INSTRUCTION
- instruction.mSpaceBefore = SizeOfInputFile - loc;
-#ifndef BOX_RELEASE_BUILD
- debug_NewBytesFound += instruction.mSpaceBefore;
-#endif
- rRecipe.push_back(instruction);
- }
-
- // dump out the recipe
-#ifndef BOX_RELEASE_BUILD
- BOX_TRACE("Diff: " <<
- debug_NewBytesFound << " new bytes found, " <<
- debug_OldBlocksUsed << " old blocks used");
- if(BackupStoreFile::TraceDetailsOfDiffProcess)
- {
- BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size());
- BOX_TRACE("======== ========= ========");
- BOX_TRACE("Space b4 FirstBlk NumBlks");
- {
- for(unsigned int e = 0; e < rRecipe.size(); ++e)
- {
- char b[64];
-#ifdef WIN32
- sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
-#else
- sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
-#endif
- BOX_TRACE(std::setw(8) <<
- rRecipe[e].mSpaceBefore <<
- " " <<
- ((rRecipe[e].mpStartBlock == 0)?" -":b) <<
- " " << std::setw(8) <<
- rRecipe[e].mBlocks);
- }
- }
- BOX_TRACE("======== ========= ========");
- }
-#endif
-}