summaryrefslogtreecommitdiff
path: root/lib/backupclient
diff options
context:
space:
mode:
authorMartin Ebourne <martin@ebourne.me.uk>2005-12-07 16:24:49 +0000
committerMartin Ebourne <martin@ebourne.me.uk>2005-12-07 16:24:49 +0000
commit065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd (patch)
treeec09f9aff78d337b64ef4af7c7cba2d23a4ef0e9 /lib/backupclient
parent9950474440f85c5a35be8cfe7a85cee884209f20 (diff)
Merged chromi/diffopt at r116 to trunk
Diffstat (limited to 'lib/backupclient')
-rwxr-xr-xlib/backupclient/BackupStoreConstants.h4
-rwxr-xr-xlib/backupclient/BackupStoreFileDiff.cpp128
2 files changed, 105 insertions, 27 deletions
diff --git a/lib/backupclient/BackupStoreConstants.h b/lib/backupclient/BackupStoreConstants.h
index 8254e918..907072ea 100755
--- a/lib/backupclient/BackupStoreConstants.h
+++ b/lib/backupclient/BackupStoreConstants.h
@@ -28,10 +28,10 @@
#define BACKUP_FILE_AVOID_BLOCKS_LESS_THAN 128
// Maximum number of sizes to do an rsync-like scan for
-#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 8
+#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 64
// When doing rsync scans, do not scan for blocks smaller than
-#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 256
+#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 128
// A limit to stop diffing running out of control: If more than this
// times the number of blocks in the original index are found, stop
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
index b72ce328..22395151 100755
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
+BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// Avoid running on too long with diffs
-static int sMaximumDiffTime = 10; // maximum time to spend diffing
+static int sMaximumDiffTime = 600; // maximum time to spend diffing
static bool sDiffTimedOut = false;
static bool sSetTimerSignelHandler = false;
static void TimerSignalHandler(int signal);
@@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
// Find the position of the size in the array
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- if(i->second > sizeCounts[t])
+ // Instead of sorting on the raw count of blocks,
+ // take the file area covered by this block size.
+ if(i->second * i->first > sizeCounts[t] * Sizes[t])
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
@@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
{
+ std::map<int64_t, int32_t> goodnessOfFit;
+
// Allocate the hash lookup table
BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
@@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Check buffer allocation
if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
{
- // If a buffer got alloocated, it will be cleaned up in the catch block
+ // If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
@@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Then roll, until the file is exhausted
int64_t fileBlockNumber = 0;
+ int64_t fileOffset = 0;
int rollOverInitialBytes = 0;
while(true)
{
// Load in another block of data, and record how big it is
int bytesInEndings = rFile.Read(endings, Sizes[s]);
+ int tmp;
// Skip any bytes from a previous matched block
- while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ if(rollOverInitialBytes > 0 && offset < bytesInEndings)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --rollOverInitialBytes;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
+ }
+
+ if(goodnessOfFit.count(fileOffset))
+ {
+ tmp = goodnessOfFit[fileOffset];
+ }
+ else
+ {
+ tmp = 0;
+ }
+
+ if(tmp >= Sizes[s])
+ {
+ // Skip over bigger ready-matched blocks completely
+ rollOverInitialBytes = tmp;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
}
while(offset < bytesInEndings)
{
- // Put is current checksum in hash list?
+ // Is current checksum in hash list?
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
{
+ goodnessOfFit[fileOffset] = Sizes[s];
+
// Block matched, roll the checksum forward to the next block without doing
// any more comparisons, because these are pointless (as any more matches will be ignored when
// the receipe is generated) and just take up valuable processor time. Edge cases are
// especially nasty, using huge amounts of time and memory.
int skip = Sizes[s];
- while(offset < bytesInEndings && skip > 0)
+ if(offset < bytesInEndings && skip > 0)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --skip;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ skip -= thisRoll;
}
// Not all the bytes necessary will have been skipped, so get them
// skipped after the next block is loaded.
@@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
break;
}
- if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
- || sDiffTimedOut)
+ if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut)
{
abortSearch = true;
break;
@@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Roll checksum forward
rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- // Increment offset
+ // Increment offsets
++offset;
+ ++fileOffset;
}
if(abortSearch) break;
+ refresh:
// Finished?
if(bytesInEndings != Sizes[s])
{
@@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
// each byte of the file)
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ {
+ goodnessOfFit[fileOffset] = Sizes[s];
+ }
}
// finish
@@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
@@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
ASSERT(pFirstInHashList != 0);
ASSERT(pIndex != 0);
+ uint16_t Hash = fastSum.GetComponentForHashing();
+ uint32_t Checksum = fastSum.GetChecksum();
+
+ // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
+ BlocksAvailableEntry *scan = pFirstInHashList;
+ bool found=false;
+ while(scan != 0)
+ {
+ if(scan->mWeakChecksum == Checksum)
+ {
+ found = true;
+ break;
+ }
+ scan = scan->mpNextInHashList;
+ }
+ if(!found)
+ {
+ return false;
+ }
+
// Calculate the strong MD5 digest for this block
MD5Digest strong;
// Add the data from the beginnings
@@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
- BlocksAvailableEntry *scan = pFirstInHashList;
+ scan = pFirstInHashList;
//TRACE0("second stage match\n");
while(scan != 0)
{
@@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
//TRACE0("Match!\n");
// Found! Add to list of found blocks...
int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
- int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+ int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
- // Because we search for smallest blocks first, if there's an entry there
- // we'll just be replacing it with a larger one, which is great news.
+ // We do NOT search for smallest blocks first, as this code originally assumed.
+ // To prevent this from potentially overwriting a better match, the caller must determine
+ // the relative "goodness" of any existing match and this one, and avoid the call if it
+ // could be detrimental.
rFoundBlocks[fileOffset] = blockIndex;
// No point in searching further, report success