summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorMartin Ebourne <martin@ebourne.me.uk>2005-12-07 16:24:49 +0000
committerMartin Ebourne <martin@ebourne.me.uk>2005-12-07 16:24:49 +0000
commit065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd (patch)
treeec09f9aff78d337b64ef4af7c7cba2d23a4ef0e9 /lib
parent9950474440f85c5a35be8cfe7a85cee884209f20 (diff)
Merged chromi/diffopt at r116 to trunk
Diffstat (limited to 'lib')
-rwxr-xr-xlib/backupclient/BackupStoreConstants.h4
-rwxr-xr-xlib/backupclient/BackupStoreFileDiff.cpp128
-rwxr-xr-xlib/crypto/RollingChecksum.cpp28
-rwxr-xr-xlib/crypto/RollingChecksum.h25
4 files changed, 149 insertions, 36 deletions
diff --git a/lib/backupclient/BackupStoreConstants.h b/lib/backupclient/BackupStoreConstants.h
index 8254e918..907072ea 100755
--- a/lib/backupclient/BackupStoreConstants.h
+++ b/lib/backupclient/BackupStoreConstants.h
@@ -28,10 +28,10 @@
#define BACKUP_FILE_AVOID_BLOCKS_LESS_THAN 128
// Maximum number of sizes to do an rsync-like scan for
-#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 8
+#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 64
// When doing rsync scans, do not scan for blocks smaller than
-#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 256
+#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 128
// A limit to stop diffing running out of control: If more than this
// times the number of blocks in the original index are found, stop
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
index b72ce328..22395151 100755
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
+BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// Avoid running on too long with diffs
-static int sMaximumDiffTime = 10; // maximum time to spend diffing
+static int sMaximumDiffTime = 600; // maximum time to spend diffing
static bool sDiffTimedOut = false;
static bool sSetTimerSignelHandler = false;
static void TimerSignalHandler(int signal);
@@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
// Find the position of the size in the array
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- if(i->second > sizeCounts[t])
+ // Instead of sorting on the raw count of blocks,
+ // take the file area covered by this block size.
+ if(i->second * i->first > sizeCounts[t] * Sizes[t])
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
@@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
{
+ std::map<int64_t, int32_t> goodnessOfFit;
+
// Allocate the hash lookup table
BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
@@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Check buffer allocation
if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
{
- // If a buffer got alloocated, it will be cleaned up in the catch block
+ // If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
@@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Then roll, until the file is exhausted
int64_t fileBlockNumber = 0;
+ int64_t fileOffset = 0;
int rollOverInitialBytes = 0;
while(true)
{
// Load in another block of data, and record how big it is
int bytesInEndings = rFile.Read(endings, Sizes[s]);
+ int tmp;
// Skip any bytes from a previous matched block
- while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ if(rollOverInitialBytes > 0 && offset < bytesInEndings)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --rollOverInitialBytes;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
+ }
+
+ if(goodnessOfFit.count(fileOffset))
+ {
+ tmp = goodnessOfFit[fileOffset];
+ }
+ else
+ {
+ tmp = 0;
+ }
+
+ if(tmp >= Sizes[s])
+ {
+ // Skip over bigger ready-matched blocks completely
+ rollOverInitialBytes = tmp;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
}
while(offset < bytesInEndings)
{
- // Put is current checksum in hash list?
+ // Is current checksum in hash list?
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
{
+ goodnessOfFit[fileOffset] = Sizes[s];
+
// Block matched, roll the checksum forward to the next block without doing
// any more comparisons, because these are pointless (as any more matches will be ignored when
// the receipe is generated) and just take up valuable processor time. Edge cases are
// especially nasty, using huge amounts of time and memory.
int skip = Sizes[s];
- while(offset < bytesInEndings && skip > 0)
+ if(offset < bytesInEndings && skip > 0)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --skip;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ skip -= thisRoll;
}
// Not all the bytes necessary will have been skipped, so get them
// skipped after the next block is loaded.
@@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
break;
}
- if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
- || sDiffTimedOut)
+ if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut)
{
abortSearch = true;
break;
@@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Roll checksum forward
rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- // Increment offset
+ // Increment offsets
++offset;
+ ++fileOffset;
}
if(abortSearch) break;
+ refresh:
// Finished?
if(bytesInEndings != Sizes[s])
{
@@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
// each byte of the file)
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ {
+ goodnessOfFit[fileOffset] = Sizes[s];
+ }
}
// finish
@@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
@@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
ASSERT(pFirstInHashList != 0);
ASSERT(pIndex != 0);
+ uint16_t Hash = fastSum.GetComponentForHashing();
+ uint32_t Checksum = fastSum.GetChecksum();
+
+ // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
+ BlocksAvailableEntry *scan = pFirstInHashList;
+ bool found=false;
+ while(scan != 0)
+ {
+ if(scan->mWeakChecksum == Checksum)
+ {
+ found = true;
+ break;
+ }
+ scan = scan->mpNextInHashList;
+ }
+ if(!found)
+ {
+ return false;
+ }
+
// Calculate the strong MD5 digest for this block
MD5Digest strong;
// Add the data from the beginnings
@@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
- BlocksAvailableEntry *scan = pFirstInHashList;
+ scan = pFirstInHashList;
//TRACE0("second stage match\n");
while(scan != 0)
{
@@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
//TRACE0("Match!\n");
// Found! Add to list of found blocks...
int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
- int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+ int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
- // Because we search for smallest blocks first, if there's an entry there
- // we'll just be replacing it with a larger one, which is great news.
+ // We do NOT search for smallest blocks first, as this code originally assumed.
+ // To prevent this from potentially overwriting a better match, the caller must determine
+ // the relative "goodness" of any existing match and this one, and avoid the call if it
+ // could be detrimental.
rFoundBlocks[fileOffset] = blockIndex;
// No point in searching further, report success
diff --git a/lib/crypto/RollingChecksum.cpp b/lib/crypto/RollingChecksum.cpp
index 75bad7df..a2954e3a 100755
--- a/lib/crypto/RollingChecksum.cpp
+++ b/lib/crypto/RollingChecksum.cpp
@@ -20,11 +20,11 @@
// Created: 6/12/03
//
// --------------------------------------------------------------------------
-RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
+RollingChecksum::RollingChecksum(const void * const data, const unsigned int Length)
: a(0),
b(0)
{
- uint8_t *block = (uint8_t *)data;
+ const uint8_t *block = (const uint8_t *)data;
for(unsigned int x = Length; x >= 1; --x)
{
a += (*block);
@@ -34,5 +34,29 @@ RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
}
}
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+// Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block,
+// and a pointer just after the last byte of the current block and the length of the block and of the skip.
+// Created: 7/14/05
+//
+// --------------------------------------------------------------------------
+void RollingChecksum::RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip)
+{
+ // IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
+ unsigned int i;
+ uint16_t sumBegin=0, j,k;
+ for(i=0; i < Skip; i++)
+ {
+ j = StartOfThisBlock[i];
+ k = LastOfNextBlock[i];
+ sumBegin += j;
+ a += (k - j);
+ b += a;
+ }
+ b -= Length * sumBegin;
+}
diff --git a/lib/crypto/RollingChecksum.h b/lib/crypto/RollingChecksum.h
index 99d116b9..be79c36f 100755
--- a/lib/crypto/RollingChecksum.h
+++ b/lib/crypto/RollingChecksum.h
@@ -24,7 +24,7 @@
class RollingChecksum
{
public:
- RollingChecksum(const void *data, unsigned int Length);
+ RollingChecksum(const void * const data, const unsigned int Length);
// --------------------------------------------------------------------------
//
@@ -35,7 +35,7 @@ public:
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline void RollForward(uint8_t StartOfThisBlock, uint8_t LastOfNextBlock, unsigned int Length)
+ inline void RollForward(const uint8_t StartOfThisBlock, const uint8_t LastOfNextBlock, const unsigned int Length)
{
// IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
a -= StartOfThisBlock;
@@ -47,19 +47,30 @@ public:
// --------------------------------------------------------------------------
//
// Function
+ // Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+ // Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block,
+ // and a pointer just after the last byte of the current block and the length of the block and of the skip.
+ // Created: 7/14/05
+ //
+ // --------------------------------------------------------------------------
+ void RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip);
+
+ // --------------------------------------------------------------------------
+ //
+ // Function
// Name: RollingChecksum::GetChecksum()
// Purpose: Returns the checksum
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline uint32_t GetChecksum()
+ inline uint32_t GetChecksum() const
{
return ((uint32_t)a) | (((uint32_t)b) << 16);
}
// Components, just in case they're handy
- inline uint16_t GetComponent1() {return a;}
- inline uint16_t GetComponent2() {return b;}
+ inline uint16_t GetComponent1() const {return a;}
+ inline uint16_t GetComponent2() const {return b;}
// --------------------------------------------------------------------------
//
@@ -69,7 +80,7 @@ public:
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline uint16_t GetComponentForHashing()
+ inline uint16_t GetComponentForHashing() const
{
return b;
}
@@ -82,7 +93,7 @@ public:
// Created: 14/1/04
//
// --------------------------------------------------------------------------
- static inline uint16_t ExtractHashingComponent(uint32_t Checksum)
+ static inline uint16_t ExtractHashingComponent(const uint32_t Checksum)
{
return Checksum >> 16;
}