-rw-r--r--  docs/backup/encrypt_rsync.txt (renamed from docs/backup/encryt_rsync.txt) |   6
-rw-r--r--  docs/common/lib_crypto/RollingChecksum.txt                                |   4
-rwxr-xr-x  lib/backupclient/BackupStoreConstants.h                                   |   4
-rwxr-xr-x  lib/backupclient/BackupStoreFileDiff.cpp                                  | 128
-rwxr-xr-x  lib/crypto/RollingChecksum.cpp                                            |  28
-rwxr-xr-x  lib/crypto/RollingChecksum.h                                              |  25
-rwxr-xr-x  test/backupdiff/testbackupdiff.cpp                                        |  23
-rwxr-xr-x  test/crypto/testcrypto.cpp                                                |   6
8 files changed, 179 insertions(+), 45 deletions(-)
diff --git a/docs/backup/encryt_rsync.txt b/docs/backup/encrypt_rsync.txt
index 9e3427c1..a5db2df2 100644
--- a/docs/backup/encryt_rsync.txt
+++ b/docs/backup/encrypt_rsync.txt
@@ -19,7 +19,7 @@ Why not just encrypt the file, and use the standard rsync algorithm?
1) Compression cannot be used, since encryption turns the file into essentially random data. This is not very compressible.
-2) Any modification to the file will result in all data after that in the file having different ciphertext (in any cipher mode we might want to use). Therefore the rsync algorithm will only be able to detect "same" blocks up until the first modification. This significantly reduces the effectiveness of the process.
+2) Any modification to the file will result in all data after that in the file having different ciphertext (in any cipher mode we might want to use). Therefore the rsync algorithm will only be able to detect "same" blocks up until the first modification. This significantly reduces the effectiveness of the process.
Note that blocks are not all the same size. The last block in the file is unlikely to be a full block, and if data is inserted which is not an integral multiple of the block size, odd-sized blocks need to be created. This is because the server cannot reassemble the blocks: their contents are opaque to it.
@@ -30,9 +30,9 @@ To produce a list of the changes to send the new version, the client requests th
The client then decrypts the index, and builds a list of the 8 most used block sizes above a certain threshold size.
-The new version of the file is then scanned in exactly the same way as rsync for these 8 block sizes. If a block is found, then it is added to a list of found blocks, sorted by position in the file. If a block has already been found at that position, then the old entry is replaced by the new entry.
+The new version of the file is then scanned in exactly the same way as rsync for these 8 block sizes. If a block is found, then it is added to a list of found blocks, sorted by position in the file. If a block has already been found at that position, then the old entry is only replaced by the new entry if the new entry is a "better" (bigger) match.
-The smallest block size is searched first, so that larger blocks replace smaller blocks in the found list.
+The block size covering the biggest file area is searched first, so that most of the file can be skipped over after the first pass without expensive checksumming.
A "recipe" is then built from the found list, by trivially discarding overlapping blocks. Each entry consists of a number of bytes of "new" data, a block start number, and a number of blocks from the old file. The data is stored like this as a memory optimisation, assuming that files mostly stay the same rather than having all their blocks reordered.
diff --git a/docs/common/lib_crypto/RollingChecksum.txt b/docs/common/lib_crypto/RollingChecksum.txt
index cbee1454..d871b3f2 100644
--- a/docs/common/lib_crypto/RollingChecksum.txt
+++ b/docs/common/lib_crypto/RollingChecksum.txt
@@ -30,3 +30,7 @@ with size s, then it should be called with
and now GetChecksum will return the checksum of the block (pBlock+1) of size s.
+FUNCTION RollingChecksum::RollForwardSeveral()
+
+Similar to RollForward(), but more efficient when skipping several bytes at once. Takes pointers into the data buffers rather than individual byte values.
+
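For example, rolling forward n bytes in one call must give the same result as constructing a fresh checksum at the shifted position. A sketch, where buf, s and n are illustrative names and buf must hold at least s + n bytes:

    RollingChecksum sum(buf, s);
    sum.RollForwardSeveral(buf, buf + s, s, n);
    // sum.GetChecksum() now equals RollingChecksum(buf + n, s).GetChecksum()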
diff --git a/lib/backupclient/BackupStoreConstants.h b/lib/backupclient/BackupStoreConstants.h
index 8254e918..907072ea 100755
--- a/lib/backupclient/BackupStoreConstants.h
+++ b/lib/backupclient/BackupStoreConstants.h
@@ -28,10 +28,10 @@
#define BACKUP_FILE_AVOID_BLOCKS_LESS_THAN 128
// Maximum number of sizes to do an rsync-like scan for
-#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 8
+#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES 64
// When doing rsync scans, do not scan for blocks smaller than
-#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 256
+#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE 128
// A limit to stop diffing running out of control: If more than this
// times the number of blocks in the original index are found, stop
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
index b72ce328..22395151 100755
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
+BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// Avoid running on too long with diffs
-static int sMaximumDiffTime = 10; // maximum time to spend diffing
+static int sMaximumDiffTime = 600; // maximum time to spend diffing
static bool sDiffTimedOut = false;
static bool sSetTimerSignelHandler = false;
static void TimerSignalHandler(int signal);
@@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
// Find the position of the size in the array
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
- if(i->second > sizeCounts[t])
+ // Instead of sorting on the raw count of blocks,
+ // take the file area covered by this block size.
+ if(i->second * i->first > sizeCounts[t] * Sizes[t])
{
// Then this size belongs before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
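A worked example of the new ordering: a size with i->first = 512 and i->second = 700 covers 512 * 700 = 358400 bytes of the old file, so it now outranks 1000 blocks of 256 bytes (256000 bytes covered) even though its raw count is lower; the old comparison on i->second alone would have ranked them the other way round.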
@@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
{
+ std::map<int64_t, int32_t> goodnessOfFit;
+
// Allocate the hash lookup table
BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
@@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Check buffer allocation
if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
{
- // If a buffer got alloocated, it will be cleaned up in the catch block
+ // If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
@@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Then roll, until the file is exhausted
int64_t fileBlockNumber = 0;
+ int64_t fileOffset = 0;
int rollOverInitialBytes = 0;
while(true)
{
// Load in another block of data, and record how big it is
int bytesInEndings = rFile.Read(endings, Sizes[s]);
+ int tmp;
// Skip any bytes from a previous matched block
- while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+ if(rollOverInitialBytes > 0 && offset < bytesInEndings)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --rollOverInitialBytes;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
+ }
+
+ if(goodnessOfFit.count(fileOffset))
+ {
+ tmp = goodnessOfFit[fileOffset];
+ }
+ else
+ {
+ tmp = 0;
+ }
+
+ if(tmp >= Sizes[s])
+ {
+ // Skip over bigger ready-matched blocks completely
+ rollOverInitialBytes = tmp;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ rollOverInitialBytes -= thisRoll;
+
+ if(rollOverInitialBytes)
+ {
+ goto refresh;
+ }
}
while(offset < bytesInEndings)
{
- // Put is current checksum in hash list?
+ // Is current checksum in hash list?
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
{
+ goodnessOfFit[fileOffset] = Sizes[s];
+
// Block matched, roll the checksum forward to the next block without doing
// any more comparisons, because these are pointless (as any more matches will be ignored when
// the recipe is generated) and just take up valuable processor time. Edge cases are
// especially nasty, using huge amounts of time and memory.
int skip = Sizes[s];
- while(offset < bytesInEndings && skip > 0)
+ if(offset < bytesInEndings && skip > 0)
{
- rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- ++offset;
- --skip;
+ int spaceLeft = bytesInEndings - offset;
+ int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+
+ rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+ offset += thisRoll;
+ fileOffset += thisRoll;
+ skip -= thisRoll;
}
// Not all the bytes necessary will have been skipped, so get them
// skipped after the next block is loaded.
@@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
break;
}
- if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
- || sDiffTimedOut)
+ if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut)
{
abortSearch = true;
break;
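For orientation, the loop the hunks above modify works on a pair of buffers: beginnings holds the block the rolling window rolls out of, endings the block it rolls into, so a window at a given offset spans beginnings+offset through endings+offset. A stripped-down sketch of one pass for a single block size, reusing the function's locals (rFile, pbuffer0, pbuffer1, Sizes[s]) with the matching, skip and goodness-of-fit logic elided:

    uint8_t *beginnings = pbuffer0;
    uint8_t *endings = pbuffer1;
    rFile.Read(beginnings, Sizes[s]);           // prime the first window
    RollingChecksum rolling(beginnings, Sizes[s]);
    for(;;)
    {
        int bytesInEndings = rFile.Read(endings, Sizes[s]);
        for(int offset = 0; offset < bytesInEndings; ++offset)
        {
            // ...hash table lookup, SecondStageMatch, skip logic here...
            rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
        }
        if(bytesInEndings != Sizes[s]) break;   // short read: end of file
        // endings becomes the next pass's beginnings
        uint8_t *ptmp = beginnings; beginnings = endings; endings = ptmp;
    }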
@@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// Roll checksum forward
rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
- // Increment offset
+ // Increment offsets
++offset;
+ ++fileOffset;
}
if(abortSearch) break;
+ refresh:
// Finished?
if(bytesInEndings != Sizes[s])
{
@@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
// each byte of the file)
uint16_t hash = rolling.GetComponentForHashing();
- if(phashTable[hash] != 0)
+ if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
- SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+ if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+ {
+ goodnessOfFit[fileOffset] = Sizes[s];
+ }
}
// finish
@@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3
// Created: 14/1/04
//
// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
@@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
ASSERT(pFirstInHashList != 0);
ASSERT(pIndex != 0);
+ uint16_t Hash = fastSum.GetComponentForHashing();
+ uint32_t Checksum = fastSum.GetChecksum();
+
+ // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
+ BlocksAvailableEntry *scan = pFirstInHashList;
+ bool found=false;
+ while(scan != 0)
+ {
+ if(scan->mWeakChecksum == Checksum)
+ {
+ found = true;
+ break;
+ }
+ scan = scan->mpNextInHashList;
+ }
+ if(!found)
+ {
+ return false;
+ }
+
// Calculate the strong MD5 digest for this block
MD5Digest strong;
// Add the data from the beginnings
@@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
- BlocksAvailableEntry *scan = pFirstInHashList;
+ scan = pFirstInHashList;
//TRACE0("second stage match\n");
while(scan != 0)
{
@@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
//TRACE0("Match!\n");
// Found! Add to list of found blocks...
int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
- int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+	int64_t blockIndex = (scan - pIndex); // pointer arithmetic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
- // Because we search for smallest blocks first, if there's an entry there
- // we'll just be replacing it with a larger one, which is great news.
+ // We do NOT search for smallest blocks first, as this code originally assumed.
+ // To prevent this from potentially overwriting a better match, the caller must determine
+ // the relative "goodness" of any existing match and this one, and avoid the call if it
+ // could be detrimental.
rFoundBlocks[fileOffset] = blockIndex;
// No point in searching further, report success
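The reason this pre-check pays off: the hash table is keyed only on the 16-bit hashing component, so unrelated blocks can land in the same bucket while still differing in the full 32-bit weak checksum. Walking the bucket list costs one integer compare per entry, whereas the MD5 digests all BlockSize bytes. The same logic restated in isolation (a sketch, not a separate function in the real code):

    // Does any entry in this hash bucket match the full 32-bit weak
    // checksum? Only then is the MD5 digest worth computing.
    bool WeakMatchExists(BlocksAvailableEntry *pList, uint32_t WeakChecksum)
    {
        for(BlocksAvailableEntry *scan = pList; scan != 0; scan = scan->mpNextInHashList)
        {
            if(scan->mWeakChecksum == WeakChecksum)
            {
                return true;
            }
        }
        return false;   // bucket collision only -- skip the expensive MD5
    }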
diff --git a/lib/crypto/RollingChecksum.cpp b/lib/crypto/RollingChecksum.cpp
index 75bad7df..a2954e3a 100755
--- a/lib/crypto/RollingChecksum.cpp
+++ b/lib/crypto/RollingChecksum.cpp
@@ -20,11 +20,11 @@
// Created: 6/12/03
//
// --------------------------------------------------------------------------
-RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
+RollingChecksum::RollingChecksum(const void * const data, const unsigned int Length)
: a(0),
b(0)
{
- uint8_t *block = (uint8_t *)data;
+ const uint8_t *block = (const uint8_t *)data;
for(unsigned int x = Length; x >= 1; --x)
{
a += (*block);
@@ -34,5 +34,29 @@ RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
}
}
+// --------------------------------------------------------------------------
+//
+// Function
+// Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+//		Purpose: Move the checksum forward several bytes at once, given a pointer to the first byte of the current block,
+//		a pointer to the byte just past the end of that block, the length of the block, and the number of bytes to skip.
+// Created: 7/14/05
+//
+// --------------------------------------------------------------------------
+void RollingChecksum::RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip)
+{
+ // IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
+ unsigned int i;
+ uint16_t sumBegin=0, j,k;
+ for(i=0; i < Skip; i++)
+ {
+ j = StartOfThisBlock[i];
+ k = LastOfNextBlock[i];
+ sumBegin += j;
+ a += (k - j);
+ b += a;
+ }
+ b -= Length * sumBegin;
+}
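Why the deferred subtraction at the end is valid: a single-byte roll updates the components as

    a' = a - x[i] + x[i+Length]
    b' = b - Length*x[i] + a'

(exactly what the inline RollForward() does). Over Skip consecutive rolls the -Length*x[i] corrections never feed back into a, so they can be batched: the loop applies a += (k - j) and b += a while accumulating sumBegin = x[0] + ... + x[Skip-1], and the single line b -= Length * sumBegin settles all the deferred corrections at once, with every operation implicitly mod 2^16.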
diff --git a/lib/crypto/RollingChecksum.h b/lib/crypto/RollingChecksum.h
index 99d116b9..be79c36f 100755
--- a/lib/crypto/RollingChecksum.h
+++ b/lib/crypto/RollingChecksum.h
@@ -24,7 +24,7 @@
class RollingChecksum
{
public:
- RollingChecksum(const void *data, unsigned int Length);
+ RollingChecksum(const void * const data, const unsigned int Length);
// --------------------------------------------------------------------------
//
@@ -35,7 +35,7 @@ public:
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline void RollForward(uint8_t StartOfThisBlock, uint8_t LastOfNextBlock, unsigned int Length)
+ inline void RollForward(const uint8_t StartOfThisBlock, const uint8_t LastOfNextBlock, const unsigned int Length)
{
// IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
a -= StartOfThisBlock;
@@ -47,19 +47,30 @@ public:
// --------------------------------------------------------------------------
//
// Function
+ // Name: RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+	// Purpose: Move the checksum forward several bytes at once, given a pointer to the first byte of the current block,
+	// a pointer to the byte just past the end of that block, the length of the block, and the number of bytes to skip.
+ // Created: 7/14/05
+ //
+ // --------------------------------------------------------------------------
+ void RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip);
+
+ // --------------------------------------------------------------------------
+ //
+ // Function
// Name: RollingChecksum::GetChecksum()
// Purpose: Returns the checksum
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline uint32_t GetChecksum()
+ inline uint32_t GetChecksum() const
{
return ((uint32_t)a) | (((uint32_t)b) << 16);
}
// Components, just in case they're handy
- inline uint16_t GetComponent1() {return a;}
- inline uint16_t GetComponent2() {return b;}
+ inline uint16_t GetComponent1() const {return a;}
+ inline uint16_t GetComponent2() const {return b;}
// --------------------------------------------------------------------------
//
@@ -69,7 +80,7 @@ public:
// Created: 6/12/03
//
// --------------------------------------------------------------------------
- inline uint16_t GetComponentForHashing()
+ inline uint16_t GetComponentForHashing() const
{
return b;
}
@@ -82,7 +93,7 @@ public:
// Created: 14/1/04
//
// --------------------------------------------------------------------------
- static inline uint16_t ExtractHashingComponent(uint32_t Checksum)
+ static inline uint16_t ExtractHashingComponent(const uint32_t Checksum)
{
return Checksum >> 16;
}
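These accessors stay consistent with one another: GetChecksum() packs a into the low 16 bits and b into the high 16, and ExtractHashingComponent() shifts the high half back down, so ExtractHashingComponent(GetChecksum()) always equals GetComponentForHashing(). A one-line sanity check, where data and size are illustrative and ASSERT is the project's macro:

    RollingChecksum c(data, size);
    ASSERT(RollingChecksum::ExtractHashingComponent(c.GetChecksum()) == c.GetComponentForHashing());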
diff --git a/test/backupdiff/testbackupdiff.cpp b/test/backupdiff/testbackupdiff.cpp
index d086253e..5fb10038 100755
--- a/test/backupdiff/testbackupdiff.cpp
+++ b/test/backupdiff/testbackupdiff.cpp
@@ -64,14 +64,25 @@ bool files_identical(const char *file1, const char *file2)
return true;
}
-void make_file_of_zeros(const char *filename, int size)
+void make_file_of_zeros(const char *filename, size_t size)
{
- void *b = malloc(size);
- memset(b, 0, size);
+ static const size_t bs = 0x10000;
+ size_t remSize = size;
+ void *b = malloc(bs);
+ memset(b, 0, bs);
FILE *f = fopen(filename, "wb");
- fwrite(b, size, 1, f);
+
+ // Using largish blocks like this is much faster, while not consuming too much RAM
+ while(remSize > bs)
+ {
+ fwrite(b, bs, 1, f);
+ remSize -= bs;
+ }
+ fwrite(b, remSize, 1, f);
+
fclose(f);
free(b);
+
TEST_THAT(TestGetFileSize(filename) == size);
}
@@ -455,8 +466,8 @@ int test(int argc, const char *argv[])
// Found a nasty case where files of lots of the same thing soak up lots of processor
// time -- because of lots of matches found. Check this out!
- make_file_of_zeros("testfiles/zero.0", 20*1024);
- make_file_of_zeros("testfiles/zero.1", 200*1024);
+ make_file_of_zeros("testfiles/zero.0", 20*1024*1024);
+ make_file_of_zeros("testfiles/zero.1", 200*1024*1024);
// Generate a first encoded file
{
BackupStoreFilenameClear f0name("zero.0");
diff --git a/test/crypto/testcrypto.cpp b/test/crypto/testcrypto.cpp
index 2b055f91..983ae57e 100755
--- a/test/crypto/testcrypto.cpp
+++ b/test/crypto/testcrypto.cpp
@@ -270,6 +270,12 @@ int test(int argc, const char *argv[])
RAND_pseudo_bytes(checkdata, CHECKSUM_DATA_SIZE);
for(int size = CHECKSUM_BLOCK_SIZE_BASE; size <= CHECKSUM_BLOCK_SIZE_LAST; ++size)
{
+ // Test skip-roll code
+ RollingChecksum rollFast(checkdata, size);
+ rollFast.RollForwardSeveral(checkdata, checkdata+size, size, CHECKSUM_ROLLS/2);
+ RollingChecksum calc(checkdata + (CHECKSUM_ROLLS/2), size);
+ TEST_THAT(calc.GetChecksum() == rollFast.GetChecksum());
+
//printf("size = %d\n", size);
// Checksum to roll
RollingChecksum roll(checkdata, size);