From 065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd Mon Sep 17 00:00:00 2001
From: Martin Ebourne <martin@ebourne.me.uk>
Date: Wed, 7 Dec 2005 16:24:49 +0000
Subject: Merged chromi/diffopt at r116 to trunk

---
 lib/backupclient/BackupStoreConstants.h  |   4 +-
 lib/backupclient/BackupStoreFileDiff.cpp | 128 +++++++++++++++++++++++++------
 lib/crypto/RollingChecksum.cpp           |  28 ++++++-
 lib/crypto/RollingChecksum.h             |  25 ++++--
 4 files changed, 149 insertions(+), 36 deletions(-)

(limited to 'lib')

diff --git a/lib/backupclient/BackupStoreConstants.h b/lib/backupclient/BackupStoreConstants.h
index 8254e918..907072ea 100755
--- a/lib/backupclient/BackupStoreConstants.h
+++ b/lib/backupclient/BackupStoreConstants.h
@@ -28,10 +28,10 @@
 #define	BACKUP_FILE_AVOID_BLOCKS_LESS_THAN		128
 
 // Maximum number of sizes to do an rsync-like scan for
-#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES		8
+#define BACKUP_FILE_DIFF_MAX_BLOCK_SIZES		64
 
 // When doing rsync scans, do not scan for blocks smaller than
-#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE			256
+#define BACKUP_FILE_DIFF_MIN_BLOCK_SIZE			128
 
 // A limit to stop diffing running out of control: If more than this
 // times the number of blocks in the original index are found, stop
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
index b72ce328..22395151 100755
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr
 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
 static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
+BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
 static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
 
 // Avoid running on too long with diffs
-static int sMaximumDiffTime = 10;		// maximum time to spend diffing
+static int sMaximumDiffTime = 600;		// maximum time to spend diffing
 static bool sDiffTimedOut = false;
 static bool sSetTimerSignelHandler = false;
 static void TimerSignalHandler(int signal);
@@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
 		// Find the position of the size in the array
 		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
 		{
-			if(i->second > sizeCounts[t])
+			// Instead of sorting on the raw count of blocks,
+			// take the file area covered by this block size.
+			if(i->second * i->first > sizeCounts[t] * Sizes[t])
 			{
 				// Then this size belong before this entry -- shuffle them up
 				for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
@@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
 	BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
 {
+	std::map<int64_t, int32_t> goodnessOfFit;
+
 	// Allocate the hash lookup table
 	BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
 
@@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 		// Check buffer allocation
 		if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
 		{
-			// If a buffer got alloocated, it will be cleaned up in the catch block
+			// If a buffer got allocated, it will be cleaned up in the catch block
 			throw std::bad_alloc();
 		}
 		
@@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 			
 			// Then roll, until the file is exhausted
 			int64_t fileBlockNumber = 0;
+			int64_t fileOffset = 0;
 			int rollOverInitialBytes = 0;
 			while(true)
 			{
 				// Load in another block of data, and record how big it is
 				int bytesInEndings = rFile.Read(endings, Sizes[s]);
+				int tmp;
 
 				// Skip any bytes from a previous matched block
-				while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+				if(rollOverInitialBytes > 0 && offset < bytesInEndings)
 				{
-					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-					++offset;
-					--rollOverInitialBytes;
+					int spaceLeft = bytesInEndings - offset;
+					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+					offset += thisRoll;
+					fileOffset += thisRoll;
+					rollOverInitialBytes -= thisRoll;
+
+					if(rollOverInitialBytes)
+					{
+						goto refresh;
+					}
+				}
+
+				if(goodnessOfFit.count(fileOffset))
+				{
+					tmp = goodnessOfFit[fileOffset];
+				}
+				else
+				{
+					tmp = 0;
+				}
+
+				if(tmp >= Sizes[s])
+				{
+					// Skip over bigger ready-matched blocks completely
+					rollOverInitialBytes = tmp;
+					int spaceLeft = bytesInEndings - offset;
+					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+					offset += thisRoll;
+					fileOffset += thisRoll;
+					rollOverInitialBytes -= thisRoll;
+
+					if(rollOverInitialBytes)
+					{
+						goto refresh;
+					}
 				}
 
 				while(offset < bytesInEndings)
 				{
-					// Put is current checksum in hash list?
+					// Is current checksum in hash list?
 					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0)
+					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
 					{
-						if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
 						{
+							goodnessOfFit[fileOffset] = Sizes[s];
+
 							// Block matched, roll the checksum forward to the next block without doing
 							// any more comparisons, because these are pointless (as any more matches will be ignored when
 							// the receipe is generated) and just take up valuable processor time. Edge cases are
 							// especially nasty, using huge amounts of time and memory.
 							int skip = Sizes[s];
-							while(offset < bytesInEndings && skip > 0)
+							if(offset < bytesInEndings && skip > 0)
 							{
-								rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-								++offset;
-								--skip;
+								int spaceLeft = bytesInEndings - offset;
+								int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+
+								rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+								offset += thisRoll;
+								fileOffset += thisRoll;
+								skip -= thisRoll;
 							}
 							// Not all the bytes necessary will have been skipped, so get them
 							// skipped after the next block is loaded.
@@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 							break;
 						}
 						
-						if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
-							|| sDiffTimedOut)
+						if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut)
 						{
 							abortSearch = true;
 							break;
@@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 					// Roll checksum forward
 					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
 				
-					// Increment offset
+					// Increment offsets
 					++offset;
+					++fileOffset;
 				}
 				
 				if(abortSearch) break;
 				
+			refresh:
 				// Finished?
 				if(bytesInEndings != Sizes[s])
 				{
@@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 					// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
 					// each byte of the file)
 					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0)
+					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
 					{
-						SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+						{
+							goodnessOfFit[fileOffset] = Sizes[s];
+						}
 					}
 
 					// finish
@@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3
 //		Created: 14/1/04
 //
 // --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
 	int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
 {
 	// Check parameters
@@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 	ASSERT(pFirstInHashList != 0);
 	ASSERT(pIndex != 0);
 
+	uint16_t Hash = fastSum.GetComponentForHashing();
+	uint32_t Checksum = fastSum.GetChecksum();
+
+	// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
+	BlocksAvailableEntry *scan = pFirstInHashList;
+	bool found=false;
+	while(scan != 0)
+	{
+		if(scan->mWeakChecksum == Checksum)
+		{
+			found = true;
+			break;
+		}
+		scan = scan->mpNextInHashList;
+	}
+	if(!found)
+	{
+		return false;
+	}
+
 	// Calculate the strong MD5 digest for this block
 	MD5Digest strong;
 	// Add the data from the beginnings
@@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 	strong.Finish();
 	
 	// Then go through the entries in the hash list, comparing with the strong digest calculated
-	BlocksAvailableEntry *scan = pFirstInHashList;
+	scan = pFirstInHashList;
 	//TRACE0("second stage match\n");
 	while(scan != 0)
 	{
@@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 			//TRACE0("Match!\n");
 			// Found! Add to list of found blocks...
 			int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
-			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
 			
-			// Because we search for smallest blocks first, if there's an entry there
-			// we'll just be replacing it with a larger one, which is great news.
+			// We do NOT search for smallest blocks first, as this code originally assumed.
+			// To prevent this from potentially overwriting a better match, the caller must determine
+			// the relative "goodness" of any existing match and this one, and avoid the call if it
+			// could be detrimental.
 			rFoundBlocks[fileOffset] = blockIndex;
 			
 			// No point in searching further, report success
diff --git a/lib/crypto/RollingChecksum.cpp b/lib/crypto/RollingChecksum.cpp
index 75bad7df..a2954e3a 100755
--- a/lib/crypto/RollingChecksum.cpp
+++ b/lib/crypto/RollingChecksum.cpp
@@ -20,11 +20,11 @@
 //		Created: 6/12/03
 //
 // --------------------------------------------------------------------------
-RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
+RollingChecksum::RollingChecksum(const void * const data, const unsigned int Length)
 	: a(0),
 	  b(0)
 {
-	uint8_t *block = (uint8_t *)data;
+	const uint8_t *block = (const uint8_t *)data;
 	for(unsigned int x = Length; x >= 1; --x)
 	{
 		a += (*block);
@@ -34,5 +34,29 @@ RollingChecksum::RollingChecksum(const void *data, unsigned int Length)
 	}
 }
 
+// --------------------------------------------------------------------------
+//
+// Function
+//		Name:    RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+//		Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block,
+//				 and a pointer just after the last byte of the current block and the length of the block and of the skip.
+//		Created: 7/14/05
+//
+// --------------------------------------------------------------------------
+void RollingChecksum::RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip)
+{
+	// IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
+	unsigned int i;
+	uint16_t sumBegin=0, j,k;
 
+	for(i=0; i < Skip; i++)
+	{
+		j = StartOfThisBlock[i];
+		k = LastOfNextBlock[i];
+		sumBegin += j;
+		a += (k - j);
+		b += a;
+	}
 
+	b -= Length * sumBegin;
+}
diff --git a/lib/crypto/RollingChecksum.h b/lib/crypto/RollingChecksum.h
index 99d116b9..be79c36f 100755
--- a/lib/crypto/RollingChecksum.h
+++ b/lib/crypto/RollingChecksum.h
@@ -24,7 +24,7 @@
 class RollingChecksum
 {
 public:
-	RollingChecksum(const void *data, unsigned int Length);
+	RollingChecksum(const void * const data, const unsigned int Length);
 
 	// --------------------------------------------------------------------------
 	//
@@ -35,7 +35,7 @@ public:
 	//		Created: 6/12/03
 	//
 	// --------------------------------------------------------------------------
-	inline void RollForward(uint8_t StartOfThisBlock, uint8_t LastOfNextBlock, unsigned int Length)
+	inline void RollForward(const uint8_t StartOfThisBlock, const uint8_t LastOfNextBlock, const unsigned int Length)
 	{
 		// IMPLEMENTATION NOTE: Everything is implicitly mod 2^16 -- uint16_t's will overflow nicely.
 		a -= StartOfThisBlock;
@@ -44,6 +44,17 @@ public:
 		b += a;
 	}
 
+	// --------------------------------------------------------------------------
+	//
+	// Function
+	//		Name:    RollingChecksum::RollForwardSeveral(uint8_t*, uint8_t*, unsigned int, unsigned int)
+	//		Purpose: Move the checksum forward a block, given a pointer to the first byte of the current block,
+	//				 and a pointer just after the last byte of the current block and the length of the block and of the skip.
+	//		Created: 7/14/05
+	//
+	// --------------------------------------------------------------------------
+	void RollForwardSeveral(const uint8_t * const StartOfThisBlock, const uint8_t * const LastOfNextBlock, const unsigned int Length, const unsigned int Skip);
+
 	// --------------------------------------------------------------------------
 	//
 	// Function
@@ -52,14 +63,14 @@ public:
 	//		Created: 6/12/03
 	//
 	// --------------------------------------------------------------------------	
-	inline uint32_t GetChecksum()
+	inline uint32_t GetChecksum() const
 	{
 		return ((uint32_t)a) | (((uint32_t)b) << 16);
 	}
 	
 	// Components, just in case they're handy
-	inline uint16_t GetComponent1() {return a;}
-	inline uint16_t GetComponent2() {return b;}
+	inline uint16_t GetComponent1() const {return a;}
+	inline uint16_t GetComponent2() const {return b;}
 	
 	// --------------------------------------------------------------------------
 	//
@@ -69,7 +80,7 @@ public:
 	//		Created: 6/12/03
 	//
 	// --------------------------------------------------------------------------
-	inline uint16_t GetComponentForHashing()
+	inline uint16_t GetComponentForHashing() const
 	{
 		return b;
 	}
@@ -82,7 +93,7 @@ public:
 	//		Created: 14/1/04
 	//
 	// --------------------------------------------------------------------------
-	static inline uint16_t ExtractHashingComponent(uint32_t Checksum)
+	static inline uint16_t ExtractHashingComponent(const uint32_t Checksum)
 	{
 		return Checksum >> 16;
 	}
-- 
cgit v1.2.3