Merged chromi/diffopt at r116 to trunk

author: Martin Ebourne <martin@ebourne.me.uk> 2005-12-07 16:24:49 +0000
committer: Martin Ebourne <martin@ebourne.me.uk> 2005-12-07 16:24:49 +0000
commit: 065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd (patch)
tree: ec09f9aff78d337b64ef4af7c7cba2d23a4ef0e9 /lib/backupclient/BackupStoreFileDiff.cpp
parent: 9950474440f85c5a35be8cfe7a85cee884209f20 (diff)
1 files changed, 103 insertions, 25 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
index b72ce328..22395151 100755
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ b/lib/backupclient/BackupStoreFileDiff.cpp
@@ -41,11 +41,12 @@ static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntr
 static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
 static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
+BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
 static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
 
 // Avoid running on too long with diffs
-static int sMaximumDiffTime = 10;		// maximum time to spend diffing
+static int sMaximumDiffTime = 600;		// maximum time to spend diffing
 static bool sDiffTimedOut = false;
 static bool sSetTimerSignelHandler = false;
 static void TimerSignalHandler(int signal);
@@ -426,7 +427,9 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
 		// Find the position of the size in the array
 		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
 		{
-			if(i->second > sizeCounts[t])
+			// Instead of sorting on the raw count of blocks,
+			// take the file area covered by this block size.
+			if(i->second * i->first > sizeCounts[t] * Sizes[t])
 			{
 				// Then this size belong before this entry -- shuffle them up
 				for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
@@ -470,6 +473,8 @@ static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, i
 static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
 	BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
 {
+	std::map<int64_t, int32_t> goodnessOfFit;
+
 	// Allocate the hash lookup table
 	BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
 
@@ -500,7 +505,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 		// Check buffer allocation
 		if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
 		{
-			// If a buffer got alloocated, it will be cleaned up in the catch block
+			// If a buffer got allocated, it will be cleaned up in the catch block
 			throw std::bad_alloc();
 		}
 		
@@ -547,38 +552,85 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 			
 			// Then roll, until the file is exhausted
 			int64_t fileBlockNumber = 0;
+			int64_t fileOffset = 0;
 			int rollOverInitialBytes = 0;
 			while(true)
 			{
 				// Load in another block of data, and record how big it is
 				int bytesInEndings = rFile.Read(endings, Sizes[s]);
+				int tmp;
 
 				// Skip any bytes from a previous matched block
-				while(rollOverInitialBytes > 0 && offset < bytesInEndings)
+				if(rollOverInitialBytes > 0 && offset < bytesInEndings)
 				{
-					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-					++offset;
-					--rollOverInitialBytes;
+					int spaceLeft = bytesInEndings - offset;
+					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+					offset += thisRoll;
+					fileOffset += thisRoll;
+					rollOverInitialBytes -= thisRoll;
+
+					if(rollOverInitialBytes)
+					{
+						goto refresh;
+					}
+				}
+
+				if(goodnessOfFit.count(fileOffset))
+				{
+					tmp = goodnessOfFit[fileOffset];
+				}
+				else
+				{
+					tmp = 0;
+				}
+
+				if(tmp >= Sizes[s])
+				{
+					// Skip over bigger ready-matched blocks completely
+					rollOverInitialBytes = tmp;
+					int spaceLeft = bytesInEndings - offset;
+					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
+
+					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+					offset += thisRoll;
+					fileOffset += thisRoll;
+					rollOverInitialBytes -= thisRoll;
+
+					if(rollOverInitialBytes)
+					{
+						goto refresh;
+					}
 				}
 
 				while(offset < bytesInEndings)
 				{
-					// Put is current checksum in hash list?
+					// Is current checksum in hash list?
 					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0)
+					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
 					{
-						if(SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
 						{
+							goodnessOfFit[fileOffset] = Sizes[s];
+
 							// Block matched, roll the checksum forward to the next block without doing
 							// any more comparisons, because these are pointless (as any more matches will be ignored when
 							// the receipe is generated) and just take up valuable processor time. Edge cases are
 							// especially nasty, using huge amounts of time and memory.
 							int skip = Sizes[s];
-							while(offset < bytesInEndings && skip > 0)
+							if(offset < bytesInEndings && skip > 0)
 							{
-								rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-								++offset;
-								--skip;
+								int spaceLeft = bytesInEndings - offset;
+								int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
+
+								rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
+
+								offset += thisRoll;
+								fileOffset += thisRoll;
+								skip -= thisRoll;
 							}
 							// Not all the bytes necessary will have been skipped, so get them
 							// skipped after the next block is loaded.
@@ -588,8 +640,7 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 							break;
 						}
 						
-						if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE)
-							|| sDiffTimedOut)
+						if(static_cast<int64_t>(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut)
 						{
 							abortSearch = true;
 							break;
@@ -599,12 +650,14 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 					// Roll checksum forward
 					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
 				
-					// Increment offset
+					// Increment offsets
 					++offset;
+					++fileOffset;
 				}
 				
 				if(abortSearch) break;
 				
+			refresh:
 				// Finished?
 				if(bytesInEndings != Sizes[s])
 				{
@@ -612,9 +665,12 @@ static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t>
 					// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
 					// each byte of the file)
 					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0)
+					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
 					{
-						SecondStageMatch(phashTable[hash], hash, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks);
+						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
+						{
+							goodnessOfFit[fileOffset] = Sizes[s];
+						}
 					}
 
 					// finish
@@ -716,7 +772,7 @@ static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int3
 //		Created: 14/1/04
 //
 // --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Hash, uint8_t *pBeginnings, uint8_t *pEndings,
+static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
 	int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
 {
 	// Check parameters
@@ -727,6 +783,26 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 	ASSERT(pFirstInHashList != 0);
 	ASSERT(pIndex != 0);
 
+	uint16_t Hash = fastSum.GetComponentForHashing();
+	uint32_t Checksum = fastSum.GetChecksum();
+
+	// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
+	BlocksAvailableEntry *scan = pFirstInHashList;
+	bool found=false;
+	while(scan != 0)
+	{
+		if(scan->mWeakChecksum == Checksum)
+		{
+			found = true;
+			break;
+		}
+		scan = scan->mpNextInHashList;
+	}
+	if(!found)
+	{
+		return false;
+	}
+
 	// Calculate the strong MD5 digest for this block
 	MD5Digest strong;
 	// Add the data from the beginnings
@@ -739,7 +815,7 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 	strong.Finish();
 	
 	// Then go through the entries in the hash list, comparing with the strong digest calculated
-	BlocksAvailableEntry *scan = pFirstInHashList;
+	scan = pFirstInHashList;
 	//TRACE0("second stage match\n");
 	while(scan != 0)
 	{
@@ -753,10 +829,12 @@ static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, uint16_t Ha
 			//TRACE0("Match!\n");
 			// Found! Add to list of found blocks...
 			int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
-			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowed apon. But most efficient way of doing it here -- alternative is to use more memory
+			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
 			
-			// Because we search for smallest blocks first, if there's an entry there
-			// we'll just be replacing it with a larger one, which is great news.
+			// We do NOT search for smallest blocks first, as this code originally assumed.
+			// To prevent this from potentially overwriting a better match, the caller must determine
+			// the relative "goodness" of any existing match and this one, and avoid the call if it
+			// could be detrimental.
 			rFoundBlocks[fileOffset] = blockIndex;
 			
 			// No point in searching further, report success
author	Martin Ebourne <martin@ebourne.me.uk>	2005-12-07 16:24:49 +0000
committer	Martin Ebourne <martin@ebourne.me.uk>	2005-12-07 16:24:49 +0000
commit	065dc6f8cd168e3ee6e71ddfb06f42a92abfabbd (patch)
tree	ec09f9aff78d337b64ef4af7c7cba2d23a4ef0e9 /lib/backupclient/BackupStoreFileDiff.cpp
parent	9950474440f85c5a35be8cfe7a85cee884209f20 (diff)