Fixup botched merge

author: Reinhard Tartler <siretart@tauware.de> 2017-06-14 19:53:34 -0400
committer: Reinhard Tartler <siretart@tauware.de> 2017-06-14 19:55:14 -0400
commit: e0c122119afea4c951c0c57144d26a473118c254 (patch)
tree: 34a02a56f9b017201dfb721ef678c711351466d6 /lib/backupclient/BackupStoreFileDiff.cpp
parent: e0eb815b67734abd09ff41e2271630d4b2a6d760 (diff)
1 files changed, 0 insertions, 1046 deletions
diff --git a/lib/backupclient/BackupStoreFileDiff.cpp b/lib/backupclient/BackupStoreFileDiff.cpp
deleted file mode 100644
index 5705c3aa..00000000
--- a/lib/backupclient/BackupStoreFileDiff.cpp
+++ /dev/null
@@ -1,1046 +0,0 @@
-// --------------------------------------------------------------------------
-//
-// File
-//		Name:    BackupStoreFileDiff.cpp
-//		Purpose: Functions relating to diffing BackupStoreFiles
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-
-#include "Box.h"
-
-#include <string.h>
-
-#include <new>
-#include <map>
-
-#ifdef HAVE_TIME_H
-	#include <time.h>
-#elif HAVE_SYS_TIME_H
-	#include <sys/time.h>
-#endif
-
-#include "BackupStoreConstants.h"
-#include "BackupStoreException.h"
-#include "BackupStoreFile.h"
-#include "BackupStoreFileCryptVar.h"
-#include "BackupStoreFileEncodeStream.h"
-#include "BackupStoreFileWire.h"
-#include "BackupStoreObjectMagic.h"
-#include "CommonException.h"
-#include "FileStream.h"
-#include "MD5Digest.h"
-#include "RollingChecksum.h"
-#include "Timer.h"
-
-#include "MemLeakFindOn.h"
-
-#include <cstring>
-
-using namespace BackupStoreFileCryptVar;
-using namespace BackupStoreFileCreation;
-
-// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
-// But it's useful for the test.
-#ifndef BOX_RELEASE_BUILD
-	bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
-#endif
-
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
-static void SearchForMatchingBlocks(IOStream &rFile, 
-	std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, 
-	int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
-	DiffTimer *pDiffTimer);
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
-BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &)
-//		Purpose: Move the file pointer in this stream to just before the block index.
-//				 Assumes that the stream is at the beginning, seekable, and
-//				 reading from the stream is OK.
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream)
-{
-	// Size of file
-	int64_t fileSize = rStream.BytesLeftToRead();
-
-	// Get header
-	file_StreamFormat hdr;
-
-	// Read the header
-	if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite))
-	{
-		// Couldn't read header
-		THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
-	}
-
-	// Check magic number
-	if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1
-#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
-		&& ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0
-#endif
-		)
-	{
-		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
-	}
-	
-	// Work out where the index is
-	int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
-	int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader));
-	
-	// Sanity check
-	if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat)))
-	{
-		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
-	}
-	
-	// Seek to that position
-	rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End);
-	
-	// Done. Stream now in right position (as long as the file is formatted correctly)
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *)
-//		Purpose: Similar to EncodeFile, but takes the object ID of the file it's
-//				 diffing from, and the index of the blocks in a stream. It'll then
-//				 calculate which blocks can be reused from that old file.
-//				 The timeout is the timeout value for reading the diff block index.
-//				 If pIsCompletelyDifferent != 0, it will be set to true if the
-//				 the two files are completely different (do not share any block), false otherwise.
-//				 
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff
-(
-	const char *Filename, int64_t ContainerID,
-	const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, 
-	IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, 
-	int64_t *pModificationTime, bool *pIsCompletelyDifferent)
-{
-	// Is it a symlink?
-	{
-		EMU_STRUCT_STAT st;
-		if(EMU_LSTAT(Filename, &st) != 0)
-		{
-			THROW_EXCEPTION(CommonException, OSFileError)
-		}
-		if((st.st_mode & S_IFLNK) == S_IFLNK)
-		{
-			// Don't do diffs for symlinks
-			if(pIsCompletelyDifferent != 0)
-			{
-				*pIsCompletelyDifferent = true;
-			}
-			return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
-		}
-	}
-
-	// Load in the blocks
-	BlocksAvailableEntry *pindex = 0;
-	int64_t blocksInIndex = 0;
-	bool canDiffFromThis = false;
-	LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis);
-	// BOX_TRACE("Diff: Blocks in index: " << blocksInIndex);
-	
-	if(!canDiffFromThis)
-	{
-		// Don't do diffing...
-		if(pIsCompletelyDifferent != 0)
-		{
-			*pIsCompletelyDifferent = true;
-		}
-		return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
-	}
-	
-	// Pointer to recipe we're going to create
-	BackupStoreFileEncodeStream::Recipe *precipe = 0;
-	
-	try
-	{
-		// Find which sizes should be scanned
-		int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
-		FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
-		
-		// Flag for reporting to the user
-		bool completelyDifferent;
-			
-		// BLOCK
-		{
-			// Search the file to find matching blocks
-			std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index
-			int64_t sizeOfInputFile = 0;
-			// BLOCK
-			{
-				FileStream file(Filename);
-				// Get size of file
-				sizeOfInputFile = file.BytesLeftToRead();
-				// Find all those lovely matching blocks
-				SearchForMatchingBlocks(file, foundBlocks, pindex, 
-					blocksInIndex, sizesToScan, pDiffTimer);
-				
-				// Is it completely different?
-				completelyDifferent = (foundBlocks.size() == 0);
-			}
-			
-			// Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe.
-			precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID));
-			BlocksAvailableEntry *pindexKeptRef = pindex;	// we need this later, but must set pindex == 0 now, because of exceptions
-			pindex = 0;		// Recipe now has ownership
-			
-			// Fill it in
-			GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile);
-		}
-		// foundBlocks no longer required
-		
-		// Create the stream
-		std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream);
-	
-		// Do the initial setup
-		((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime);
-		precipe = 0;	// Stream has taken ownership of this
-		
-		// Tell user about completely different status?
-		if(pIsCompletelyDifferent != 0)
-		{
-			*pIsCompletelyDifferent = completelyDifferent;
-		}
-	
-		// Return the stream for the caller
-		return stream;
-	}
-	catch(...)
-	{
-		// cleanup
-		if(pindex != 0)
-		{
-			::free(pindex);
-			pindex = 0;
-		}
-		if(precipe != 0)
-		{
-			delete precipe;
-			precipe = 0;
-		}
-		throw;
-	}
-}
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &)
-//		Purpose: Read in an index, and decrypt, and store in the in memory block format.
-//				 rCanDiffFromThis is set to false if the version of the from file is too old.
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis)
-{
-	// Reset
-	rNumBlocksOut = 0;
-	rCanDiffFromThis = false;
-	
-	// Read header
-	file_BlockIndexHeader hdr;
-	if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout))
-	{
-		// Couldn't read header
-		THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
-	}
-
-#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
-	// Check against backwards comptaibility stuff
-	if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0))
-	{
-		// Won't diff against old version
-		
-		// Absorb rest of stream
-		char buffer[2048];
-		while(rBlockIndex.StreamDataLeft())
-		{
-			rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */);
-		}
-		
-		// Tell caller
-		rCanDiffFromThis = false;
-		return;
-	}
-#endif
-
-	// Check magic
-	if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1))
-	{
-		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
-	}
-	
-	// Check that we're not trying to diff against a file which references blocks from another file
-	if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0)
-	{
-		THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
-	}
-
-	// Mark as an acceptable diff.
-	rCanDiffFromThis = true;
-
-	// Get basic information
-	int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
-	uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase);
-	
-	//TODO: Verify that these sizes look reasonable
-	
-	// Allocate space for the index
-	BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks);
-	if(pindex == 0)
-	{
-		throw std::bad_alloc();
-	}
-	
-	try
-	{	
-		for(int64_t b = 0; b < numBlocks; ++b)
-		{
-			// Read an entry from the stream
-			file_BlockIndexEntry entry;
-			if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout))
-			{
-				// Couldn't read entry
-				THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
-			}	
-		
-			// Calculate IV for this entry
-			uint64_t iv = entryIVBase;
-			iv += b;
-			// Network byte order
-			iv = box_hton64(iv);
-			sBlowfishDecryptBlockEntry.SetIV(&iv);			
-			
-			// Decrypt the encrypted section
-			file_BlockIndexEntryEnc entryEnc;
-			int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc),
-					entry.mEnEnc, sizeof(entry.mEnEnc));
-			if(sectionSize != sizeof(entryEnc))
-			{
-				THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength)
-			}
-
-			// Check that we're not trying to diff against a file which references blocks from another file
-			if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0)
-			{
-				THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
-			}
-			
-			// Store all the required information
-			pindex[b].mpNextInHashList = 0;	// hash list not set up yet
-			pindex[b].mSize = ntohl(entryEnc.mSize);
-			pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum);
-			::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum));
-		}
-		
-		// Store index pointer for called
-		ASSERT(ppIndex != 0);
-		*ppIndex = pindex;
-
-		// Store number of blocks for caller
-		rNumBlocksOut = numBlocks;	
-
-	}
-	catch(...)
-	{
-		// clean up and send the exception along its way
-		::free(pindex);
-		throw;
-	}
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-//		Purpose: Finds the most commonly used block sizes in the index
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-{
-	// Array for lengths
-	int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
-
-	// Set arrays to lots of zeros (= unused entries)
-	for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
-	{
-		Sizes[l] = 0;
-		sizeCounts[l] = 0;
-	}
-
-	// Array for collecting sizes
-	std::map<int32_t, int64_t> foundSizes;
-	
-	// Run through blocks and make a count of the entries
-	for(int64_t b = 0; b < NumBlocks; ++b)
-	{
-		// Only if the block size is bigger than the minimum size we'll scan for
-		if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
-		{
-			// Find entry?
-			std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
-			if(f != foundSizes.end())
-			{
-				// Increment existing entry
-				foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
-			}
-			else
-			{
-				// New entry
-				foundSizes[pIndex[b].mSize] = 1;
-			}
-		}
-	}
-	
-	// Make the block sizes
-	for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
-	{
-		// Find the position of the size in the array
-		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
-		{
-			// Instead of sorting on the raw count of blocks,
-			// take the file area covered by this block size.
-			if(i->second * i->first > sizeCounts[t] * Sizes[t])
-			{
-				// Then this size belong before this entry -- shuffle them up
-				for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
-				{
-					Sizes[s] = Sizes[s-1];
-					sizeCounts[s] = sizeCounts[s-1];
-				}
-				
-				// Insert this size
-				Sizes[t] = i->first;
-				sizeCounts[t] = i->second;
-				
-				// Shouldn't do any more searching
-				break;
-			}
-		}
-	}
-	
-	// trace the size table in debug builds
-#ifndef BOX_RELEASE_BUILD
-	if(BackupStoreFile::TraceDetailsOfDiffProcess)
-	{
-		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
-		{
-			BOX_TRACE("Diff block size " << t << ": " <<
-				Sizes[t] << " (count = " << 
-				sizeCounts[t] << ")");
-		}
-	}
-#endif
-}
-
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
-//		Purpose: Find the matching blocks within the file.
-//		Created: 12/1/04
-//
-// --------------------------------------------------------------------------
-static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
-	BlocksAvailableEntry *pIndex, int64_t NumBlocks, 
-	int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
-{
-	Timer maximumDiffingTime(0, "MaximumDiffingTime");
-
-	if(pDiffTimer && pDiffTimer->IsManaged())
-	{
-		maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(),
-			"MaximumDiffingTime");
-	}
-	
-	std::map<int64_t, int32_t> goodnessOfFit;
-
-	// Allocate the hash lookup table
-	BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
-
-	// Choose a size for the buffer, just a little bit more than the maximum block size
-	int32_t bufSize = Sizes[0];
-	for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
-	{
-		if(Sizes[z] > bufSize) bufSize = Sizes[z];
-	}
-	bufSize += 4;
-	ASSERT(bufSize > Sizes[0]);
-	ASSERT(bufSize > 0);
-	if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
-	{
-		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
-	}
-	
-	// TODO: Because we read in the file a scanned block size at a time,
-	// it is likely to be inefficient. Probably will be much better to
-	// calculate checksums for all block sizes in a single pass.
-
-	// Allocate the buffers.
-	uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
-	uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
-	try
-	{
-		// Check buffer allocation
-		if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
-		{
-			// If a buffer got allocated, it will be cleaned up in the catch block
-			throw std::bad_alloc();
-		}
-		
-		// Flag to abort the run, if too many blocks are found -- avoid using
-		// huge amounts of processor time when files contain many similar blocks.
-		bool abortSearch = false;
-		
-		// Search for each block size in turn
-		// NOTE: Do the smallest size first, so that the scheme for adding
-		// entries in the found list works as expected and replaces smallers block
-		// with larger blocks when it finds matches at the same offset in the file.
-		for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
-		{
-			ASSERT(Sizes[s] <= bufSize);
-			BOX_TRACE("Diff pass " << s << ", for block size " <<
-				Sizes[s]);
-			
-			// Check we haven't finished
-			if(Sizes[s] == 0)
-			{
-				// empty entry, try next size
-				continue;
-			}
-			
-			// Set up the hash table entries
-			SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
-		
-			// Shift file position to beginning
-			rFile.Seek(0, IOStream::SeekType_Absolute);
-			
-			// Read first block
-			if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
-			{
-				// Size of file too short to match -- do next size
-				continue;
-			}
-			
-			// Setup block pointers
-			uint8_t *beginnings = pbuffer0;
-			uint8_t *endings = pbuffer1;
-			int offset = 0;
-			
-			// Calculate the first checksum, ready for rolling
-			RollingChecksum rolling(beginnings, Sizes[s]);
-			
-			// Then roll, until the file is exhausted
-			int64_t fileBlockNumber = 0;
-			int64_t fileOffset = 0;
-			int rollOverInitialBytes = 0;
-			while(true)
-			{
-				if(maximumDiffingTime.HasExpired())
-				{
-					ASSERT(pDiffTimer != NULL);
-					BOX_INFO("MaximumDiffingTime reached - "
-						"suspending file diff");
-					abortSearch = true;
-					break;
-				}
-				
-				if(pDiffTimer)
-				{
-					pDiffTimer->DoKeepAlive();
-				}
-				
-				// Load in another block of data, and record how big it is
-				int bytesInEndings = rFile.Read(endings, Sizes[s]);
-				int tmp;
-
-				// Skip any bytes from a previous matched block
-				if(rollOverInitialBytes > 0 && offset < bytesInEndings)
-				{
-					int spaceLeft = bytesInEndings - offset;
-					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
-					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
-					offset += thisRoll;
-					fileOffset += thisRoll;
-					rollOverInitialBytes -= thisRoll;
-
-					if(rollOverInitialBytes)
-					{
-						goto refresh;
-					}
-				}
-
-				if(goodnessOfFit.count(fileOffset))
-				{
-					tmp = goodnessOfFit[fileOffset];
-				}
-				else
-				{
-					tmp = 0;
-				}
-
-				if(tmp >= Sizes[s])
-				{
-					// Skip over bigger ready-matched blocks completely
-					rollOverInitialBytes = tmp;
-					int spaceLeft = bytesInEndings - offset;
-					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
-
-					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
-					offset += thisRoll;
-					fileOffset += thisRoll;
-					rollOverInitialBytes -= thisRoll;
-
-					if(rollOverInitialBytes)
-					{
-						goto refresh;
-					}
-				}
-
-				while(offset < bytesInEndings)
-				{
-					// Is current checksum in hash list?
-					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
-					{
-						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
-						{
-							BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
-							goodnessOfFit[fileOffset] = Sizes[s];
-
-							// Block matched, roll the checksum forward to the next block without doing
-							// any more comparisons, because these are pointless (as any more matches will be ignored when
-							// the recipe is generated) and just take up valuable processor time. Edge cases are
-							// especially nasty, using huge amounts of time and memory.
-							int skip = Sizes[s];
-							if(offset < bytesInEndings && skip > 0)
-							{
-								int spaceLeft = bytesInEndings - offset;
-								int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
-
-								rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
-
-								offset += thisRoll;
-								fileOffset += thisRoll;
-								skip -= thisRoll;
-							}
-							// Not all the bytes necessary will have been skipped, so get them
-							// skipped after the next block is loaded.
-							rollOverInitialBytes = skip;
-							
-							// End this loop, so the final byte isn't used again
-							break;
-						}
-						else
-						{
-							BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset);
-						}
-
-						int64_t NumBlocksFound = static_cast<int64_t>(
-							rFoundBlocks.size());
-						int64_t MaxBlocksFound = NumBlocks * 
-							BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
-						
-						if(NumBlocksFound > MaxBlocksFound)
-						{
-							abortSearch = true;
-							break;
-						}
-					}
-					
-					// Roll checksum forward
-					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
-				
-					// Increment offsets
-					++offset;
-					++fileOffset;
-				}
-				
-				if(abortSearch) break;
-				
-			refresh:
-				// Finished?
-				if(bytesInEndings != Sizes[s])
-				{
-					// No more data in file -- check the final block
-					// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
-					// each byte of the file)
-					uint16_t hash = rolling.GetComponentForHashing();
-					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
-					{
-						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
-						{
-							goodnessOfFit[fileOffset] = Sizes[s];
-						}
-					}
-
-					// finish
-					break;
-				}
-				
-				// Switch buffers, reset offset
-				beginnings = endings;
-				endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0);	// ie the other buffer
-				offset = 0;
-
-				// And count the blocks which have been done
-				++fileBlockNumber;
-			}
-
-			if(abortSearch) break;
-		}
-		
-		// Free buffers and hash table
-		::free(pbuffer1);
-		pbuffer1 = 0;
-		::free(pbuffer0);
-		pbuffer0 = 0;
-		::free(phashTable);
-		phashTable = 0;
-	}
-	catch(...)
-	{
-		// Cleanup and throw
-		if(pbuffer1 != 0) ::free(pbuffer1);
-		if(pbuffer0 != 0) ::free(pbuffer0);
-		if(phashTable != 0) ::free(phashTable);
-		throw;
-	}
-	
-#ifndef BOX_RELEASE_BUILD
-	if(BackupStoreFile::TraceDetailsOfDiffProcess)
-	{
-		// Trace out the found blocks in debug mode
-		BOX_TRACE("Diff: list of found blocks");
-		BOX_TRACE("======== ======== ======== ========");
-		BOX_TRACE("  Offset   BlkIdx     Size Movement");
-		for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
-		{
-			int64_t orgLoc = 0;
-			for(int64_t b = 0; b < i->second; ++b)
-			{
-				orgLoc += pIndex[b].mSize;
-			}
-			BOX_TRACE(std::setw(8) << i->first << " " <<
-				std::setw(8) << i->second << " " <<
-				std::setw(8) << pIndex[i->second].mSize << 
-				" " << 
-				std::setw(8) << (i->first - orgLoc));
-		}
-		BOX_TRACE("======== ======== ======== ========");
-	}
-#endif
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **)
-//		Purpose: Set up the hash table ready for a scan
-//		Created: 14/1/04
-//
-// --------------------------------------------------------------------------
-static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable)
-{
-	// Set all entries in the hash table to zero
-	::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024)));
-
-	// Scan through the blocks, building the hash table
-	for(int64_t b = 0; b < NumBlocks; ++b)
-	{
-		// Only look at the required block size
-		if(pIndex[b].mSize == BlockSize)
-		{
-			// Get the value under which to hash this entry
-			uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum);
-			
-			// Already present in table?
-			if(pHashTable[hash] != 0)
-			{
-				//BOX_TRACE("Another hash entry for " << hash << " found");
-				// Yes -- need to set the pointer in this entry to the current entry to build the linked list
-				pIndex[b].mpNextInHashList = pHashTable[hash];
-			}
-
-			// Put a pointer to this entry in the hash table
-			pHashTable[hash] = pIndex + b;
-		}
-	}
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static bool SecondStageMatch(xxx)
-//		Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
-//		Created: 14/1/04
-//
-// --------------------------------------------------------------------------
-static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
-	int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
-{
-	// Check parameters
-	ASSERT(pBeginnings != 0);
-	ASSERT(pEndings != 0);
-	ASSERT(Offset >= 0);
-	ASSERT(BlockSize > 0);
-	ASSERT(pFirstInHashList != 0);
-	ASSERT(pIndex != 0);
-
-#ifndef BOX_RELEASE_BUILD
-	uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
-#endif
-	uint32_t Checksum = fastSum.GetChecksum();
-
-	// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
-	BlocksAvailableEntry *scan = pFirstInHashList;
-	bool found=false;
-	while(scan != 0)
-	{
-		if(scan->mWeakChecksum == Checksum)
-		{
-			found = true;
-			break;
-		}
-		scan = scan->mpNextInHashList;
-	}
-	if(!found)
-	{
-		return false;
-	}
-
-	// Calculate the strong MD5 digest for this block
-	MD5Digest strong;
-	// Add the data from the beginnings
-	strong.Add(pBeginnings + Offset, BlockSize - Offset);
-	// Add any data from the endings
-	if(Offset > 0)
-	{
-		strong.Add(pEndings, Offset);
-	}
-	strong.Finish();
-	
-	// Then go through the entries in the hash list, comparing with the strong digest calculated
-	scan = pFirstInHashList;
-	//BOX_TRACE("second stage match");
-	while(scan != 0)
-	{
-		//BOX_TRACE("scan size " << scan->mSize <<
-		//	", block size " << BlockSize <<
-		//	", hash " << Hash);
-		ASSERT(scan->mSize == BlockSize);
-		ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
-	
-		// Compare?
-		if(strong.DigestMatches(scan->mStrongChecksum))
-		{
-			//BOX_TRACE("Match!\n");
-			// Found! Add to list of found blocks...
-			int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
-			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
-			
-			// We do NOT search for smallest blocks first, as this code originally assumed.
-			// To prevent this from potentially overwriting a better match, the caller must determine
-			// the relative "goodness" of any existing match and this one, and avoid the call if it
-			// could be detrimental.
-			rFoundBlocks[fileOffset] = blockIndex;
-			
-			// No point in searching further, report success
-			return true;
-		}
-	
-		// Next
-		scan = scan->mpNextInHashList;
-	}
-	
-	// Not matched
-	return false;
-}
-
-
-// --------------------------------------------------------------------------
-//
-// Function
-//		Name:    static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &)
-//		Purpose: Fills in the recipe from the found block list
-//		Created: 15/1/04
-//
-// --------------------------------------------------------------------------
-static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex,
-		int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile)
-{
-	// NOTE: This function could be a lot more sophisiticated. For example, if
-	// a small block overlaps a big block like this
-	//   ****
-	//      *******************************
-	// then the small block will be used, not the big one. But it'd be better to
-	// just ignore the small block and keep the big one. However, some stats should
-	// be gathered about real world files before writing complex code which might
-	// go wrong.
-
-	// Initialise a blank instruction
-	BackupStoreFileEncodeStream::RecipeInstruction instruction;
-	#define RESET_INSTRUCTION			\
-	instruction.mSpaceBefore = 0;		\
-	instruction.mBlocks = 0;			\
-	instruction.mpStartBlock = 0;
-	RESET_INSTRUCTION
-
-	// First, a special case for when there are no found blocks
-	if(rFoundBlocks.size() == 0)
-	{
-		// No blocks, just a load of space
-		instruction.mSpaceBefore = SizeOfInputFile;
-		rRecipe.push_back(instruction);	
-	
-		#ifndef BOX_RELEASE_BUILD
-		if(BackupStoreFile::TraceDetailsOfDiffProcess)
-		{
-			BOX_TRACE("Diff: Default recipe generated, " << 
-				SizeOfInputFile << " bytes of file");
-		}
-		#endif
-		
-		// Don't do anything
-		return;
-	}
-
-	// Current location
-	int64_t loc = 0;
-
-	// Then iterate through the list, generating the recipe
-	std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin());
-	ASSERT(i != rFoundBlocks.end());	// check logic
-
-	// Counting for debug tracing
-#ifndef BOX_RELEASE_BUILD
-	int64_t debug_NewBytesFound = 0;
-	int64_t debug_OldBlocksUsed = 0;
-#endif
-	
-	for(; i != rFoundBlocks.end(); ++i)
-	{
-		// Remember... map is (position in file) -> (index of block in pIndex)
-		
-		if(i->first < loc)
-		{
-			// This block overlaps the last one
-			continue;
-		}
-		else if(i->first > loc)
-		{
-			// There's a gap between the end of the last thing and this block.
-			// If there's an instruction waiting, push it onto the list
-			if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0)
-			{
-				rRecipe.push_back(instruction);
-			}
-			// Start a new instruction, with the gap ready
-			RESET_INSTRUCTION
-			instruction.mSpaceBefore = i->first - loc;
-			// Move location forward to match
-			loc += instruction.mSpaceBefore;
-#ifndef BOX_RELEASE_BUILD
-			debug_NewBytesFound += instruction.mSpaceBefore;
-#endif
-		}
-		
-		// First, does the current instruction need pushing back, because this block is not
-		// sequential to the last one?
-		if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks))
-		{
-			rRecipe.push_back(instruction);
-			RESET_INSTRUCTION
-		}
-		
-		// Add in this block
-		if(instruction.mpStartBlock == 0)
-		{
-			// This block starts a new instruction
-			instruction.mpStartBlock = pIndex + i->second;
-			instruction.mBlocks = 1;
-		}
-		else
-		{
-			// It continues the previous section of blocks
-			instruction.mBlocks += 1;
-		}
-
-#ifndef BOX_RELEASE_BUILD
-		debug_OldBlocksUsed++;
-#endif
-
-		// Move location forward
-		loc += pIndex[i->second].mSize;
-	}
-	
-	// Push the last instruction generated
-	rRecipe.push_back(instruction);
-	
-	// Is there any space left at the end which needs sending?
-	if(loc != SizeOfInputFile)
-	{
-		RESET_INSTRUCTION
-		instruction.mSpaceBefore = SizeOfInputFile - loc;
-#ifndef BOX_RELEASE_BUILD
-		debug_NewBytesFound += instruction.mSpaceBefore;
-#endif
-		rRecipe.push_back(instruction);		
-	}
-	
-	// dump out the recipe
-#ifndef BOX_RELEASE_BUILD
-	BOX_TRACE("Diff: " << 
-		debug_NewBytesFound << " new bytes found, " <<
-		debug_OldBlocksUsed << " old blocks used");
-	if(BackupStoreFile::TraceDetailsOfDiffProcess)
-	{
-		BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size());
-		BOX_TRACE("======== ========= ========");
-		BOX_TRACE("Space b4 FirstBlk  NumBlks");
-		{
-			for(unsigned int e = 0; e < rRecipe.size(); ++e)
-			{
-				char b[64];
-#ifdef WIN32
-				sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
-#else
-				sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
-#endif
-				BOX_TRACE(std::setw(8) <<
-					rRecipe[e].mSpaceBefore <<
-					" " <<
-					((rRecipe[e].mpStartBlock == 0)?"       -":b) <<
-					" " << std::setw(8) <<
-					rRecipe[e].mBlocks);
-			}
-		}
-		BOX_TRACE("======== ========= ========");
-	}
-#endif
-}
author	Reinhard Tartler <siretart@tauware.de>	2017-06-14 19:53:34 -0400
committer	Reinhard Tartler <siretart@tauware.de>	2017-06-14 19:55:14 -0400
commit	e0c122119afea4c951c0c57144d26a473118c254 (patch)
tree	34a02a56f9b017201dfb721ef678c711351466d6 /lib/backupclient/BackupStoreFileDiff.cpp
parent	e0eb815b67734abd09ff41e2271630d4b2a6d760 (diff)