diff options
author | Chris Wilson <chris+github@qwirx.com> | 2011-05-24 15:08:59 +0000 |
---|---|---|
committer | Chris Wilson <chris+github@qwirx.com> | 2011-05-24 15:08:59 +0000 |
commit | 2bca085d623e857e89878a48efba59f78875370f (patch) | |
tree | 1cc119e7886cf2293c161f700323444934b85904 /lib/backupstore | |
parent | f04cd092a2711160519c957a5ba8c28babd3004c (diff) |
Move remaining parts of BackupStoreFile into lib/backupstore, and fix module
dependencies to fail if anything else required by bbstored is still in
lib/backupclient instead of lib/backupstore.
Diffstat (limited to 'lib/backupstore')
-rw-r--r-- | lib/backupstore/BackupStoreFileCmbDiff.cpp | 326 | ||||
-rw-r--r-- | lib/backupstore/BackupStoreFileCmbIdx.cpp | 324 | ||||
-rw-r--r-- | lib/backupstore/BackupStoreFileCombine.cpp | 410 | ||||
-rw-r--r-- | lib/backupstore/BackupStoreFileDiff.cpp | 1046 |
4 files changed, 2106 insertions, 0 deletions
diff --git a/lib/backupstore/BackupStoreFileCmbDiff.cpp b/lib/backupstore/BackupStoreFileCmbDiff.cpp new file mode 100644 index 00000000..1a88fa3f --- /dev/null +++ b/lib/backupstore/BackupStoreFileCmbDiff.cpp @@ -0,0 +1,326 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileCmbDiff.cpp +// Purpose: Combine two diffs together +// Created: 12/7/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include <new> +#include <stdlib.h> + +#include "BackupStoreFile.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreObjectMagic.h" +#include "BackupStoreException.h" +#include "BackupStoreConstants.h" +#include "BackupStoreFilename.h" + +#include "MemLeakFindOn.h" + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::CombineDiffs(IOStream &, IOStream &, IOStream &rOut) +// Purpose: Given two diffs, combine them into a single diff, to produce a diff +// which, combined with the original file, creates the result of applying +// rDiff, then rDiff2. Two opens of rDiff2 are required +// Created: 12/7/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::CombineDiffs(IOStream &rDiff1, IOStream &rDiff2, IOStream &rDiff2b, IOStream &rOut) +{ + // Skip header of first diff, record where the data starts, and skip to the index + int64_t diff1DataStarts = 0; + { + // Read the header for the From file + file_StreamFormat diff1Hdr; + if(!rDiff1.ReadFullBuffer(&diff1Hdr, sizeof(diff1Hdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(diff1Hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Skip over the filename and attributes of the From file + // BLOCK + { + BackupStoreFilename filename2; + filename2.ReadFromStream(rDiff1, IOStream::TimeOutInfinite); + int32_t size_s; + if(!rDiff1.ReadFullBuffer(&size_s, sizeof(size_s), 0 /* not interested in bytes read if this fails */)) + { + THROW_EXCEPTION(CommonException, StreamableMemBlockIncompleteRead) + } + int size = ntohl(size_s); + // Skip forward the size + rDiff1.Seek(size, IOStream::SeekType_Relative); + } + // Record position + diff1DataStarts = rDiff1.GetPosition(); + // Skip to index + rDiff1.Seek(0 - (((box_ntoh64(diff1Hdr.mNumBlocks)) * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)), IOStream::SeekType_End); + } + + // Read the index of the first diff + // Header first + file_BlockIndexHeader diff1IdxHdr; + if(!rDiff1.ReadFullBuffer(&diff1IdxHdr, sizeof(diff1IdxHdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + if(ntohl(diff1IdxHdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + int64_t diff1NumBlocks = box_ntoh64(diff1IdxHdr.mNumBlocks); + // Allocate some memory + int64_t *diff1BlockStartPositions = (int64_t*)::malloc((diff1NumBlocks + 1) * sizeof(int64_t)); + if(diff1BlockStartPositions == 0) + { + throw std::bad_alloc(); + } + + // Buffer data + void *buffer = 0; + int bufferSize = 0; + + try + { + // Then the entries: + // For each entry, want to know if it's in the file, and if so, how big it is. + // We'll store this as an array of file positions in the file, with an additioal + // entry on the end so that we can work out the length of the last block. + // If an entry isn't in the file, then store 0 - (position in other file). + int64_t diff1Position = diff1DataStarts; + for(int64_t b = 0; b < diff1NumBlocks; ++b) + { + file_BlockIndexEntry e; + if(!rDiff1.ReadFullBuffer(&e, sizeof(e), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Where's the block? + int64_t blockEn = box_ntoh64(e.mEncodedSize); + if(blockEn <= 0) + { + // Just store the negated block number + diff1BlockStartPositions[b] = blockEn; + } + else + { + // Block is present in this file + diff1BlockStartPositions[b] = diff1Position; + diff1Position += blockEn; + } + } + + // Finish off the list, so the last entry can have it's size calcuated. + diff1BlockStartPositions[diff1NumBlocks] = diff1Position; + + // Now read the second diff's header, copying it to the out file + file_StreamFormat diff2Hdr; + if(!rDiff2.ReadFullBuffer(&diff2Hdr, sizeof(diff2Hdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(diff2Hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Copy + rOut.Write(&diff2Hdr, sizeof(diff2Hdr)); + // Copy over filename and attributes + // BLOCK + { + BackupStoreFilename filename; + filename.ReadFromStream(rDiff2, IOStream::TimeOutInfinite); + filename.WriteToStream(rOut); + StreamableMemBlock attr; + attr.ReadFromStream(rDiff2, IOStream::TimeOutInfinite); + attr.WriteToStream(rOut); + } + + // Get to the index of rDiff2b, and read the header + MoveStreamPositionToBlockIndex(rDiff2b); + file_BlockIndexHeader diff2IdxHdr; + if(!rDiff2b.ReadFullBuffer(&diff2IdxHdr, sizeof(diff2IdxHdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + if(ntohl(diff2IdxHdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + int64_t diff2NumBlocks = box_ntoh64(diff2IdxHdr.mNumBlocks); + int64_t diff2IndexEntriesStart = rDiff2b.GetPosition(); + + // Then read all the entries + int64_t diff2FilePosition = rDiff2.GetPosition(); + for(int64_t b = 0; b < diff2NumBlocks; ++b) + { + file_BlockIndexEntry e; + if(!rDiff2b.ReadFullBuffer(&e, sizeof(e), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // What do to next about copying data + bool copyBlock = false; + int copySize = 0; + int64_t copyFrom = 0; + bool fromFileDiff1 = false; + + // Where's the block? + int64_t blockEn = box_ntoh64(e.mEncodedSize); + if(blockEn > 0) + { + // Block is present in this file -- copy to out + copyBlock = true; + copyFrom = diff2FilePosition; + copySize = (int)blockEn; + + // Move pointer onwards + diff2FilePosition += blockEn; + } + else + { + // Block isn't present here -- is it present in the old one? + int64_t blockIndex = 0 - blockEn; + if(blockIndex < 0 || blockIndex > diff1NumBlocks) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + if(diff1BlockStartPositions[blockIndex] > 0) + { + // Block is in the old diff file, copy it across + copyBlock = true; + copyFrom = diff1BlockStartPositions[blockIndex]; + int nb = blockIndex + 1; + while(diff1BlockStartPositions[nb] <= 0) + { + // This is safe, because the last entry will terminate it properly! + ++nb; + ASSERT(nb <= diff1NumBlocks); + } + copySize = diff1BlockStartPositions[nb] - copyFrom; + fromFileDiff1 = true; + } + } + //TRACE4("%d %d %lld %d\n", copyBlock, copySize, copyFrom, fromFileDiff1); + + // Copy data to the output file? + if(copyBlock) + { + // Allocate enough space + if(bufferSize < copySize || buffer == 0) + { + // Free old block + if(buffer != 0) + { + ::free(buffer); + buffer = 0; + bufferSize = 0; + } + // Allocate new block + buffer = ::malloc(copySize); + if(buffer == 0) + { + throw std::bad_alloc(); + } + bufferSize = copySize; + } + ASSERT(bufferSize >= copySize); + + // Load in the data + if(fromFileDiff1) + { + rDiff1.Seek(copyFrom, IOStream::SeekType_Absolute); + if(!rDiff1.ReadFullBuffer(buffer, copySize, 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + } + else + { + rDiff2.Seek(copyFrom, IOStream::SeekType_Absolute); + if(!rDiff2.ReadFullBuffer(buffer, copySize, 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + } + // Write out data + rOut.Write(buffer, copySize); + } + } + + // Write the modified header + diff2IdxHdr.mOtherFileID = diff1IdxHdr.mOtherFileID; + rOut.Write(&diff2IdxHdr, sizeof(diff2IdxHdr)); + + // Then we'll write out the index, reading the data again + rDiff2b.Seek(diff2IndexEntriesStart, IOStream::SeekType_Absolute); + for(int64_t b = 0; b < diff2NumBlocks; ++b) + { + file_BlockIndexEntry e; + if(!rDiff2b.ReadFullBuffer(&e, sizeof(e), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Where's the block? + int64_t blockEn = box_ntoh64(e.mEncodedSize); + + // If it's not in this file, it needs modification... + if(blockEn <= 0) + { + int64_t blockIndex = 0 - blockEn; + // In another file. Need to translate this against the other diff + if(diff1BlockStartPositions[blockIndex] > 0) + { + // Block is in the first diff file, stick in size + int nb = blockIndex + 1; + while(diff1BlockStartPositions[nb] <= 0) + { + // This is safe, because the last entry will terminate it properly! + ++nb; + ASSERT(nb <= diff1NumBlocks); + } + int64_t size = diff1BlockStartPositions[nb] - diff1BlockStartPositions[blockIndex]; + e.mEncodedSize = box_hton64(size); + } + else + { + // Block in the original file, use translated value + e.mEncodedSize = box_hton64(diff1BlockStartPositions[blockIndex]); + } + } + + // Write entry + rOut.Write(&e, sizeof(e)); + } + } + catch(...) + { + // clean up + ::free(diff1BlockStartPositions); + if(buffer != 0) + { + ::free(buffer); + } + throw; + } + + // Clean up allocated memory + ::free(diff1BlockStartPositions); + if(buffer != 0) + { + ::free(buffer); + } +} + diff --git a/lib/backupstore/BackupStoreFileCmbIdx.cpp b/lib/backupstore/BackupStoreFileCmbIdx.cpp new file mode 100644 index 00000000..c8bcc3b9 --- /dev/null +++ b/lib/backupstore/BackupStoreFileCmbIdx.cpp @@ -0,0 +1,324 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileCmbIdx.cpp +// Purpose: Combine indicies of a delta file and the file it's a diff from. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include <new> +#include <string.h> + +#include "BackupStoreFile.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreObjectMagic.h" +#include "BackupStoreException.h" +#include "BackupStoreConstants.h" +#include "BackupStoreFilename.h" + +#include "MemLeakFindOn.h" + +// Hide from outside world +namespace +{ + +class BSFCombinedIndexStream : public IOStream +{ +public: + BSFCombinedIndexStream(IOStream *pDiff); + ~BSFCombinedIndexStream(); + + virtual int Read(void *pBuffer, int NBytes, int Timeout = IOStream::TimeOutInfinite); + virtual void Write(const void *pBuffer, int NBytes); + virtual bool StreamDataLeft(); + virtual bool StreamClosed(); + virtual void Initialise(IOStream &rFrom); + +private: + IOStream *mpDiff; + bool mIsInitialised; + bool mHeaderWritten; + file_BlockIndexHeader mHeader; + int64_t mNumEntriesToGo; + int64_t mNumEntriesInFromFile; + int64_t *mFromBlockSizes; // NOTE: Entries in network byte order +}; + +}; + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::CombineFileIndices(IOStream &, IOStream &, bool) +// Purpose: Given a diff file and the file it's a diff from, return a stream from which +// can be read the index of the combined file, without actually combining them. +// The stream of the diff must have a lifetime greater than or equal to the +// lifetime of the returned stream object. The full "from" file stream +// only needs to exist during the actual function call. +// If you pass in dodgy files which aren't related, then you will either +// get an error or bad results. So don't do that. +// If DiffIsIndexOnly is true, then rDiff is assumed to be a stream positioned +// at the beginning of the block index. Similarly for FromIsIndexOnly. +// WARNING: Reads of the returned streams with buffer sizes less than 64 bytes +// will not return any data. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +std::auto_ptr<IOStream> BackupStoreFile::CombineFileIndices(IOStream &rDiff, IOStream &rFrom, bool DiffIsIndexOnly, bool FromIsIndexOnly) +{ + // Reposition file pointers? + if(!DiffIsIndexOnly) + { + MoveStreamPositionToBlockIndex(rDiff); + } + if(!FromIsIndexOnly) + { + MoveStreamPositionToBlockIndex(rFrom); + } + + // Create object + std::auto_ptr<IOStream> stream(new BSFCombinedIndexStream(&rDiff)); + + // Initialise it + ((BSFCombinedIndexStream *)stream.get())->Initialise(rFrom); + + // And return the stream + return stream; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::BSFCombinedIndexStream() +// Purpose: Private class. Constructor. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +BSFCombinedIndexStream::BSFCombinedIndexStream(IOStream *pDiff) + : mpDiff(pDiff), + mIsInitialised(false), + mHeaderWritten(false), + mNumEntriesToGo(0), + mNumEntriesInFromFile(0), + mFromBlockSizes(0) +{ + ASSERT(mpDiff != 0); +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::~BSFCombinedIndexStream() +// Purpose: Private class. Destructor. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +BSFCombinedIndexStream::~BSFCombinedIndexStream() +{ + if(mFromBlockSizes != 0) + { + ::free(mFromBlockSizes); + mFromBlockSizes = 0; + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::Initialise(IOStream &) +// Purpose: Private class. Initalise from the streams (diff passed in constructor). +// Both streams must have file pointer positioned at the block index. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +void BSFCombinedIndexStream::Initialise(IOStream &rFrom) +{ + // Paranoia is good. + if(mIsInitialised) + { + THROW_EXCEPTION(BackupStoreException, Internal) + } + + // Look at the diff file: Read in the header + if(!mpDiff->ReadFullBuffer(&mHeader, sizeof(mHeader), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + if(ntohl(mHeader.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Read relevant data. + mNumEntriesToGo = box_ntoh64(mHeader.mNumBlocks); + + // Adjust a bit to reflect the fact it's no longer a diff + mHeader.mOtherFileID = box_hton64(0); + + // Now look at the from file: Read header + file_BlockIndexHeader fromHdr; + if(!rFrom.ReadFullBuffer(&fromHdr, sizeof(fromHdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + if(ntohl(fromHdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Then... allocate memory for the list of sizes + mNumEntriesInFromFile = box_ntoh64(fromHdr.mNumBlocks); + mFromBlockSizes = (int64_t*)::malloc(mNumEntriesInFromFile * sizeof(int64_t)); + if(mFromBlockSizes == 0) + { + throw std::bad_alloc(); + } + + // And read them all in! + for(int64_t b = 0; b < mNumEntriesInFromFile; ++b) + { + file_BlockIndexEntry e; + if(!rFrom.ReadFullBuffer(&e, sizeof(e), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Check that the from file isn't a delta in itself + if(box_ntoh64(e.mEncodedSize) <= 0) + { + THROW_EXCEPTION(BackupStoreException, OnCombineFromFileIsIncomplete) + } + + // Store size (in network byte order) + mFromBlockSizes[b] = e.mEncodedSize; + } + + // Flag as initialised + mIsInitialised = true; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::Read(void *, int, int) +// Purpose: Private class. As interface. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +int BSFCombinedIndexStream::Read(void *pBuffer, int NBytes, int Timeout) +{ + // Paranoia is good. + if(!mIsInitialised || mFromBlockSizes == 0 || mpDiff == 0) + { + THROW_EXCEPTION(BackupStoreException, Internal) + } + + int written = 0; + + // Header output yet? + if(!mHeaderWritten) + { + // Enough space? + if(NBytes < (int)sizeof(mHeader)) return 0; + + // Copy in + ::memcpy(pBuffer, &mHeader, sizeof(mHeader)); + NBytes -= sizeof(mHeader); + written += sizeof(mHeader); + + // Flag it's done + mHeaderWritten = true; + } + + // How many entries can be written? + int entriesToWrite = NBytes / sizeof(file_BlockIndexEntry); + if(entriesToWrite > mNumEntriesToGo) + { + entriesToWrite = mNumEntriesToGo; + } + + // Setup ready to go + file_BlockIndexEntry *poutput = (file_BlockIndexEntry*)(((uint8_t*)pBuffer) + written); + + // Write entries + for(int b = 0; b < entriesToWrite; ++b) + { + if(!mpDiff->ReadFullBuffer(&(poutput[b]), sizeof(file_BlockIndexEntry), 0)) + { + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Does this need adjusting? + int s = box_ntoh64(poutput[b].mEncodedSize); + if(s <= 0) + { + // A reference to a block in the from file + int block = 0 - s; + ASSERT(block >= 0); + if(block >= mNumEntriesInFromFile) + { + // That's not good, the block doesn't exist + THROW_EXCEPTION(BackupStoreException, OnCombineFromFileIsIncomplete) + } + + // Adjust the entry in the buffer + poutput[b].mEncodedSize = mFromBlockSizes[block]; // stored in network byte order, no translation necessary + } + } + + // Update written count + written += entriesToWrite * sizeof(file_BlockIndexEntry); + mNumEntriesToGo -= entriesToWrite; + + return written; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::Write(const void *, int) +// Purpose: Private class. As interface. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +void BSFCombinedIndexStream::Write(const void *pBuffer, int NBytes) +{ + THROW_EXCEPTION(BackupStoreException, StreamDoesntHaveRequiredFeatures) +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::StreamDataLeft() +// Purpose: Private class. As interface +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +bool BSFCombinedIndexStream::StreamDataLeft() +{ + return (!mHeaderWritten) || (mNumEntriesToGo > 0); +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BSFCombinedIndexStream::StreamClosed() +// Purpose: Private class. As interface. +// Created: 8/7/04 +// +// -------------------------------------------------------------------------- +bool BSFCombinedIndexStream::StreamClosed() +{ + return true; // doesn't do writing +} + diff --git a/lib/backupstore/BackupStoreFileCombine.cpp b/lib/backupstore/BackupStoreFileCombine.cpp new file mode 100644 index 00000000..baa331f0 --- /dev/null +++ b/lib/backupstore/BackupStoreFileCombine.cpp @@ -0,0 +1,410 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileCombine.cpp +// Purpose: File combining for BackupStoreFile +// Created: 16/1/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include <new> + +#include "BackupStoreFile.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreObjectMagic.h" +#include "BackupStoreException.h" +#include "BackupStoreConstants.h" +#include "BackupStoreFilename.h" +#include "FileStream.h" + +#include "MemLeakFindOn.h" + +typedef struct +{ + int64_t mFilePosition; +} FromIndexEntry; + +static void LoadFromIndex(IOStream &rFrom, FromIndexEntry *pIndex, int64_t NumEntries); +static void CopyData(IOStream &rDiffData, IOStream &rDiffIndex, int64_t DiffNumBlocks, IOStream &rFrom, FromIndexEntry *pFromIndex, int64_t FromNumBlocks, IOStream &rOut); +static void WriteNewIndex(IOStream &rDiff, int64_t DiffNumBlocks, FromIndexEntry *pFromIndex, int64_t FromNumBlocks, IOStream &rOut); + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::CombineFile(IOStream &, IOStream &, IOStream &) +// Purpose: Where rDiff is a store file which is incomplete as a result of a +// diffing operation, rFrom is the file it is diffed from, and +// rOut is the stream in which to place the result, the old file +// and new file are combined into a file containing all the data. +// rDiff2 is the same file as rDiff, opened again to get two +// independent streams to the same file. +// Created: 16/1/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::CombineFile(IOStream &rDiff, IOStream &rDiff2, IOStream &rFrom, IOStream &rOut) +{ + // Read and copy the header. + file_StreamFormat hdr; + if(!rDiff.ReadFullBuffer(&hdr, sizeof(hdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Copy + rOut.Write(&hdr, sizeof(hdr)); + // Copy over filename and attributes + // BLOCK + { + BackupStoreFilename filename; + filename.ReadFromStream(rDiff, IOStream::TimeOutInfinite); + filename.WriteToStream(rOut); + StreamableMemBlock attr; + attr.ReadFromStream(rDiff, IOStream::TimeOutInfinite); + attr.WriteToStream(rOut); + } + + // Read the header for the From file + file_StreamFormat fromHdr; + if(!rFrom.ReadFullBuffer(&fromHdr, sizeof(fromHdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(fromHdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Skip over the filename and attributes of the From file + // BLOCK + { + BackupStoreFilename filename2; + filename2.ReadFromStream(rFrom, IOStream::TimeOutInfinite); + int32_t size_s; + if(!rFrom.ReadFullBuffer(&size_s, sizeof(size_s), 0 /* not interested in bytes read if this fails */)) + { + THROW_EXCEPTION(CommonException, StreamableMemBlockIncompleteRead) + } + int size = ntohl(size_s); + // Skip forward the size + rFrom.Seek(size, IOStream::SeekType_Relative); + } + + // Allocate memory for the block index of the From file + int64_t fromNumBlocks = box_ntoh64(fromHdr.mNumBlocks); + // NOTE: An extra entry is required so that the length of the last block can be calculated + FromIndexEntry *pFromIndex = (FromIndexEntry*)::malloc((fromNumBlocks+1) * sizeof(FromIndexEntry)); + if(pFromIndex == 0) + { + throw std::bad_alloc(); + } + + try + { + // Load the index from the From file, calculating the offsets in the + // file as we go along, and enforce that everything should be present. + LoadFromIndex(rFrom, pFromIndex, fromNumBlocks); + + // Read in the block index of the Diff file in small chunks, and output data + // for each block, either from this file, or the other file. + int64_t diffNumBlocks = box_ntoh64(hdr.mNumBlocks); + CopyData(rDiff /* positioned at start of data */, rDiff2, diffNumBlocks, rFrom, pFromIndex, fromNumBlocks, rOut); + + // Read in the block index again, and output the new block index, simply + // filling in the sizes of blocks from the old file. + WriteNewIndex(rDiff, diffNumBlocks, pFromIndex, fromNumBlocks, rOut); + + // Free buffers + ::free(pFromIndex); + pFromIndex = 0; + } + catch(...) + { + // Clean up + if(pFromIndex != 0) + { + ::free(pFromIndex); + pFromIndex = 0; + } + throw; + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static LoadFromIndex(IOStream &, FromIndexEntry *, int64_t) +// Purpose: Static. Load the index from the From file +// Created: 16/1/04 +// +// -------------------------------------------------------------------------- +static void LoadFromIndex(IOStream &rFrom, FromIndexEntry *pIndex, int64_t NumEntries) +{ + ASSERT(pIndex != 0); + ASSERT(NumEntries >= 0); + + // Get the starting point in the file + int64_t filePos = rFrom.GetPosition(); + + // Jump to the end of the file to read the index + rFrom.Seek(0 - ((NumEntries * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)), IOStream::SeekType_End); + + // Read block index header + file_BlockIndexHeader blkhdr; + if(!rFrom.ReadFullBuffer(&blkhdr, sizeof(blkhdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(blkhdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1 + || (int64_t)box_ntoh64(blkhdr.mNumBlocks) != NumEntries) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // And then the block entries + for(int64_t b = 0; b < NumEntries; ++b) + { + // Read + file_BlockIndexEntry en; + if(!rFrom.ReadFullBuffer(&en, sizeof(en), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + + // Add to list + pIndex[b].mFilePosition = filePos; + + // Encoded size? + int64_t encodedSize = box_ntoh64(en.mEncodedSize); + // Check that the block is actually there + if(encodedSize <= 0) + { + THROW_EXCEPTION(BackupStoreException, OnCombineFromFileIsIncomplete) + } + + // Move file pointer on + filePos += encodedSize; + } + + // Store the position in the very last entry, so the size of the last entry can be calculated + pIndex[NumEntries].mFilePosition = filePos; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static CopyData(IOStream &, IOStream &, int64_t, IOStream &, FromIndexEntry *, int64_t, IOStream &) +// Purpose: Static. Copy data from the Diff and From file to the out file. +// rDiffData is at beginning of data. +// rDiffIndex at any position. +// rFrom is at any position. +// rOut is after the header, ready for data +// Created: 16/1/04 +// +// -------------------------------------------------------------------------- +static void CopyData(IOStream &rDiffData, IOStream &rDiffIndex, int64_t DiffNumBlocks, + IOStream &rFrom, FromIndexEntry *pFromIndex, int64_t FromNumBlocks, IOStream &rOut) +{ + // Jump to the end of the diff file to read the index + rDiffIndex.Seek(0 - ((DiffNumBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)), IOStream::SeekType_End); + + // Read block index header + file_BlockIndexHeader diffBlkhdr; + if(!rDiffIndex.ReadFullBuffer(&diffBlkhdr, sizeof(diffBlkhdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(diffBlkhdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1 + || (int64_t)box_ntoh64(diffBlkhdr.mNumBlocks) != DiffNumBlocks) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Record where the From file is + int64_t fromPos = rFrom.GetPosition(); + + // Buffer data + void *buffer = 0; + int bufferSize = 0; + + try + { + // Read the blocks in! + for(int64_t b = 0; b < DiffNumBlocks; ++b) + { + // Read + file_BlockIndexEntry en; + if(!rDiffIndex.ReadFullBuffer(&en, sizeof(en), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + + // What's the size value stored in the entry + int64_t encodedSize = box_ntoh64(en.mEncodedSize); + + // How much data will be read? + int32_t blockSize = 0; + if(encodedSize > 0) + { + // The block is actually in the diff file + blockSize = encodedSize; + } + else + { + // It's in the from file. First, check to see if it's valid + int64_t blockIdx = (0 - encodedSize); + if(blockIdx > FromNumBlocks) + { + // References a block which doesn't actually exist + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Calculate size. This operation is safe because of the extra entry at the end + blockSize = pFromIndex[blockIdx + 1].mFilePosition - pFromIndex[blockIdx].mFilePosition; + } + ASSERT(blockSize > 0); + + // Make sure there's memory available to copy this + if(bufferSize < blockSize || buffer == 0) + { + // Free old block + if(buffer != 0) + { + ::free(buffer); + buffer = 0; + bufferSize = 0; + } + // Allocate new block + buffer = ::malloc(blockSize); + if(buffer == 0) + { + throw std::bad_alloc(); + } + bufferSize = blockSize; + } + ASSERT(bufferSize >= blockSize); + + // Load in data from one of the files + if(encodedSize > 0) + { + // Load from diff file + if(!rDiffData.ReadFullBuffer(buffer, blockSize, 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + } + else + { + // Locate and read the data from the from file + int64_t blockIdx = (0 - encodedSize); + // Seek if necessary + if(fromPos != pFromIndex[blockIdx].mFilePosition) + { + rFrom.Seek(pFromIndex[blockIdx].mFilePosition, IOStream::SeekType_Absolute); + fromPos = pFromIndex[blockIdx].mFilePosition; + } + // Read + if(!rFrom.ReadFullBuffer(buffer, blockSize, 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + + // Update fromPos to current position + fromPos += blockSize; + } + + // Write data to out file + rOut.Write(buffer, blockSize); + } + + // Free buffer, if allocated + if(buffer != 0) + { + ::free(buffer); + buffer = 0; + } + } + catch(...) + { + if(buffer != 0) + { + ::free(buffer); + buffer = 0; + } + throw; + } +} + + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static WriteNewIndex(IOStream &, int64_t, FromIndexEntry *, int64_t, IOStream &) +// Purpose: Write the index to the out file, just copying from the diff file and +// adjusting the entries. +// Created: 16/1/04 +// +// -------------------------------------------------------------------------- +static void WriteNewIndex(IOStream &rDiff, int64_t DiffNumBlocks, FromIndexEntry *pFromIndex, int64_t FromNumBlocks, IOStream &rOut) +{ + // Jump to the end of the diff file to read the index + rDiff.Seek(0 - ((DiffNumBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)), IOStream::SeekType_End); + + // Read block index header + file_BlockIndexHeader diffBlkhdr; + if(!rDiff.ReadFullBuffer(&diffBlkhdr, sizeof(diffBlkhdr), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + if(ntohl(diffBlkhdr.mMagicValue) != OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1 + || (int64_t)box_ntoh64(diffBlkhdr.mNumBlocks) != DiffNumBlocks) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Write it out with a blanked out other file ID + diffBlkhdr.mOtherFileID = box_hton64(0); + rOut.Write(&diffBlkhdr, sizeof(diffBlkhdr)); + + // Rewrite the index + for(int64_t b = 0; b < DiffNumBlocks; ++b) + { + file_BlockIndexEntry en; + if(!rDiff.ReadFullBuffer(&en, sizeof(en), 0)) + { + THROW_EXCEPTION(BackupStoreException, FailedToReadBlockOnCombine) + } + + // What's the size value stored in the entry + int64_t encodedSize = box_ntoh64(en.mEncodedSize); + + // Need to adjust it? + if(encodedSize <= 0) + { + // This actually refers to a block in the from file. So rewrite this. + int64_t blockIdx = (0 - encodedSize); + if(blockIdx > FromNumBlocks) + { + // References a block which doesn't actually exist + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + // Calculate size. This operation is safe because of the extra entry at the end + int32_t blockSize = pFromIndex[blockIdx + 1].mFilePosition - pFromIndex[blockIdx].mFilePosition; + // Then replace entry + en.mEncodedSize = box_hton64(((uint64_t)blockSize)); + } + + // Write entry + rOut.Write(&en, sizeof(en)); + } +} + + + + + diff --git a/lib/backupstore/BackupStoreFileDiff.cpp b/lib/backupstore/BackupStoreFileDiff.cpp new file mode 100644 index 00000000..5705c3aa --- /dev/null +++ b/lib/backupstore/BackupStoreFileDiff.cpp @@ -0,0 +1,1046 @@ +// -------------------------------------------------------------------------- +// +// File +// Name: BackupStoreFileDiff.cpp +// Purpose: Functions relating to diffing BackupStoreFiles +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- + +#include "Box.h" + +#include <string.h> + +#include <new> +#include <map> + +#ifdef HAVE_TIME_H + #include <time.h> +#elif HAVE_SYS_TIME_H + #include <sys/time.h> +#endif + +#include "BackupStoreConstants.h" +#include "BackupStoreException.h" +#include "BackupStoreFile.h" +#include "BackupStoreFileCryptVar.h" +#include "BackupStoreFileEncodeStream.h" +#include "BackupStoreFileWire.h" +#include "BackupStoreObjectMagic.h" +#include "CommonException.h" +#include "FileStream.h" +#include "MD5Digest.h" +#include "RollingChecksum.h" +#include "Timer.h" + +#include "MemLeakFindOn.h" + +#include <cstring> + +using namespace BackupStoreFileCryptVar; +using namespace BackupStoreFileCreation; + +// By default, don't trace out details of the diff as we go along -- would fill up logs significantly. +// But it's useful for the test. +#ifndef BOX_RELEASE_BUILD + bool BackupStoreFile::TraceDetailsOfDiffProcess = false; +#endif + +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); +static void SearchForMatchingBlocks(IOStream &rFile, + std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, + int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], + DiffTimer *pDiffTimer); +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, +BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks); +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile); + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) +// Purpose: Move the file pointer in this stream to just before the block index. +// Assumes that the stream is at the beginning, seekable, and +// reading from the stream is OK. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) +{ + // Size of file + int64_t fileSize = rStream.BytesLeftToRead(); + + // Get header + file_StreamFormat hdr; + + // Read the header + if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Check magic number + if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 +#endif + ) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Work out where the index is + int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); + int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); + + // Sanity check + if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat))) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Seek to that position + rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); + + // Done. Stream now in right position (as long as the file is formatted correctly) +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) +// Purpose: Similar to EncodeFile, but takes the object ID of the file it's +// diffing from, and the index of the blocks in a stream. It'll then +// calculate which blocks can be reused from that old file. +// The timeout is the timeout value for reading the diff block index. +// If pIsCompletelyDifferent != 0, it will be set to true if the +// the two files are completely different (do not share any block), false otherwise. +// +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff +( + const char *Filename, int64_t ContainerID, + const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, + IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, + int64_t *pModificationTime, bool *pIsCompletelyDifferent) +{ + // Is it a symlink? + { + EMU_STRUCT_STAT st; + if(EMU_LSTAT(Filename, &st) != 0) + { + THROW_EXCEPTION(CommonException, OSFileError) + } + if((st.st_mode & S_IFLNK) == S_IFLNK) + { + // Don't do diffs for symlinks + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + } + + // Load in the blocks + BlocksAvailableEntry *pindex = 0; + int64_t blocksInIndex = 0; + bool canDiffFromThis = false; + LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); + // BOX_TRACE("Diff: Blocks in index: " << blocksInIndex); + + if(!canDiffFromThis) + { + // Don't do diffing... + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = true; + } + return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); + } + + // Pointer to recipe we're going to create + BackupStoreFileEncodeStream::Recipe *precipe = 0; + + try + { + // Find which sizes should be scanned + int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); + + // Flag for reporting to the user + bool completelyDifferent; + + // BLOCK + { + // Search the file to find matching blocks + std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index + int64_t sizeOfInputFile = 0; + // BLOCK + { + FileStream file(Filename); + // Get size of file + sizeOfInputFile = file.BytesLeftToRead(); + // Find all those lovely matching blocks + SearchForMatchingBlocks(file, foundBlocks, pindex, + blocksInIndex, sizesToScan, pDiffTimer); + + // Is it completely different? + completelyDifferent = (foundBlocks.size() == 0); + } + + // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. + precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); + BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions + pindex = 0; // Recipe now has ownership + + // Fill it in + GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); + } + // foundBlocks no longer required + + // Create the stream + std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream); + + // Do the initial setup + ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); + precipe = 0; // Stream has taken ownership of this + + // Tell user about completely different status? + if(pIsCompletelyDifferent != 0) + { + *pIsCompletelyDifferent = completelyDifferent; + } + + // Return the stream for the caller + return stream; + } + catch(...) + { + // cleanup + if(pindex != 0) + { + ::free(pindex); + pindex = 0; + } + if(precipe != 0) + { + delete precipe; + precipe = 0; + } + throw; + } +} + +// -------------------------------------------------------------------------- +// +// Function +// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) +// Purpose: Read in an index, and decrypt, and store in the in memory block format. +// rCanDiffFromThis is set to false if the version of the from file is too old. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) +{ + // Reset + rNumBlocksOut = 0; + rCanDiffFromThis = false; + + // Read header + file_BlockIndexHeader hdr; + if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read header + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + +#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE + // Check against backwards comptaibility stuff + if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) + { + // Won't diff against old version + + // Absorb rest of stream + char buffer[2048]; + while(rBlockIndex.StreamDataLeft()) + { + rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); + } + + // Tell caller + rCanDiffFromThis = false; + return; + } +#endif + + // Check magic + if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Mark as an acceptable diff. + rCanDiffFromThis = true; + + // Get basic information + int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); + uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase); + + //TODO: Verify that these sizes look reasonable + + // Allocate space for the index + BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); + if(pindex == 0) + { + throw std::bad_alloc(); + } + + try + { + for(int64_t b = 0; b < numBlocks; ++b) + { + // Read an entry from the stream + file_BlockIndexEntry entry; + if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) + { + // Couldn't read entry + THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) + } + + // Calculate IV for this entry + uint64_t iv = entryIVBase; + iv += b; + // Network byte order + iv = box_hton64(iv); + sBlowfishDecryptBlockEntry.SetIV(&iv); + + // Decrypt the encrypted section + file_BlockIndexEntryEnc entryEnc; + int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), + entry.mEnEnc, sizeof(entry.mEnEnc)); + if(sectionSize != sizeof(entryEnc)) + { + THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) + } + + // Check that we're not trying to diff against a file which references blocks from another file + if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0) + { + THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) + } + + // Store all the required information + pindex[b].mpNextInHashList = 0; // hash list not set up yet + pindex[b].mSize = ntohl(entryEnc.mSize); + pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); + ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); + } + + // Store index pointer for called + ASSERT(ppIndex != 0); + *ppIndex = pindex; + + // Store number of blocks for caller + rNumBlocksOut = numBlocks; + + } + catch(...) + { + // clean up and send the exception along its way + ::free(pindex); + throw; + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Finds the most commonly used block sizes in the index +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +{ + // Array for lengths + int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; + + // Set arrays to lots of zeros (= unused entries) + for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) + { + Sizes[l] = 0; + sizeCounts[l] = 0; + } + + // Array for collecting sizes + std::map<int32_t, int64_t> foundSizes; + + // Run through blocks and make a count of the entries + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only if the block size is bigger than the minimum size we'll scan for + if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) + { + // Find entry? + std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize)); + if(f != foundSizes.end()) + { + // Increment existing entry + foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; + } + else + { + // New entry + foundSizes[pIndex[b].mSize] = 1; + } + } + } + + // Make the block sizes + for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) + { + // Find the position of the size in the array + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + // Instead of sorting on the raw count of blocks, + // take the file area covered by this block size. + if(i->second * i->first > sizeCounts[t] * Sizes[t]) + { + // Then this size belong before this entry -- shuffle them up + for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) + { + Sizes[s] = Sizes[s-1]; + sizeCounts[s] = sizeCounts[s-1]; + } + + // Insert this size + Sizes[t] = i->first; + sizeCounts[t] = i->second; + + // Shouldn't do any more searching + break; + } + } + } + + // trace the size table in debug builds +#ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) + { + BOX_TRACE("Diff block size " << t << ": " << + Sizes[t] << " (count = " << + sizeCounts[t] << ")"); + } + } +#endif +} + + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) +// Purpose: Find the matching blocks within the file. +// Created: 12/1/04 +// +// -------------------------------------------------------------------------- +static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks, + BlocksAvailableEntry *pIndex, int64_t NumBlocks, + int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer) +{ + Timer maximumDiffingTime(0, "MaximumDiffingTime"); + + if(pDiffTimer && pDiffTimer->IsManaged()) + { + maximumDiffingTime = Timer(pDiffTimer->GetMaximumDiffingTime(), + "MaximumDiffingTime"); + } + + std::map<int64_t, int32_t> goodnessOfFit; + + // Allocate the hash lookup table + BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); + + // Choose a size for the buffer, just a little bit more than the maximum block size + int32_t bufSize = Sizes[0]; + for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) + { + if(Sizes[z] > bufSize) bufSize = Sizes[z]; + } + bufSize += 4; + ASSERT(bufSize > Sizes[0]); + ASSERT(bufSize > 0); + if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) + { + THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) + } + + // TODO: Because we read in the file a scanned block size at a time, + // it is likely to be inefficient. Probably will be much better to + // calculate checksums for all block sizes in a single pass. + + // Allocate the buffers. + uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); + uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); + try + { + // Check buffer allocation + if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) + { + // If a buffer got allocated, it will be cleaned up in the catch block + throw std::bad_alloc(); + } + + // Flag to abort the run, if too many blocks are found -- avoid using + // huge amounts of processor time when files contain many similar blocks. + bool abortSearch = false; + + // Search for each block size in turn + // NOTE: Do the smallest size first, so that the scheme for adding + // entries in the found list works as expected and replaces smallers block + // with larger blocks when it finds matches at the same offset in the file. + for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) + { + ASSERT(Sizes[s] <= bufSize); + BOX_TRACE("Diff pass " << s << ", for block size " << + Sizes[s]); + + // Check we haven't finished + if(Sizes[s] == 0) + { + // empty entry, try next size + continue; + } + + // Set up the hash table entries + SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); + + // Shift file position to beginning + rFile.Seek(0, IOStream::SeekType_Absolute); + + // Read first block + if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) + { + // Size of file too short to match -- do next size + continue; + } + + // Setup block pointers + uint8_t *beginnings = pbuffer0; + uint8_t *endings = pbuffer1; + int offset = 0; + + // Calculate the first checksum, ready for rolling + RollingChecksum rolling(beginnings, Sizes[s]); + + // Then roll, until the file is exhausted + int64_t fileBlockNumber = 0; + int64_t fileOffset = 0; + int rollOverInitialBytes = 0; + while(true) + { + if(maximumDiffingTime.HasExpired()) + { + ASSERT(pDiffTimer != NULL); + BOX_INFO("MaximumDiffingTime reached - " + "suspending file diff"); + abortSearch = true; + break; + } + + if(pDiffTimer) + { + pDiffTimer->DoKeepAlive(); + } + + // Load in another block of data, and record how big it is + int bytesInEndings = rFile.Read(endings, Sizes[s]); + int tmp; + + // Skip any bytes from a previous matched block + if(rollOverInitialBytes > 0 && offset < bytesInEndings) + { + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + if(goodnessOfFit.count(fileOffset)) + { + tmp = goodnessOfFit[fileOffset]; + } + else + { + tmp = 0; + } + + if(tmp >= Sizes[s]) + { + // Skip over bigger ready-matched blocks completely + rollOverInitialBytes = tmp; + int spaceLeft = bytesInEndings - offset; + int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + rollOverInitialBytes -= thisRoll; + + if(rollOverInitialBytes) + { + goto refresh; + } + } + + while(offset < bytesInEndings) + { + // Is current checksum in hash list? + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) + { + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + BOX_TRACE("Found block match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); + goodnessOfFit[fileOffset] = Sizes[s]; + + // Block matched, roll the checksum forward to the next block without doing + // any more comparisons, because these are pointless (as any more matches will be ignored when + // the recipe is generated) and just take up valuable processor time. Edge cases are + // especially nasty, using huge amounts of time and memory. + int skip = Sizes[s]; + if(offset < bytesInEndings && skip > 0) + { + int spaceLeft = bytesInEndings - offset; + int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; + + rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); + + offset += thisRoll; + fileOffset += thisRoll; + skip -= thisRoll; + } + // Not all the bytes necessary will have been skipped, so get them + // skipped after the next block is loaded. + rollOverInitialBytes = skip; + + // End this loop, so the final byte isn't used again + break; + } + else + { + BOX_TRACE("False alarm match for " << hash << " of " << Sizes[s] << " bytes at offset " << fileOffset); + } + + int64_t NumBlocksFound = static_cast<int64_t>( + rFoundBlocks.size()); + int64_t MaxBlocksFound = NumBlocks * + BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE; + + if(NumBlocksFound > MaxBlocksFound) + { + abortSearch = true; + break; + } + } + + // Roll checksum forward + rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); + + // Increment offsets + ++offset; + ++fileOffset; + } + + if(abortSearch) break; + + refresh: + // Finished? + if(bytesInEndings != Sizes[s]) + { + // No more data in file -- check the final block + // (Do a copy and paste of 5 lines of code instead of introducing a comparison for + // each byte of the file) + uint16_t hash = rolling.GetComponentForHashing(); + if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) + { + if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) + { + goodnessOfFit[fileOffset] = Sizes[s]; + } + } + + // finish + break; + } + + // Switch buffers, reset offset + beginnings = endings; + endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer + offset = 0; + + // And count the blocks which have been done + ++fileBlockNumber; + } + + if(abortSearch) break; + } + + // Free buffers and hash table + ::free(pbuffer1); + pbuffer1 = 0; + ::free(pbuffer0); + pbuffer0 = 0; + ::free(phashTable); + phashTable = 0; + } + catch(...) + { + // Cleanup and throw + if(pbuffer1 != 0) ::free(pbuffer1); + if(pbuffer0 != 0) ::free(pbuffer0); + if(phashTable != 0) ::free(phashTable); + throw; + } + +#ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + // Trace out the found blocks in debug mode + BOX_TRACE("Diff: list of found blocks"); + BOX_TRACE("======== ======== ======== ========"); + BOX_TRACE(" Offset BlkIdx Size Movement"); + for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) + { + int64_t orgLoc = 0; + for(int64_t b = 0; b < i->second; ++b) + { + orgLoc += pIndex[b].mSize; + } + BOX_TRACE(std::setw(8) << i->first << " " << + std::setw(8) << i->second << " " << + std::setw(8) << pIndex[i->second].mSize << + " " << + std::setw(8) << (i->first - orgLoc)); + } + BOX_TRACE("======== ======== ======== ========"); + } +#endif +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) +// Purpose: Set up the hash table ready for a scan +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) +{ + // Set all entries in the hash table to zero + ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); + + // Scan through the blocks, building the hash table + for(int64_t b = 0; b < NumBlocks; ++b) + { + // Only look at the required block size + if(pIndex[b].mSize == BlockSize) + { + // Get the value under which to hash this entry + uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); + + // Already present in table? + if(pHashTable[hash] != 0) + { + //BOX_TRACE("Another hash entry for " << hash << " found"); + // Yes -- need to set the pointer in this entry to the current entry to build the linked list + pIndex[b].mpNextInHashList = pHashTable[hash]; + } + + // Put a pointer to this entry in the hash table + pHashTable[hash] = pIndex + b; + } + } +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static bool SecondStageMatch(xxx) +// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. +// Created: 14/1/04 +// +// -------------------------------------------------------------------------- +static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, + int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks) +{ + // Check parameters + ASSERT(pBeginnings != 0); + ASSERT(pEndings != 0); + ASSERT(Offset >= 0); + ASSERT(BlockSize > 0); + ASSERT(pFirstInHashList != 0); + ASSERT(pIndex != 0); + +#ifndef BOX_RELEASE_BUILD + uint16_t DEBUG_Hash = fastSum.GetComponentForHashing(); +#endif + uint32_t Checksum = fastSum.GetChecksum(); + + // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. + BlocksAvailableEntry *scan = pFirstInHashList; + bool found=false; + while(scan != 0) + { + if(scan->mWeakChecksum == Checksum) + { + found = true; + break; + } + scan = scan->mpNextInHashList; + } + if(!found) + { + return false; + } + + // Calculate the strong MD5 digest for this block + MD5Digest strong; + // Add the data from the beginnings + strong.Add(pBeginnings + Offset, BlockSize - Offset); + // Add any data from the endings + if(Offset > 0) + { + strong.Add(pEndings, Offset); + } + strong.Finish(); + + // Then go through the entries in the hash list, comparing with the strong digest calculated + scan = pFirstInHashList; + //BOX_TRACE("second stage match"); + while(scan != 0) + { + //BOX_TRACE("scan size " << scan->mSize << + // ", block size " << BlockSize << + // ", hash " << Hash); + ASSERT(scan->mSize == BlockSize); + ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash); + + // Compare? + if(strong.DigestMatches(scan->mStrongChecksum)) + { + //BOX_TRACE("Match!\n"); + // Found! Add to list of found blocks... + int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; + int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory + + // We do NOT search for smallest blocks first, as this code originally assumed. + // To prevent this from potentially overwriting a better match, the caller must determine + // the relative "goodness" of any existing match and this one, and avoid the call if it + // could be detrimental. + rFoundBlocks[fileOffset] = blockIndex; + + // No point in searching further, report success + return true; + } + + // Next + scan = scan->mpNextInHashList; + } + + // Not matched + return false; +} + + +// -------------------------------------------------------------------------- +// +// Function +// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &) +// Purpose: Fills in the recipe from the found block list +// Created: 15/1/04 +// +// -------------------------------------------------------------------------- +static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, + int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile) +{ + // NOTE: This function could be a lot more sophisiticated. For example, if + // a small block overlaps a big block like this + // **** + // ******************************* + // then the small block will be used, not the big one. But it'd be better to + // just ignore the small block and keep the big one. However, some stats should + // be gathered about real world files before writing complex code which might + // go wrong. + + // Initialise a blank instruction + BackupStoreFileEncodeStream::RecipeInstruction instruction; + #define RESET_INSTRUCTION \ + instruction.mSpaceBefore = 0; \ + instruction.mBlocks = 0; \ + instruction.mpStartBlock = 0; + RESET_INSTRUCTION + + // First, a special case for when there are no found blocks + if(rFoundBlocks.size() == 0) + { + // No blocks, just a load of space + instruction.mSpaceBefore = SizeOfInputFile; + rRecipe.push_back(instruction); + + #ifndef BOX_RELEASE_BUILD + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + BOX_TRACE("Diff: Default recipe generated, " << + SizeOfInputFile << " bytes of file"); + } + #endif + + // Don't do anything + return; + } + + // Current location + int64_t loc = 0; + + // Then iterate through the list, generating the recipe + std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); + ASSERT(i != rFoundBlocks.end()); // check logic + + // Counting for debug tracing +#ifndef BOX_RELEASE_BUILD + int64_t debug_NewBytesFound = 0; + int64_t debug_OldBlocksUsed = 0; +#endif + + for(; i != rFoundBlocks.end(); ++i) + { + // Remember... map is (position in file) -> (index of block in pIndex) + + if(i->first < loc) + { + // This block overlaps the last one + continue; + } + else if(i->first > loc) + { + // There's a gap between the end of the last thing and this block. + // If there's an instruction waiting, push it onto the list + if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) + { + rRecipe.push_back(instruction); + } + // Start a new instruction, with the gap ready + RESET_INSTRUCTION + instruction.mSpaceBefore = i->first - loc; + // Move location forward to match + loc += instruction.mSpaceBefore; +#ifndef BOX_RELEASE_BUILD + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + } + + // First, does the current instruction need pushing back, because this block is not + // sequential to the last one? + if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) + { + rRecipe.push_back(instruction); + RESET_INSTRUCTION + } + + // Add in this block + if(instruction.mpStartBlock == 0) + { + // This block starts a new instruction + instruction.mpStartBlock = pIndex + i->second; + instruction.mBlocks = 1; + } + else + { + // It continues the previous section of blocks + instruction.mBlocks += 1; + } + +#ifndef BOX_RELEASE_BUILD + debug_OldBlocksUsed++; +#endif + + // Move location forward + loc += pIndex[i->second].mSize; + } + + // Push the last instruction generated + rRecipe.push_back(instruction); + + // Is there any space left at the end which needs sending? + if(loc != SizeOfInputFile) + { + RESET_INSTRUCTION + instruction.mSpaceBefore = SizeOfInputFile - loc; +#ifndef BOX_RELEASE_BUILD + debug_NewBytesFound += instruction.mSpaceBefore; +#endif + rRecipe.push_back(instruction); + } + + // dump out the recipe +#ifndef BOX_RELEASE_BUILD + BOX_TRACE("Diff: " << + debug_NewBytesFound << " new bytes found, " << + debug_OldBlocksUsed << " old blocks used"); + if(BackupStoreFile::TraceDetailsOfDiffProcess) + { + BOX_TRACE("Diff: Recipe generated (size " << rRecipe.size()); + BOX_TRACE("======== ========= ========"); + BOX_TRACE("Space b4 FirstBlk NumBlks"); + { + for(unsigned int e = 0; e < rRecipe.size(); ++e) + { + char b[64]; +#ifdef WIN32 + sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); +#else + sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); +#endif + BOX_TRACE(std::setw(8) << + rRecipe[e].mSpaceBefore << + " " << + ((rRecipe[e].mpStartBlock == 0)?" -":b) << + " " << std::setw(8) << + rRecipe[e].mBlocks); + } + } + BOX_TRACE("======== ========= ========"); + } +#endif +} |