// -------------------------------------------------------------------------- // // File // Name: BackupStoreFileDiff.cpp // Purpose: Functions relating to diffing BackupStoreFiles // Created: 12/1/04 // // -------------------------------------------------------------------------- #include "Box.h" #include #include #include #ifdef WIN32 #include #else #include #endif #include "BackupStoreFile.h" #include "BackupStoreFileWire.h" #include "BackupStoreFileCryptVar.h" #include "BackupStoreObjectMagic.h" #include "BackupStoreException.h" #include "BackupStoreFileEncodeStream.h" #include "BackupStoreConstants.h" #include "FileStream.h" #include "RollingChecksum.h" #include "MD5Digest.h" #include "CommonException.h" #include "MemLeakFindOn.h" using namespace BackupStoreFileCryptVar; using namespace BackupStoreFileCreation; // By default, don't trace out details of the diff as we go along -- would fill up logs significantly. // But it's useful for the test. #ifndef NDEBUG bool BackupStoreFile::TraceDetailsOfDiffProcess = false; #endif static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis); static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SearchForMatchingBlocks(IOStream &rFile, std::map &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]); static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable); static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map &rFoundBlocks); static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map &rFoundBlocks, int64_t SizeOfInputFile); // Avoid running on too long with diffs static int sMaximumDiffTime = 600; // maximum time to spend diffing static bool sDiffTimedOut = false; static bool sSetTimerSignelHandler = false; static void TimerSignalHandler(int signal); static void StartDiffTimer(); // -------------------------------------------------------------------------- // // Function // Name: BackupStoreFile::SetMaximumDiffingTime(int) // Purpose: Sets the maximum time to spend diffing, in seconds. Time is // process virutal time. // Created: 19/3/04 // // -------------------------------------------------------------------------- void BackupStoreFile::SetMaximumDiffingTime(int Seconds) { sMaximumDiffTime = Seconds; TRACE1("Set maximum diffing time to %d seconds\n", Seconds); } // -------------------------------------------------------------------------- // // Function // Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &) // Purpose: Move the file pointer in this stream to just before the block index. // Assumes that the stream is at the beginning, seekable, and // reading from the stream is OK. // Created: 12/1/04 // // -------------------------------------------------------------------------- void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream) { // Size of file int64_t fileSize = rStream.BytesLeftToRead(); // Get header file_StreamFormat hdr; // Read the header if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite)) { // Couldn't read header THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) } // Check magic number if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1 #ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE && ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0 #endif ) { THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) } // Work out where the index is int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader)); // Sanity check if(blockHeaderPosFromEnd > static_cast(fileSize - sizeof(file_StreamFormat))) { THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) } // Seek to that position rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End); // Done. Stream now in right position (as long as the file is formatted correctly) } // -------------------------------------------------------------------------- // // Function // Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *) // Purpose: Similar to EncodeFile, but takes the object ID of the file it's // diffing from, and the index of the blocks in a stream. It'll then // calculate which blocks can be reused from that old file. // The timeout is the timeout value for reading the diff block index. // If pIsCompletelyDifferent != 0, it will be set to true if the // the two files are completely different (do not share any block), false otherwise. // // Created: 12/1/04 // // -------------------------------------------------------------------------- std::auto_ptr BackupStoreFile::EncodeFileDiff(const char *Filename, int64_t ContainerID, const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, IOStream &rDiffFromBlockIndex, int Timeout, int64_t *pModificationTime, bool *pIsCompletelyDifferent) { // Is it a symlink? { struct stat st; if(::lstat(Filename, &st) != 0) { THROW_EXCEPTION(CommonException, OSFileError) } if((st.st_mode & S_IFLNK) == S_IFLNK) { // Don't do diffs for symlinks if(pIsCompletelyDifferent != 0) { *pIsCompletelyDifferent = true; } return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); } } // Load in the blocks BlocksAvailableEntry *pindex = 0; int64_t blocksInIndex = 0; bool canDiffFromThis = false; LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis); //TRACE1("Diff: Blocks in index: %lld\n", blocksInIndex); if(!canDiffFromThis) { // Don't do diffing... if(pIsCompletelyDifferent != 0) { *pIsCompletelyDifferent = true; } return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime); } // Pointer to recipe we're going to create BackupStoreFileEncodeStream::Recipe *precipe = 0; // Start the timeout timer, so that the operation doesn't continue for ever StartDiffTimer(); try { // Find which sizes should be scanned int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; FindMostUsedSizes(pindex, blocksInIndex, sizesToScan); // Flag for reporting to the user bool completelyDifferent; // BLOCK { // Search the file to find matching blocks std::map foundBlocks; // map of offset in file to index in block index int64_t sizeOfInputFile = 0; // BLOCK { FileStream file(Filename); // Get size of file sizeOfInputFile = file.BytesLeftToRead(); // Find all those lovely matching blocks SearchForMatchingBlocks(file, foundBlocks, pindex, blocksInIndex, sizesToScan); // Is it completely different? completelyDifferent = (foundBlocks.size() == 0); } // Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe. precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID)); BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions pindex = 0; // Recipe now has ownership // Fill it in GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile); } // foundBlocks no longer required // Create the stream std::auto_ptr stream(new BackupStoreFileEncodeStream); // Do the initial setup ((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime); precipe = 0; // Stream has taken ownership of this // Tell user about completely different status? if(pIsCompletelyDifferent != 0) { *pIsCompletelyDifferent = completelyDifferent; } // Return the stream for the caller return stream; } catch(...) { // cleanup if(pindex != 0) { ::free(pindex); pindex = 0; } if(precipe != 0) { delete precipe; precipe = 0; } throw; } } // -------------------------------------------------------------------------- // // Function // Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &) // Purpose: Read in an index, and decrypt, and store in the in memory block format. // rCanDiffFromThis is set to false if the version of the from file is too old. // Created: 12/1/04 // // -------------------------------------------------------------------------- static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis) { // Reset rNumBlocksOut = 0; rCanDiffFromThis = false; // Read header file_BlockIndexHeader hdr; if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout)) { // Couldn't read header THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) } #ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE // Check against backwards comptaibility stuff if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0)) { // Won't diff against old version // Absorb rest of stream char buffer[2048]; while(rBlockIndex.StreamDataLeft()) { rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */); } // Tell caller rCanDiffFromThis = false; return; } #endif // Check magic if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1)) { THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) } // Check that we're not trying to diff against a file which references blocks from another file if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0) { THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) } // Mark as an acceptable diff. rCanDiffFromThis = true; // Get basic information int64_t numBlocks = box_ntoh64(hdr.mNumBlocks); uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase); //TODO: Verify that these sizes look reasonable // Allocate space for the index BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks); if(pindex == 0) { throw std::bad_alloc(); } try { for(int64_t b = 0; b < numBlocks; ++b) { // Read an entry from the stream file_BlockIndexEntry entry; if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout)) { // Couldn't read entry THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream) } // Calculate IV for this entry uint64_t iv = entryIVBase; iv += b; // Network byte order iv = box_hton64(iv); sBlowfishDecryptBlockEntry.SetIV(&iv); // Decrypt the encrypted section file_BlockIndexEntryEnc entryEnc; int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc), entry.mEnEnc, sizeof(entry.mEnEnc)); if(sectionSize != sizeof(entryEnc)) { THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength) } // Check that we're not trying to diff against a file which references blocks from another file if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0) { THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile) } // Store all the required information pindex[b].mpNextInHashList = 0; // hash list not set up yet pindex[b].mSize = ntohl(entryEnc.mSize); pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum); ::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum)); } // Store index pointer for called ASSERT(ppIndex != 0); *ppIndex = pindex; // Store number of blocks for caller rNumBlocksOut = numBlocks; } catch(...) { // clean up and send the exception along its way ::free(pindex); throw; } } // -------------------------------------------------------------------------- // // Function // Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) // Purpose: Finds the most commonly used block sizes in the index // Created: 12/1/04 // // -------------------------------------------------------------------------- static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) { // Array for lengths int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]; // Set arrays to lots of zeros (= unused entries) for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l) { Sizes[l] = 0; sizeCounts[l] = 0; } // Array for collecting sizes std::map foundSizes; // Run through blocks and make a count of the entries for(int64_t b = 0; b < NumBlocks; ++b) { // Only if the block size is bigger than the minimum size we'll scan for if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE) { // Find entry? std::map::const_iterator f(foundSizes.find(pIndex[b].mSize)); if(f != foundSizes.end()) { // Increment existing entry foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1; } else { // New entry foundSizes[pIndex[b].mSize] = 1; } } } // Make the block sizes for(std::map::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i) { // Find the position of the size in the array for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) { // Instead of sorting on the raw count of blocks, // take the file area covered by this block size. if(i->second * i->first > sizeCounts[t] * Sizes[t]) { // Then this size belong before this entry -- shuffle them up for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s) { Sizes[s] = Sizes[s-1]; sizeCounts[s] = sizeCounts[s-1]; } // Insert this size Sizes[t] = i->first; sizeCounts[t] = i->second; // Shouldn't do any more searching break; } } } // trace the size table in debug builds #ifndef NDEBUG if(BackupStoreFile::TraceDetailsOfDiffProcess) { for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t) { TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]); } } #endif } // -------------------------------------------------------------------------- // // Function // Name: static SearchForMatchingBlocks(IOStream &, std::map &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) // Purpose: Find the matching blocks within the file. // Created: 12/1/04 // // -------------------------------------------------------------------------- static void SearchForMatchingBlocks(IOStream &rFile, std::map &rFoundBlocks, BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]) { std::map goodnessOfFit; // Allocate the hash lookup table BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024)); // Choose a size for the buffer, just a little bit more than the maximum block size int32_t bufSize = Sizes[0]; for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z) { if(Sizes[z] > bufSize) bufSize = Sizes[z]; } bufSize += 4; ASSERT(bufSize > Sizes[0]); ASSERT(bufSize > 0); if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024)) { THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile) } // TODO: Use buffered file class // Because we read in the file a scanned block size at a time, it is likely to be // inefficient. Probably will be much better to use a buffering IOStream class which // reads data in at the size of the filesystem block size. // Allocate the buffers. uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize); uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize); try { // Check buffer allocation if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0) { // If a buffer got allocated, it will be cleaned up in the catch block throw std::bad_alloc(); } // Flag to abort the run, if too many blocks are found -- avoid using // huge amounts of processor time when files contain many similar blocks. bool abortSearch = false; // Search for each block size in turn // NOTE: Do the smallest size first, so that the scheme for adding // entries in the found list works as expected and replaces smallers block // with larger blocks when it finds matches at the same offset in the file. for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s) { ASSERT(Sizes[s] <= bufSize); //TRACE2("Diff pass %d, for block size %d\n", s, Sizes[s]); // Check we haven't finished if(Sizes[s] == 0) { // empty entry, try next size continue; } // Set up the hash table entries SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable); // Shift file position to beginning rFile.Seek(0, IOStream::SeekType_Absolute); // Read first block if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s]) { // Size of file too short to match -- do next size continue; } // Setup block pointers uint8_t *beginnings = pbuffer0; uint8_t *endings = pbuffer1; int offset = 0; // Calculate the first checksum, ready for rolling RollingChecksum rolling(beginnings, Sizes[s]); // Then roll, until the file is exhausted int64_t fileBlockNumber = 0; int64_t fileOffset = 0; int rollOverInitialBytes = 0; while(true) { // Load in another block of data, and record how big it is int bytesInEndings = rFile.Read(endings, Sizes[s]); int tmp; // Skip any bytes from a previous matched block if(rollOverInitialBytes > 0 && offset < bytesInEndings) { int spaceLeft = bytesInEndings - offset; int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); offset += thisRoll; fileOffset += thisRoll; rollOverInitialBytes -= thisRoll; if(rollOverInitialBytes) { goto refresh; } } if(goodnessOfFit.count(fileOffset)) { tmp = goodnessOfFit[fileOffset]; } else { tmp = 0; } if(tmp >= Sizes[s]) { // Skip over bigger ready-matched blocks completely rollOverInitialBytes = tmp; int spaceLeft = bytesInEndings - offset; int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes; rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); offset += thisRoll; fileOffset += thisRoll; rollOverInitialBytes -= thisRoll; if(rollOverInitialBytes) { goto refresh; } } while(offset < bytesInEndings) { // Is current checksum in hash list? uint16_t hash = rolling.GetComponentForHashing(); if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) { goodnessOfFit[fileOffset] = Sizes[s]; // Block matched, roll the checksum forward to the next block without doing // any more comparisons, because these are pointless (as any more matches will be ignored when // the receipe is generated) and just take up valuable processor time. Edge cases are // especially nasty, using huge amounts of time and memory. int skip = Sizes[s]; if(offset < bytesInEndings && skip > 0) { int spaceLeft = bytesInEndings - offset; int thisRoll = (skip > spaceLeft) ? spaceLeft : skip; rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll); offset += thisRoll; fileOffset += thisRoll; skip -= thisRoll; } // Not all the bytes necessary will have been skipped, so get them // skipped after the next block is loaded. rollOverInitialBytes = skip; // End this loop, so the final byte isn't used again break; } if(static_cast(rFoundBlocks.size()) > (NumBlocks * BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE) || sDiffTimedOut) { abortSearch = true; break; } } // Roll checksum forward rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]); // Increment offsets ++offset; ++fileOffset; } if(abortSearch) break; refresh: // Finished? if(bytesInEndings != Sizes[s]) { // No more data in file -- check the final block // (Do a copy and paste of 5 lines of code instead of introducing a comparison for // each byte of the file) uint16_t hash = rolling.GetComponentForHashing(); if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s])) { if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks)) { goodnessOfFit[fileOffset] = Sizes[s]; } } // finish break; } // Switch buffers, reset offset beginnings = endings; endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer offset = 0; // And count the blocks which have been done ++fileBlockNumber; } if(abortSearch) break; } // Free buffers and hash table ::free(pbuffer1); pbuffer1 = 0; ::free(pbuffer0); pbuffer0 = 0; ::free(phashTable); phashTable = 0; } catch(...) { // Cleanup and throw if(pbuffer1 != 0) ::free(pbuffer1); if(pbuffer0 != 0) ::free(pbuffer0); if(phashTable != 0) ::free(phashTable); throw; } #ifndef NDEBUG if(BackupStoreFile::TraceDetailsOfDiffProcess) { // Trace out the found blocks in debug mode TRACE0("Diff: list of found blocks\n======== ======== ======== ========\n Offset BlkIdx Size Movement\n"); for(std::map::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i) { int64_t orgLoc = 0; for(int64_t b = 0; b < i->second; ++b) { orgLoc += pIndex[b].mSize; } TRACE4("%8lld %8lld %8lld %8lld\n", i->first, i->second, (int64_t)(pIndex[i->second].mSize), i->first - orgLoc); } TRACE0("======== ======== ======== ========\n"); } #endif } // -------------------------------------------------------------------------- // // Function // Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **) // Purpose: Set up the hash table ready for a scan // Created: 14/1/04 // // -------------------------------------------------------------------------- static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable) { // Set all entries in the hash table to zero ::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024))); // Scan through the blocks, building the hash table for(int64_t b = 0; b < NumBlocks; ++b) { // Only look at the required block size if(pIndex[b].mSize == BlockSize) { // Get the value under which to hash this entry uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum); // Already present in table? if(pHashTable[hash] != 0) { //TRACE1("Another hash entry for %d found\n", hash); // Yes -- need to set the pointer in this entry to the current entry to build the linked list pIndex[b].mpNextInHashList = pHashTable[hash]; } // Put a pointer to this entry in the hash table pHashTable[hash] = pIndex + b; } } } // -------------------------------------------------------------------------- // // Function // Name: static bool SecondStageMatch(xxx) // Purpose: When a match in the hash table is found, scan for second stage match using strong checksum. // Created: 14/1/04 // // -------------------------------------------------------------------------- static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map &rFoundBlocks) { // Check parameters ASSERT(pBeginnings != 0); ASSERT(pEndings != 0); ASSERT(Offset >= 0); ASSERT(BlockSize > 0); ASSERT(pFirstInHashList != 0); ASSERT(pIndex != 0); uint16_t Hash = fastSum.GetComponentForHashing(); uint32_t Checksum = fastSum.GetChecksum(); // Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know. BlocksAvailableEntry *scan = pFirstInHashList; bool found=false; while(scan != 0) { if(scan->mWeakChecksum == Checksum) { found = true; break; } scan = scan->mpNextInHashList; } if(!found) { return false; } // Calculate the strong MD5 digest for this block MD5Digest strong; // Add the data from the beginnings strong.Add(pBeginnings + Offset, BlockSize - Offset); // Add any data from the endings if(Offset > 0) { strong.Add(pEndings, Offset); } strong.Finish(); // Then go through the entries in the hash list, comparing with the strong digest calculated scan = pFirstInHashList; //TRACE0("second stage match\n"); while(scan != 0) { //TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash); ASSERT(scan->mSize == BlockSize); ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == Hash); // Compare? if(strong.DigestMatches(scan->mStrongChecksum)) { //TRACE0("Match!\n"); // Found! Add to list of found blocks... int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset; int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory // We do NOT search for smallest blocks first, as this code originally assumed. // To prevent this from potentially overwriting a better match, the caller must determine // the relative "goodness" of any existing match and this one, and avoid the call if it // could be detrimental. rFoundBlocks[fileOffset] = blockIndex; // No point in searching further, report success return true; } // Next scan = scan->mpNextInHashList; } // Not matched return false; } // -------------------------------------------------------------------------- // // Function // Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map &) // Purpose: Fills in the recipe from the found block list // Created: 15/1/04 // // -------------------------------------------------------------------------- static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map &rFoundBlocks, int64_t SizeOfInputFile) { // NOTE: This function could be a lot more sophisiticated. For example, if // a small block overlaps a big block like this // **** // ******************************* // then the small block will be used, not the big one. But it'd be better to // just ignore the small block and keep the big one. However, some stats should // be gathered about real world files before writing complex code which might // go wrong. // Initialise a blank instruction BackupStoreFileEncodeStream::RecipeInstruction instruction; #define RESET_INSTRUCTION \ instruction.mSpaceBefore = 0; \ instruction.mBlocks = 0; \ instruction.mpStartBlock = 0; RESET_INSTRUCTION // First, a special case for when there are no found blocks if(rFoundBlocks.size() == 0) { // No blocks, just a load of space instruction.mSpaceBefore = SizeOfInputFile; rRecipe.push_back(instruction); #ifndef NDEBUG if(BackupStoreFile::TraceDetailsOfDiffProcess) { TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile); } #endif // Don't do anything return; } // Current location int64_t loc = 0; // Then iterate through the list, generating the recipe std::map::const_iterator i(rFoundBlocks.begin()); ASSERT(i != rFoundBlocks.end()); // check logic // Counting for debug tracing #ifndef NDEBUG int64_t debug_NewBytesFound = 0; int64_t debug_OldBlocksUsed = 0; #endif for(; i != rFoundBlocks.end(); ++i) { // Remember... map is (position in file) -> (index of block in pIndex) if(i->first < loc) { // This block overlaps the last one continue; } else if(i->first > loc) { // There's a gap between the end of the last thing and this block. // If there's an instruction waiting, push it onto the list if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0) { rRecipe.push_back(instruction); } // Start a new instruction, with the gap ready RESET_INSTRUCTION instruction.mSpaceBefore = i->first - loc; // Move location forward to match loc += instruction.mSpaceBefore; #ifndef NDEBUG debug_NewBytesFound += instruction.mSpaceBefore; #endif } // First, does the current instruction need pushing back, because this block is not // sequential to the last one? if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks)) { rRecipe.push_back(instruction); RESET_INSTRUCTION } // Add in this block if(instruction.mpStartBlock == 0) { // This block starts a new instruction instruction.mpStartBlock = pIndex + i->second; instruction.mBlocks = 1; } else { // It continues the previous section of blocks instruction.mBlocks += 1; } #ifndef NDEBUG debug_OldBlocksUsed++; #endif // Move location forward loc += pIndex[i->second].mSize; } // Push the last instruction generated rRecipe.push_back(instruction); // Is there any space left at the end which needs sending? if(loc != SizeOfInputFile) { RESET_INSTRUCTION instruction.mSpaceBefore = SizeOfInputFile - loc; #ifndef NDEBUG debug_NewBytesFound += instruction.mSpaceBefore; #endif rRecipe.push_back(instruction); } // dump out the recipe #ifndef NDEBUG TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed); if(BackupStoreFile::TraceDetailsOfDiffProcess) { TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size()); { for(unsigned int e = 0; e < rRecipe.size(); ++e) { char b[64]; #ifdef WIN32 sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); #else sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex)); #endif TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks); } } TRACE0("======== ========= ========\n"); } #endif } // -------------------------------------------------------------------------- // // Function // Name: static TimerSignalHandler(int) // Purpose: Signal handler // Created: 19/3/04 // // -------------------------------------------------------------------------- void TimerSignalHandler(int signal) { sDiffTimedOut = true; } // -------------------------------------------------------------------------- // // Function // Name: static StartDiffTimer() // Purpose: Starts the diff timeout timer // Created: 19/3/04 // // -------------------------------------------------------------------------- void StartDiffTimer() { // Set timer signal handler if(!sSetTimerSignelHandler) { ::signal(SIGVTALRM, TimerSignalHandler); sSetTimerSignelHandler = true; } struct itimerval timeout; // Don't want this to repeat timeout.it_interval.tv_sec = 0; timeout.it_interval.tv_usec = 0; // Single timeout after the specified number of seconds timeout.it_value.tv_sec = sMaximumDiffTime; timeout.it_value.tv_usec = 0; // Set timer if(::setitimer(ITIMER_VIRTUAL, &timeout, NULL) != 0) { TRACE0("WARNING: couldn't set diff timeout\n"); } // Unset flag (last thing) sDiffTimedOut = false; }