/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #ifndef EBWT_H_ #define EBWT_H_ #include #include #include #include #include #include #include #include #include #include #include #ifdef BOWTIE_MM #include #include #endif #include "shmem.h" #include "alphabet.h" #include "assert_helpers.h" #include "bitpack.h" #include "blockwise_sa.h" #include "endian_swap.h" #include "word_io.h" #include "random_source.h" #include "ref_read.h" #include "threading.h" #include "str_util.h" #include "mm.h" #include "timer.h" #include "reference.h" #include "search_globals.h" #include "ds.h" #include "random_source.h" #include "mem_ids.h" #include "btypes.h" #ifdef POPCNT_CAPABILITY #include "processor_support.h" #endif using namespace std; // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl extern uint8_t cCntLUT_4[4][4][256]; static const uint64_t c_table[4] = { 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa, 0x5555555555555555, 0x0000000000000000 }; #ifndef VMSG_NL #define VMSG_NL(...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__ << endl; \ this->verbose(tmp.str()); \ } #endif #ifndef VMSG #define VMSG(...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__; \ this->verbose(tmp.str()); \ } #endif /** * Flags describing type of Ebwt. */ enum EBWT_FLAGS { EBWT_COLOR = 2, // true -> Ebwt is colorspace EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole // concatenated string reversed, rather than // each stretch reversed }; /** * Extended Burrows-Wheeler transform header. This together with the * actual data arrays and other text-specific parameters defined in * class Ebwt constitute the entire Ebwt. */ class EbwtParams { public: EbwtParams() { } EbwtParams( TIndexOffU len, int32_t lineRate, int32_t offRate, int32_t ftabChars, bool color, bool entireReverse) { init(len, lineRate, offRate, ftabChars, color, entireReverse); } EbwtParams(const EbwtParams& eh) { init(eh._len, eh._lineRate, eh._offRate, eh._ftabChars, eh._color, eh._entireReverse); } void init( TIndexOffU len, int32_t lineRate, int32_t offRate, int32_t ftabChars, bool color, bool entireReverse) { _color = color; _entireReverse = entireReverse; _len = len; _bwtLen = _len + 1; _sz = (len+3)/4; _bwtSz = (len/4 + 1); _lineRate = lineRate; _origOffRate = offRate; _offRate = offRate; _offMask = OFF_MASK << _offRate; _ftabChars = ftabChars; _eftabLen = _ftabChars*2; _eftabSz = _eftabLen*OFF_SIZE; _ftabLen = (1 << (_ftabChars*2))+1; _ftabSz = _ftabLen*OFF_SIZE; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; _offsSz = (uint64_t)_offsLen*OFF_SIZE; _lineSz = 1 << _lineRate; _sideSz = _lineSz * 1 /* lines per side */; _sideBwtSz = _sideSz - OFF_SIZE*4; _sideBwtLen = _sideBwtSz*4; _numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz); _numLines = _numSides * 1 /* lines per side */; _ebwtTotLen = _numSides * _sideSz; _ebwtTotSz = _ebwtTotLen; assert(repOk()); } TIndexOffU len() const { return _len; } TIndexOffU lenNucs() const { return _len + (_color ? 1 : 0); } TIndexOffU bwtLen() const { return _bwtLen; } TIndexOffU sz() const { return _sz; } TIndexOffU bwtSz() const { return _bwtSz; } int32_t lineRate() const { return _lineRate; } int32_t origOffRate() const { return _origOffRate; } int32_t offRate() const { return _offRate; } TIndexOffU offMask() const { return _offMask; } int32_t ftabChars() const { return _ftabChars; } int32_t eftabLen() const { return _eftabLen; } int32_t eftabSz() const { return _eftabSz; } TIndexOffU ftabLen() const { return _ftabLen; } TIndexOffU ftabSz() const { return _ftabSz; } TIndexOffU offsLen() const { return _offsLen; } uint64_t offsSz() const { return _offsSz; } int32_t lineSz() const { return _lineSz; } int32_t sideSz() const { return _sideSz; } int32_t sideBwtSz() const { return _sideBwtSz; } int32_t sideBwtLen() const { return _sideBwtLen; } TIndexOffU numSides() const { return _numSides; } TIndexOffU numLines() const { return _numLines; } TIndexOffU ebwtTotLen() const { return _ebwtTotLen; } TIndexOffU ebwtTotSz() const { return _ebwtTotSz; } bool color() const { return _color; } bool entireReverse() const { return _entireReverse; } /** * Set a new suffix-array sampling rate, which involves updating * rate, mask, sample length, and sample size. */ void setOffRate(int __offRate) { _offRate = __offRate; _offMask = OFF_MASK << _offRate; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; _offsSz = (uint64_t)_offsLen * OFF_SIZE; } #ifndef NDEBUG /// Check that this EbwtParams is internally consistent bool repOk() const { assert_gt(_len, 0); assert_gt(_lineRate, 3); assert_geq(_offRate, 0); assert_leq(_ftabChars, 16); assert_geq(_ftabChars, 1); assert_lt(_lineRate, 32); assert_lt(_ftabChars, 32); assert_eq(0, _ebwtTotSz % _lineSz); return true; } #endif /** * Pretty-print the header contents to the given output stream. */ void print(ostream& out) const { out << "Headers:" << endl << " len: " << _len << endl << " bwtLen: " << _bwtLen << endl << " sz: " << _sz << endl << " bwtSz: " << _bwtSz << endl << " lineRate: " << _lineRate << endl << " offRate: " << _offRate << endl << " offMask: 0x" << hex << _offMask << dec << endl << " ftabChars: " << _ftabChars << endl << " eftabLen: " << _eftabLen << endl << " eftabSz: " << _eftabSz << endl << " ftabLen: " << _ftabLen << endl << " ftabSz: " << _ftabSz << endl << " offsLen: " << _offsLen << endl << " offsSz: " << _offsSz << endl << " lineSz: " << _lineSz << endl << " sideSz: " << _sideSz << endl << " sideBwtSz: " << _sideBwtSz << endl << " sideBwtLen: " << _sideBwtLen << endl << " numSides: " << _numSides << endl << " numLines: " << _numLines << endl << " ebwtTotLen: " << _ebwtTotLen << endl << " ebwtTotSz: " << _ebwtTotSz << endl << " color: " << _color << endl << " reverse: " << _entireReverse << endl; } TIndexOffU _len; TIndexOffU _bwtLen; TIndexOffU _sz; TIndexOffU _bwtSz; int32_t _lineRate; int32_t _origOffRate; int32_t _offRate; TIndexOffU _offMask; int32_t _ftabChars; uint32_t _eftabLen; uint32_t _eftabSz; TIndexOffU _ftabLen; TIndexOffU _ftabSz; TIndexOffU _offsLen; uint64_t _offsSz; uint32_t _lineSz; uint32_t _sideSz; uint32_t _sideBwtSz; uint32_t _sideBwtLen; TIndexOffU _numSides; TIndexOffU _numLines; TIndexOffU _ebwtTotLen; TIndexOffU _ebwtTotSz; bool _color; bool _entireReverse; }; /** * Exception to throw when a file-realted error occurs. */ class EbwtFileOpenException : public std::runtime_error { public: EbwtFileOpenException(const std::string& msg = "") : std::runtime_error(msg) { } }; /** * Calculate size of file with given name. */ static inline int64_t fileSize(const char* name) { std::ifstream f; f.open(name, std::ios_base::binary | std::ios_base::in); if (!f.good() || f.eof() || !f.is_open()) { return 0; } f.seekg(0, std::ios_base::beg); std::ifstream::pos_type begin_pos = f.tellg(); f.seekg(0, std::ios_base::end); return static_cast(f.tellg() - begin_pos); } /** * Encapsulates a location in the bwt text in terms of the side it * occurs in and its offset within the side. */ struct SideLocus { SideLocus() : _sideByteOff(0), _sideNum(0), _charOff(0), _by(-1), _bp(-1) { } /** * Construct from row and other relevant information about the Ebwt. */ SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { initFromRow(row, ep, ebwt); } /** * Init two SideLocus objects from a top/bot pair, using the result * from one call to initFromRow to possibly avoid a second call. */ static void initFromTopBot( TIndexOffU top, TIndexOffU bot, const EbwtParams& ep, const uint8_t* ebwt, SideLocus& ltop, SideLocus& lbot) { const TIndexOffU sideBwtLen = ep._sideBwtLen; assert_gt(bot, top); ltop.initFromRow(top, ep, ebwt); TIndexOffU spread = bot - top; // Many cache misses on the following lines if(ltop._charOff + spread < sideBwtLen) { lbot._charOff = (uint32_t)(ltop._charOff + spread); lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._by = lbot._charOff >> 2; assert_lt(lbot._by, (int)ep._sideBwtSz); lbot._bp = lbot._charOff & 3; } else { lbot.initFromRow(bot, ep, ebwt); } } /** * Calculate SideLocus based on a row and other relevant * information about the shape of the Ebwt. */ void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { const int32_t sideSz = ep._sideSz; // Side length is hard-coded for now; this allows the compiler // to do clever things to accelerate / and %. _sideNum = row / (48*OFF_SIZE); assert_lt(_sideNum, ep._numSides); _charOff = row % (48*OFF_SIZE); _sideByteOff = _sideNum * sideSz; assert_leq(row, ep._len); assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz); // Tons of cache misses on the next line _by = _charOff >> 2; // byte within side assert_lt(_by, (int)ep._sideBwtSz); _bp = _charOff & 3; // bit-pair within byte } /** * Transform this SideLocus to refer to the next side (i.e. the one * corresponding to the next side downstream). Set all cursors to * point to the beginning of the side. */ void nextSide(const EbwtParams& ep) { assert(valid()); _sideByteOff += ep.sideSz(); _sideNum++; _by = _bp = _charOff = 0; assert(valid()); } /** * Return true iff this is an initialized SideLocus */ bool valid() const { if(_bp != -1) { return true; } return false; } /** * Convert locus to BW row it corresponds to. */ TIndexOffU toBWRow() const { return _sideNum * 48*OFF_SIZE + _charOff; } #ifndef NDEBUG /** * Check that SideLocus is internally consistent and consistent * with the (provided) EbwtParams. */ bool repOk(const EbwtParams& ep) const { ASSERT_ONLY(TIndexOffU row = _sideNum * 48*OFF_SIZE + _charOff); assert_leq(row, ep._len); assert_range(-1, 3, _bp); assert_range(0, (int)ep._sideBwtSz, _by); return true; } #endif /// Make this look like an invalid SideLocus void invalidate() { _bp = -1; } /** * Return a read-only pointer to the beginning of the top side. */ const uint8_t *side(const uint8_t* ebwt) const { return ebwt + _sideByteOff; } TIndexOffU _sideByteOff; // offset of top side within ebwt[] TIndexOffU _sideNum; // index of side uint32_t _charOff; // character offset within side int32_t _by; // byte within side (not adjusted for bw sides) int32_t _bp; // bitpair within byte (not adjusted for bw sides) }; #ifdef POPCNT_CAPABILITY // wrapping of "struct" struct USE_POPCNT_GENERIC { #endif // Use this standard bit-bashing population count inline static int pop64(uint64_t x) { // Lots of cache misses on following lines (>10K) x = x - ((x >> 1) & 0x5555555555555555llu); x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu); x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu; x = x + (x >> 8); x = x + (x >> 16); x = x + (x >> 32); return (int)(x & 0x3Fllu); } #ifdef POPCNT_CAPABILITY // wrapping a "struct" }; #endif #ifdef POPCNT_CAPABILITY struct USE_POPCNT_INSTRUCTION { inline static int pop64(uint64_t x) { int64_t count; asm ("popcnt %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x)); return count; } }; #endif /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. */ #ifdef POPCNT_CAPABILITY template #endif inline static int countInU64(int c, uint64_t dw) { uint64_t c0 = c_table[c]; uint64_t x0 = dw ^ c0; uint64_t x1 = (x0 >> 1); uint64_t x2 = x1 & (0x5555555555555555); uint64_t x3 = x0 & x2; #ifdef POPCNT_CAPABILITY uint64_t tmp = Operation().pop64(x3); #else uint64_t tmp = pop64(x3); #endif return (int) tmp; } // Forward declarations for Ebwt class class EbwtSearchParams; /** * Extended Burrows-Wheeler transform data. * * An Ebwt may be transferred to and from RAM with calls to * evictFromMemory() and loadIntoMemory(). By default, a newly-created * Ebwt is not loaded into memory; if the user would like to use a * newly-created Ebwt to answer queries, they must first call * loadIntoMemory(). */ class Ebwt { public: #define Ebwt_INITS \ _toBigEndian(currentlyBigEndian()), \ _overrideOffRate(overrideOffRate), \ _verbose(verbose), \ _passMemExc(passMemExc), \ _sanity(sanityCheck), \ fw_(fw), \ _in1(NULL), \ _in2(NULL), \ _zOff(OFF_MASK), \ _zEbwtByteOff(OFF_MASK), \ _zEbwtBpOff(-1), \ _nPat(0), \ _nFrag(0), \ _plen(EBWT_CAT), \ _rstarts(EBWT_CAT), \ _fchr(EBWT_CAT), \ _ftab(EBWT_CAT), \ _eftab(EBWT_CAT), \ _offs(EBWT_CAT), \ _ebwt(EBWT_CAT), \ _useMm(false), \ useShmem_(false), \ _refnames(EBWT_CAT), \ mmFile1_(NULL), \ mmFile2_(NULL) /// Construct an Ebwt from the given input file Ebwt(const string& in, int color, int needEntireReverse, bool fw, int32_t overrideOffRate, // = -1, int32_t offRatePlus, // = -1, bool useMm, // = false, bool useShmem, // = false, bool mmSweep, // = false, bool loadNames, // = false, bool loadSASamp, // = true, bool loadFtab, // = true, bool loadRstarts, // = true, bool verbose, // = false, bool startVerbose, // = false, bool passMemExc, // = false, bool sanityCheck) : // = false) : Ebwt_INITS { assert(!useMm || !useShmem); #ifdef POPCNT_CAPABILITY ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif packed_ = false; _useMm = useMm; useShmem_ = useShmem; _in1Str = in + ".1." + gEbwt_ext; _in2Str = in + ".2." + gEbwt_ext; readIntoMemory( color, // expect index to be colorspace? fw ? -1 : needEntireReverse, // need REF_READ_REVERSE loadSASamp, // load the SA sample portion? loadFtab, // load the ftab & eftab? loadRstarts, // load the rstarts array? true, // stop after loading the header portion? &_eh, // params mmSweep, // mmSweep loadNames, // loadNames startVerbose); // startVerbose // If the offRate has been overridden, reflect that in the // _eh._offRate field if(offRatePlus > 0 && _overrideOffRate == -1) { _overrideOffRate = _eh._offRate + offRatePlus; } if(_overrideOffRate > _eh._offRate) { _eh.setOffRate(_overrideOffRate); assert_eq(_overrideOffRate, _eh._offRate); } assert(repOk()); } /// Construct an Ebwt from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). template Ebwt( TStr exampleStr, bool packed, int color, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, int nthreads, const string& file, // base filename for EBWT files bool fw, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, EList& is, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed, int32_t overrideOffRate = -1, bool doSaFile = false, bool doBwtFile = false, bool verbose = false, bool passMemExc = false, bool sanityCheck = false) : Ebwt_INITS, _eh( joinedLen(szs), lineRate, offRate, ftabChars, color, refparams.reverse == REF_READ_REVERSE) { #ifdef POPCNT_CAPABILITY ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif _in1Str = file + ".1." + gEbwt_ext; _in2Str = file + ".2." + gEbwt_ext; packed_ = packed; // Open output files ofstream fout1(_in1Str.c_str(), ios::binary); if(!fout1.good()) { cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } ofstream fout2(_in2Str.c_str(), ios::binary); if(!fout2.good()) { cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } _inSaStr = file + ".sa"; _inBwtStr = file + ".bwt"; ofstream *saOut = NULL, *bwtOut = NULL; if(doSaFile) { saOut = new ofstream(_inSaStr.c_str(), ios::binary); if(!saOut->good()) { cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } } if(doBwtFile) { bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary); if(!bwtOut->good()) { cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } } // Build SA(T) and BWT(T) block by block initFromVector( is, szs, sztot, refparams, fout1, fout2, file, saOut, bwtOut, nthreads, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, seed, verbose); // Close output files fout1.flush(); int64_t tellpSz1 = (int64_t)fout1.tellp(); VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str()); fout1.close(); bool err = false; if(tellpSz1 > fileSize(_in1Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1 << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl; } fout2.flush(); int64_t tellpSz2 = (int64_t)fout2.tellp(); VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str()); fout2.close(); if(tellpSz2 > fileSize(_in2Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2 << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl; } if(saOut != NULL) { // Check on suffix array output file size int64_t tellpSzSa = (int64_t)saOut->tellp(); VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str()); saOut->close(); if(tellpSzSa > fileSize(_inSaStr.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl; } } if(bwtOut != NULL) { // Check on suffix array output file size int64_t tellpSzBwt = (int64_t)bwtOut->tellp(); VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str()); bwtOut->close(); if(tellpSzBwt > fileSize(_inBwtStr.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl; } } if(err) { cerr << "Please check if there is a problem with the disk or if disk is full." << endl; throw 1; } // Reopen as input streams VMSG_NL("Re-opening _in1 and _in2 as input streams"); if(_sanity) { VMSG_NL("Sanity-checking Bt2"); assert(!isInMemory()); readIntoMemory( color, // colorspace? fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat true, // load SA sample (_offs[])? true, // load ftab (_ftab[] & _eftab[])? true, // load r-starts (_rstarts[])? false, // just load header? NULL, // Params object to fill false, // mm sweep? true, // load names? false); // verbose startup? sanityCheckAll(refparams.reverse); evictFromMemory(); assert(!isInMemory()); } VMSG_NL("Returning from Ebwt constructor"); } /** * Static constructor for a pair of forward/reverse indexes for the * given reference string. */ template static pair fromString( const char* str, bool packed, int color, int reverse, bool bigEndian, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, uint32_t seed, bool verbose, bool autoMem, bool sanity) { EList strs(EBWT_CAT); strs.push_back(std::string(str)); return fromStrings( strs, packed, color, reverse, bigEndian, lineRate, offRate, ftabChars, file, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, seed, verbose, autoMem, sanity); } /** * Static constructor for a pair of forward/reverse indexes for the * given list of reference strings. */ template static pair fromStrings( const EList& strs, bool packed, int color, int reverse, bool bigEndian, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, uint32_t seed, bool verbose, bool autoMem, bool sanity) { assert(!strs.empty()); EList is(EBWT_CAT); RefReadInParams refparams(color, REF_READ_FORWARD, false, false); // Adapt sequence strings to stringstreams open for input auto_ptr ss(new stringstream()); for(TIndexOffU i = 0; i < strs.size(); i++) { (*ss) << ">" << i << endl << strs[i] << endl; } auto_ptr fb(new FileBuf(ss.get())); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb.get()); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList szs(EBWT_CAT); std::pair sztot; sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity); // Construct Ebwt from input strings and parameters Ebwt *ebwtFw = new Ebwt( TStr(), packed, refparams.color ? 1 : 0, -1, // fw lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc file, // basename for .?.ebwt files true, // fw? useBlockwise, // useBlockwise bmax, // block size for blockwise SA builder bmaxSqrtMult, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len dcv, // difference-cover period is, // list of input streams szs, // list of reference sizes sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanity); // verify results and internal consistency refparams.reverse = reverse; szs.clear(); sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity); // Construct Ebwt from input strings and parameters Ebwt *ebwtBw = new Ebwt( TStr(), packed, refparams.color ? 1 : 0, reverse == REF_READ_REVERSE, lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc file + ".rev",// basename for .?.ebwt files false, // fw? useBlockwise, // useBlockwise bmax, // block size for blockwise SA builder bmaxSqrtMult, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len dcv, // difference-cover period is, // list of input streams szs, // list of reference sizes sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanity); // verify results and internal consistency return make_pair(ebwtFw, ebwtBw); } /// Return true iff the Ebwt is packed bool isPacked() { return packed_; } /** * Write the rstarts array given the szs array for the reference. */ void szsToDisk(const EList& szs, ostream& os, int reverse); /** * Helper for the constructors above. Takes a vector of text * strings and joins them into a single string with a call to * joinToDisk, which does a join (with padding) and writes some of * the resulting data directly to disk rather than keep it in * memory. It then constructs a suffix-array producer (what kind * depends on 'useBlockwise') for the resulting sequence. The * suffix-array producer can then be used to obtain chunks of the * joined string's suffix array. */ template void initFromVector(EList& is, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, ofstream& out1, ofstream& out2, const string& outfile, ofstream* saOut, ofstream* bwtOut, int nthreads, bool useBlockwise, TIndexOffU bmax, TIndexOffU bmaxSqrtMult, TIndexOffU bmaxDivN, int dcv, uint32_t seed, bool verbose) { // Compose text strings into single string VMSG_NL("Calculating joined length"); TStr s; // holds the entire joined reference after call to joinToDisk TIndexOffU jlen; jlen = joinedLen(szs); assert_geq(jlen, sztot); VMSG_NL("Writing header"); writeFromMemory(true, out1, out2); try { VMSG_NL("Reserving space for joined string"); s.resize(jlen); VMSG_NL("Joining reference sequences"); if(refparams.reverse == REF_READ_REVERSE) { { Timer timer(cout, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, sztot, refparams, s, out1, out2); } { Timer timer(cout, " Time to reverse reference sequence: ", _verbose); EList tmp(EBWT_CAT); s.reverse(); reverseRefRecords(szs, tmp, false, verbose); szsToDisk(tmp, out1, refparams.reverse); } } else { Timer timer(cout, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, sztot, refparams, s, out1, out2); szsToDisk(szs, out1, refparams.reverse); } // Joined reference sequence now in 's' } catch(bad_alloc& e) { // If we throw an allocation exception in the try block, // that means that the joined version of the reference // string itself is too larger to fit in memory. The only // alternatives are to tell the user to give us more memory // or to try again with a packed representation of the // reference (if we haven't tried that already). cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl; if(!isPacked() && _passMemExc) { // Pass the exception up so that we can retry using a // packed string representation throw e; } // There's no point passing this exception on. The fact // that we couldn't allocate the joined string means that // --bmax is irrelevant - the user should re-run with // ebwt-build-packed if(isPacked()) { cerr << "Please try running bowtie-build on a computer with more memory." << endl; } else { cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl << "mode (-a/--auto), or try again on a computer with more memory." << endl; } if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." << endl; } throw 1; } // Succesfully obtained joined reference string assert_geq(s.length(), jlen); if(bmax != OFF_MASK) { VMSG_NL("bmax according to bmax setting: " << bmax); } else if(bmaxSqrtMult != OFF_MASK) { bmax *= bmaxSqrtMult; VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax); } else if(bmaxDivN != OFF_MASK) { bmax = max(jlen / bmaxDivN, 1); VMSG_NL("bmax according to bmaxDivN setting: " << bmax); } else { bmax = (TIndexOffU)sqrt(s.length()); VMSG_NL("bmax defaulted to: " << bmax); } int iter = 0; bool first = true; streampos out1pos = out1.tellp(); streampos out2pos = out2.tellp(); // Look for bmax/dcv parameters that work. while(true) { if(!first && bmax < 40 && _passMemExc) { cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl; if(!isPacked()) { // Throw an exception exception so that we can // retry using a packed string representation throw bad_alloc(); } else { cerr << "Already tried a packed string representation." << endl; } cerr << "Please try indexing this reference on a computer with more memory." << endl; if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." << endl; } throw 1; } if(!first) { out1.seekp(out1pos); out2.seekp(out2pos); } if(dcv > 4096) dcv = 4096; if((iter % 6) == 5 && dcv < 4096 && dcv != 0) { dcv <<= 1; // double difference-cover period } else { bmax -= (bmax >> 2); // reduce by 25% } VMSG("Using parameters --bmax " << bmax); if(dcv == 0) { VMSG_NL(" and *no difference cover*"); } else { VMSG_NL(" --dcv " << dcv); } iter++; try { { VMSG_NL(" Doing ahead-of-time memory usage test"); // Make a quick-and-dirty attempt to force a bad_alloc iff // we would have thrown one eventually as part of // constructing the DifferenceCoverSample dcv <<= 1; TIndexOffU sz = (TIndexOffU)DifferenceCoverSample::simulateAllocs(s, dcv >> 1); if(nthreads > 1) sz *= (nthreads + 1); AutoArray tmp(sz, EBWT_CAT); dcv >>= 1; // Likewise with the KarkkainenBlockwiseSA sz = (TIndexOffU)KarkkainenBlockwiseSA::simulateAllocs(s, bmax); AutoArray tmp2(sz, EBWT_CAT); // Now throw in the 'ftab' and 'isaSample' structures // that we'll eventually allocate in buildToDisk AutoArray ftab(_eh._ftabLen * 2, EBWT_CAT); AutoArray side(_eh._sideSz, EBWT_CAT); // Grab another 20 MB out of caution AutoArray extra(20*1024*1024, EBWT_CAT); // If we made it here without throwing bad_alloc, then we // passed the memory-usage stress test VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv); if(isPacked()) { VMSG(" --packed"); } VMSG_NL(""); } VMSG_NL("Constructing suffix-array element generator"); KarkkainenBlockwiseSA bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile); assert(bsa.suffixItrIsReset()); assert_eq(bsa.size(), s.length()+1); VMSG_NL("Converting suffix-array elements to index image"); buildToDisk(bsa, s, out1, out2, saOut, bwtOut); out1.flush(); out2.flush(); bool failed = out1.fail() || out2.fail(); if(saOut != NULL) { saOut->flush(); failed = failed || saOut->fail(); } if(bwtOut != NULL) { bwtOut->flush(); failed = failed || bwtOut->fail(); } if(failed) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } break; } catch(bad_alloc& e) { if(_passMemExc) { VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters."); } else { cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl; throw 1; } } first = false; } assert(repOk()); // Now write reference sequence names on the end assert_eq(this->_refnames.size(), this->_nPat); for(TIndexOffU i = 0; i < this->_refnames.size(); i++) { out1 << this->_refnames[i].c_str() << endl; } out1 << '\0'; out1.flush(); out2.flush(); if(out1.fail() || out2.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } VMSG_NL("Returning from initFromVector"); } /** * Return the length that the joined string of the given string * list will have. Note that this is indifferent to how the text * fragments correspond to input sequences - it just cares about * the lengths of the fragments. */ TIndexOffU joinedLen(EList& szs) { TIndexOffU ret = 0; for(unsigned int i = 0; i < szs.size(); i++) { ret += szs[i].len; } return ret; } /// Destruct an Ebwt ~Ebwt() { _fchr.reset(); _ftab.reset(); _eftab.reset(); _plen.reset(); _rstarts.reset(); _offs.reset(); _ebwt.reset(); if(offs() != NULL && useShmem_) { FREE_SHARED(offs()); } if(ebwt() != NULL && useShmem_) { FREE_SHARED(ebwt()); } if (_in1 != NULL) fclose(_in1); if (_in2 != NULL) fclose(_in2); } /// Accessors inline const EbwtParams& eh() const { return _eh; } TIndexOffU zOff() const { return _zOff; } TIndexOffU zEbwtByteOff() const { return _zEbwtByteOff; } TIndexOff zEbwtBpOff() const { return _zEbwtBpOff; } TIndexOffU nPat() const { return _nPat; } TIndexOffU nFrag() const { return _nFrag; } inline TIndexOffU* fchr() { return _fchr.get(); } inline TIndexOffU* ftab() { return _ftab.get(); } inline TIndexOffU* eftab() { return _eftab.get(); } inline TIndexOffU* offs() { return _offs.get(); } inline TIndexOffU* plen() { return _plen.get(); } inline TIndexOffU* rstarts() { return _rstarts.get(); } inline uint8_t* ebwt() { return _ebwt.get(); } inline const TIndexOffU* fchr() const { return _fchr.get(); } inline const TIndexOffU* ftab() const { return _ftab.get(); } inline const TIndexOffU* eftab() const { return _eftab.get(); } inline const TIndexOffU* offs() const { return _offs.get(); } inline const TIndexOffU* plen() const { return _plen.get(); } inline const TIndexOffU* rstarts() const { return _rstarts.get(); } inline const uint8_t* ebwt() const { return _ebwt.get(); } bool toBe() const { return _toBigEndian; } bool verbose() const { return _verbose; } bool sanityCheck() const { return _sanity; } EList& refnames() { return _refnames; } bool fw() const { return fw_; } #ifdef POPCNT_CAPABILITY bool _usePOPCNTinstruction; #endif /** * Returns true iff the index contains the given string (exactly). The * given string must contain only unambiguous characters. TODO: * support skipping of ambiguous characters. */ bool contains( const BTDnaString& str, TIndexOffU *top = NULL, TIndexOffU *bot = NULL) const; /** * Returns true iff the index contains the given string (exactly). The * given string must contain only unambiguous characters. TODO: * support skipping of ambiguous characters. */ bool contains( const char *str, TIndexOffU *top = NULL, TIndexOffU *bot = NULL) const { return contains(BTDnaString(str, true), top, bot); } /// Return true iff the Ebwt is currently in memory bool isInMemory() const { if(ebwt() != NULL) { // Note: We might have skipped loading _offs, _ftab, // _eftab, and _rstarts depending on whether this is the // reverse index and what algorithm is being used. assert(_eh.repOk()); //assert(_ftab != NULL); //assert(_eftab != NULL); assert(fchr() != NULL); //assert(_offs != NULL); //assert(_rstarts != NULL); assert_neq(_zEbwtByteOff, OFF_MASK); assert_neq(_zEbwtBpOff, -1); return true; } else { assert(ftab() == NULL); assert(eftab() == NULL); assert(fchr() == NULL); assert(offs() == NULL); assert(rstarts() == NULL); assert_eq(_zEbwtByteOff, OFF_MASK); assert_eq(_zEbwtBpOff, -1); return false; } } /// Return true iff the Ebwt is currently stored on disk bool isEvicted() const { return !isInMemory(); } /** * Load this Ebwt into memory by reading it in from the _in1 and * _in2 streams. */ void loadIntoMemory( int color, int needEntireReverse, bool loadSASamp, bool loadFtab, bool loadRstarts, bool loadNames, bool verbose) { readIntoMemory( color, // expect index to be colorspace? needEntireReverse, // require reverse index to be concatenated reference reversed loadSASamp, // load the SA sample portion? loadFtab, // load the ftab (_ftab[] and _eftab[])? loadRstarts, // load the r-starts (_rstarts[])? false, // stop after loading the header portion? NULL, // params false, // mmSweep loadNames, // loadNames verbose); // startVerbose } /** * Frees memory associated with the Ebwt. */ void evictFromMemory() { assert(isInMemory()); _fchr.free(); _ftab.free(); _eftab.free(); _rstarts.free(); _offs.free(); // might not be under control of APtrWrap _ebwt.free(); // might not be under control of APtrWrap // Keep plen; it's small and the client may want to seq it // even when the others are evicted. //_plen = NULL; _zEbwtByteOff = OFF_MASK; _zEbwtBpOff = -1; } /** * Turn a substring of 'seq' starting at offset 'off' and having * length equal to the index's 'ftabChars' into an int that can be * used to index into the ftab array. */ TIndexOffU ftabSeqToInt( const BTDnaString& seq, size_t off, bool rev) const { int fc = _eh._ftabChars; size_t lo = off, hi = lo + fc; assert_leq(hi, seq.length()); TIndexOffU ftabOff = 0; for(int i = 0; i < fc; i++) { bool fwex = fw(); if(rev) fwex = !fwex; // We add characters to the ftabOff in the order they would // have been consumed in a normal search. For BWT, this // means right-to-left order; for BWT' it's left-to-right. int c = (fwex ? seq[lo + i] : seq[hi - i - 1]); if(c > 3) { return std::numeric_limits::max(); } assert_range(0, 3, c); ftabOff <<= 2; ftabOff |= c; } return ftabOff; } /** * Non-static facade for static function ftabHi. */ TIndexOffU ftabHi(TIndexOffU i) const { return Ebwt::ftabHi( ftab(), eftab(), _eh._len, _eh._ftabLen, _eh._eftabLen, i); } /** * Get "high interpretation" of ftab entry at index i. The high * interpretation of a regular ftab entry is just the entry * itself. The high interpretation of an extended entry is the * second correpsonding ui32 in the eftab. * * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ static TIndexOffU ftabHi( const TIndexOffU *ftab, const TIndexOffU *eftab, TIndexOffU len, TIndexOffU ftabLen, TIndexOffU eftabLen, TIndexOffU i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { TIndexOffU efIdx = ftab[i] ^ OFF_MASK; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2+1]; } } /** * Non-static facade for static function ftabLo. */ TIndexOffU ftabLo(TIndexOffU i) const { return Ebwt::ftabLo( ftab(), eftab(), _eh._len, _eh._ftabLen, _eh._eftabLen, i); } /** * Get low bound of ftab range. */ TIndexOffU ftabLo(const BTDnaString& seq, size_t off) const { return ftabLo(ftabSeqToInt(seq, off, false)); } /** * Get high bound of ftab range. */ TIndexOffU ftabHi(const BTDnaString& seq, size_t off) const { return ftabHi(ftabSeqToInt(seq, off, false)); } /** * Extract characters from seq starting at offset 'off' and going either * forward or backward, depending on 'rev'. Order matters when compiling * the integer that gets looked up in the ftab. Each successive character * is ORed into the least significant bit-pair, and characters are * integrated in the direction of the search. */ bool ftabLoHi( const BTDnaString& seq, // sequence to extract from size_t off, // offset into seq to begin extracting bool rev, // reverse while extracting TIndexOffU& top, TIndexOffU& bot) const { TIndexOffU fi = ftabSeqToInt(seq, off, rev); if(fi == std::numeric_limits::max()) { return false; } top = ftabHi(fi); bot = ftabLo(fi+1); assert_geq(bot, top); return true; } /** * Get "low interpretation" of ftab entry at index i. The low * interpretation of a regular ftab entry is just the entry * itself. The low interpretation of an extended entry is the * first correpsonding ui32 in the eftab. * * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ static TIndexOffU ftabLo( const TIndexOffU *ftab, const TIndexOffU *eftab, TIndexOffU len, TIndexOffU ftabLen, TIndexOffU eftabLen, TIndexOffU i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { TIndexOffU efIdx = ftab[i] ^ OFF_MASK; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2]; } } /** * Try to resolve the reference offset of the BW element 'elt'. If * it can be resolved immediately, return the reference offset. If * it cannot be resolved immediately, return max value. */ TIndexOffU tryOffset(TIndexOffU elt) const { assert(offs() != NULL); if(elt == _zOff) return 0; if((elt & _eh._offMask) == elt) { TIndexOffU eltOff = elt >> _eh._offRate; assert_lt(eltOff, _eh._offsLen); TIndexOffU off = offs()[eltOff]; assert_neq(OFF_MASK, off); return off; } else { // Try looking at zoff return OFF_MASK; } } /** * Try to resolve the reference offset of the BW element 'elt' such * that the offset returned is at the right-hand side of the * forward reference substring involved in the hit. */ TIndexOffU tryOffset( TIndexOffU elt, bool fw, TIndexOffU hitlen) const { TIndexOffU off = tryOffset(elt); if(off != OFF_MASK && !fw) { assert_lt(off, _eh._len); off = _eh._len - off - 1; assert_geq(off, hitlen-1); off -= (hitlen-1); assert_lt(off, _eh._len); } return off; } /** * Walk 'steps' steps to the left and return the row arrived at. */ TIndexOffU walkLeft(TIndexOffU row, TIndexOffU steps) const; /** * Resolve the reference offset of the BW element 'elt'. */ TIndexOffU getOffset(TIndexOffU row) const; /** * Resolve the reference offset of the BW element 'elt' such that * the offset returned is at the right-hand side of the forward * reference substring involved in the hit. */ TIndexOffU getOffset( TIndexOffU elt, bool fw, TIndexOffU hitlen) const; /** * When using read() to create an Ebwt, we have to set a couple of * additional fields in the Ebwt object that aren't part of the * parameter list and are not stored explicitly in the file. Right * now, this just involves initializing _zEbwtByteOff and * _zEbwtBpOff from _zOff. */ void postReadInit(EbwtParams& eh) { TIndexOffU sideNum = _zOff / eh._sideBwtLen; TIndexOffU sideCharOff = _zOff % eh._sideBwtLen; TIndexOffU sideByteOff = sideNum * eh._sideSz; _zEbwtByteOff = sideCharOff >> 2; assert_lt(_zEbwtByteOff, eh._sideBwtSz); _zEbwtBpOff = sideCharOff & 3; assert_lt(_zEbwtBpOff, 4); _zEbwtByteOff += sideByteOff; assert(repOk(eh)); // Ebwt should be fully initialized now } /** * Given basename of an Ebwt index, read and return its flag. */ static int32_t readFlags(const string& instr); /** * Pretty-print the Ebwt to the given output stream. */ void print(ostream& out) const { print(out, _eh); } /** * Pretty-print the Ebwt and given EbwtParams to the given output * stream. */ void print(ostream& out, const EbwtParams& eh) const { eh.print(out); // print params out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl << " zOff: " << _zOff << endl << " zEbwtByteOff: " << _zEbwtByteOff << endl << " zEbwtBpOff: " << _zEbwtBpOff << endl << " nPat: " << _nPat << endl << " plen: "; if(plen() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << plen()[0] << endl; } out << " rstarts: "; if(rstarts() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << rstarts()[0] << endl; } out << " ebwt: "; if(ebwt() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << ebwt()[0] << endl; } out << " fchr: "; if(fchr() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << fchr()[0] << endl; } out << " ftab: "; if(ftab() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << ftab()[0] << endl; } out << " eftab: "; if(eftab() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << eftab()[0] << endl; } out << " offs: "; if(offs() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << offs()[0] << endl; } } // Building template static TStr join(EList& l, uint32_t seed); template static TStr join(EList& l, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed); template void joinToDisk(EList& l, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2); template void buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut); // I/O void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose); void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const; void writeFromMemory(bool justHeader, const string& out1, const string& out2) const; // Sanity checking void sanityCheckUpToSide(TIndexOff upToSide) const; void sanityCheckAll(int reverse) const; void restore(SString& s) const; void checkOrigs(const EList >& os, bool color, bool mirror) const; // Searching and reporting void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen, bool rejectStraddle, bool& straddled) const; #define WITHIN_BWT_LEN(x) \ assert_leq(x[0], this->_eh._sideBwtLen); \ assert_leq(x[1], this->_eh._sideBwtLen); \ assert_leq(x[2], this->_eh._sideBwtLen); \ assert_leq(x[3], this->_eh._sideBwtLen) #define WITHIN_FCHR(x) \ assert_leq(x[0], this->fchr()[1]); \ assert_leq(x[1], this->fchr()[2]); \ assert_leq(x[2], this->fchr()[3]); \ assert_leq(x[3], this->fchr()[4]) #define WITHIN_FCHR_DOLLARA(x) \ assert_leq(x[0], this->fchr()[1]+1); \ assert_leq(x[1], this->fchr()[2]); \ assert_leq(x[2], this->fchr()[3]); \ assert_leq(x[3], this->fchr()[4]) /** * Count all occurrences of character c from the beginning of the * forward side to and add in the occ[] count up to the side * break just prior to the side. * * A Bowtie 2 side is shaped like: * * XXXXXXXXXXXXXXXX [A] [C] [G] [T] * --------48------ -4- -4- -4- -4- (numbers in bytes) */ inline TIndexOffU countBt2Side(const SideLocus& l, int c) const { assert_range(0, 3, c); assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by); assert_range(0, 3, (int)l._bp); const uint8_t *side = l.side(this->ebwt()); TIndexOffU cCnt = countUpTo(l, c); assert_leq(cCnt, l.toBWRow()); assert_leq(cCnt, this->_eh._sideBwtLen); if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here if((l._sideByteOff + l._by > _zEbwtByteOff) || (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff)) { cCnt--; // Adjust for '$' looking like an 'A' } } TIndexOffU ret; // Now factor in the occ[] count at the side break const uint8_t *acgt8 = side + _eh._sideBwtSz; const TIndexOffU *acgt = reinterpret_cast(acgt8); assert_leq(acgt[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding assert_leq(acgt[1], this->_eh._len); assert_leq(acgt[2], this->_eh._len); assert_leq(acgt[3], this->_eh._len); ret = acgt[c] + cCnt + this->fchr()[c]; #ifndef NDEBUG assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section if(c == 0) { assert_leq(cCnt, this->_eh._sideBwtLen); } else { assert_leq(ret, this->_eh._bwtLen); } #endif return ret; } /** * Count all occurrences of all four nucleotides up to the starting * point (which must be in a forward side) given by 'l' storing the * result in 'cntsUpto', then count nucleotide occurrences within the * range of length 'num' storing the result in 'cntsIn'. Also, keep * track of the characters occurring within the range by setting * 'masks' accordingly (masks[1][10] == true -> 11th character is a * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false. */ inline void countBt2SideRange( SideLocus& l, // top locus TIndexOffU num, // number of elts in range to tall // @double-check TIndexOffU* cntsUpto, // A/C/G/T counts up to top TIndexOffU* cntsIn, // A/C/G/T counts within range EList *masks) const // masks indicating which range elts = A/C/G/T { assert_gt(num, 0); assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by); assert_range(0, 3, (int)l._bp); countUpToEx(l, cntsUpto); WITHIN_FCHR_DOLLARA(cntsUpto); WITHIN_BWT_LEN(cntsUpto); const uint8_t *side = l.side(this->ebwt()); if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here if((l._sideByteOff + l._by > _zEbwtByteOff) || (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff)) { cntsUpto[0]--; // Adjust for '$' looking like an 'A' } } // Now factor in the occ[] count at the side break const TIndexOffU *acgt = reinterpret_cast(side + _eh._sideBwtSz); assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen()); assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]); assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]); assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]); assert_leq(acgt[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(acgt[1], this->_eh._len); assert_leq(acgt[2], this->_eh._len); assert_leq(acgt[3], this->_eh._len); cntsUpto[0] += (acgt[0] + this->fchr()[0]); cntsUpto[1] += (acgt[1] + this->fchr()[1]); cntsUpto[2] += (acgt[2] + this->fchr()[2]); cntsUpto[3] += (acgt[3] + this->fchr()[3]); masks[0].resize(num); masks[1].resize(num); masks[2].resize(num); masks[3].resize(num); WITHIN_FCHR_DOLLARA(cntsUpto); WITHIN_FCHR_DOLLARA(cntsIn); // 'cntsUpto' is complete now. // Walk forward until we've tallied the entire 'In' range TIndexOffU nm = 0; // Rest of this side nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm); assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); assert_leq(nm, num); SideLocus lcopy = l; while(nm < num) { // Subsequent sides, if necessary lcopy.nextSide(this->_eh); nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm); WITHIN_FCHR_DOLLARA(cntsIn); assert_leq(nm, num); assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); } assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); WITHIN_FCHR_DOLLARA(cntsIn); } /** * Count all occurrences of character c from the beginning of the * forward side to and add in the occ[] count up to the side * break just prior to the side. * * A forward side is shaped like: * * [A] [C] XXXXXXXXXXXXXXXX * -4- -4- --------56------ (numbers in bytes) * ^ * Side ptr (result from SideLocus.side()) * * And following it is a reverse side shaped like: * * [G] [T] XXXXXXXXXXXXXXXX * -4- -4- --------56------ (numbers in bytes) * ^ * Side ptr (result from SideLocus.side()) * */ inline void countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const { assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by); assert_range(0, 3, (int)l._bp); countUpToEx(l, arrs); if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here if((l._sideByteOff + l._by > _zEbwtByteOff) || (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff)) { arrs[0]--; // Adjust for '$' looking like an 'A' } } WITHIN_FCHR(arrs); WITHIN_BWT_LEN(arrs); // Now factor in the occ[] count at the side break const uint8_t *side = l.side(this->ebwt()); const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4; const TIndexOffU *acgt = reinterpret_cast(acgt16); assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen()); assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]); assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]); assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]); assert_leq(acgt[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(acgt[1], this->_eh._len); assert_leq(acgt[2], this->_eh._len); assert_leq(acgt[3], this->_eh._len); arrs[0] += (acgt[0] + this->fchr()[0]); arrs[1] += (acgt[1] + this->fchr()[1]); arrs[2] += (acgt[2] + this->fchr()[2]); arrs[3] += (acgt[3] + this->fchr()[3]); WITHIN_FCHR(arrs); } /** * Counts the number of occurrences of character 'c' in the given Ebwt * side up to (but not including) the given byte/bitpair (by/bp). * * This is a performance-critical function. This is the top search- * related hit in the time profile. * * Function gets 11.09% in profile */ inline TIndexOffU countUpTo(const SideLocus& l, int c) const { // @double-check // Count occurrences of c in each 64-bit (using bit trickery); // Someday countInU64() and pop() functions should be // vectorized/SSE-ized in case that helps. TIndexOffU cCnt = 0; const uint8_t *side = l.side(this->ebwt()); int i = 0; #ifdef POPCNT_CAPABILITY if ( _usePOPCNTinstruction) { for(; i + 7 < l._by; i += 8) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } } else { for(; i + 7 < l._by; i += 8) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } } #else for(; i + 7 < l._by; i += 8) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } #endif // Count occurences of c in the rest of the side (using LUT) for(; i < l._by; i++) { cCnt += cCntLUT_4[0][c][side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { cCnt += cCntLUT_4[(int)l._bp][c][side[i]]; } return cCnt; } /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. * * Function gets 2.32% in profile */ #ifdef POPCNT_CAPABILITY template #endif inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) { uint64_t c0 = c_table[0]; uint64_t x0 = dw ^ c0; uint64_t x1 = (x0 >> 1); uint64_t x2 = x1 & (0x5555555555555555llu); uint64_t x3 = x0 & x2; #ifdef POPCNT_CAPABILITY uint64_t tmp = Operation().pop64(x3); #else uint64_t tmp = pop64(x3); #endif arrs[0] += (uint32_t) tmp; c0 = c_table[1]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[1] += (uint32_t) tmp; c0 = c_table[2]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[2] += (uint32_t) tmp; c0 = c_table[3]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[3] += (uint32_t) tmp; } /** * Counts the number of occurrences of all four nucleotides in the * given side up to (but not including) the given byte/bitpair (by/bp). * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc. */ inline void countUpToEx(const SideLocus& l, TIndexOffU* arrs) const { int i = 0; // Count occurrences of each nucleotide in each 64-bit word using // bit trickery; note: this seems does not seem to lend a // significant boost to performance in practice. If you comment // out this whole loop (which won't affect correctness - it will // just cause the following loop to take up the slack) then runtime // does not change noticeably. Someday the countInU64() and pop() // functions should be vectorized/SSE-ized in case that helps. const uint8_t *side = l.side(this->ebwt()); #ifdef POPCNT_CAPABILITY if (_usePOPCNTinstruction) { for(; i+7 < l._by; i += 8) { countInU64Ex(*(uint64_t*)&side[i], arrs); } } else { for(; i+7 < l._by; i += 8) { countInU64Ex(*(uint64_t*)&side[i], arrs); } } #else for(; i+7 < l._by; i += 8) { countInU64Ex(*(uint64_t*)&side[i], arrs); } #endif // Count occurences of nucleotides in the rest of the side (using LUT) // Many cache misses on following lines (~20K) for(; i < l._by; i++) { arrs[0] += cCntLUT_4[0][0][side[i]]; arrs[1] += cCntLUT_4[0][1][side[i]]; arrs[2] += cCntLUT_4[0][2][side[i]]; arrs[3] += cCntLUT_4[0][3][side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]]; arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]]; arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]]; arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]]; } } #ifndef NDEBUG /** * Given top and bot loci, calculate counts of all four DNA chars up to * those loci. Used for more advanced backtracking-search. */ inline void mapLFEx( const SideLocus& l, TIndexOffU *arrs ASSERT_ONLY(, bool overrideSanity = false) ) const { assert_eq(0, arrs[0]); assert_eq(0, arrs[1]); assert_eq(0, arrs[2]); assert_eq(0, arrs[3]); countBt2SideEx(l, arrs); if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(l, 0, true), arrs[0]); assert_eq(mapLF(l, 1, true), arrs[1]); assert_eq(mapLF(l, 2, true), arrs[2]); assert_eq(mapLF(l, 3, true), arrs[3]); } } #endif /** * Given top and bot rows, calculate counts of all four DNA chars up to * those loci. */ inline void mapLFEx( TIndexOffU top, TIndexOffU bot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false) ) const { SideLocus ltop, lbot; SideLocus::initFromTopBot(top, bot, _eh, ebwt(), ltop, lbot); mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity)); } /** * Given top and bot loci, calculate counts of all four DNA chars up to * those loci. Used for more advanced backtracking-search. */ inline void mapLFEx( const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false) ) const { assert(ltop.repOk(this->eh())); assert(lbot.repOk(this->eh())); assert_eq(0, tops[0]); assert_eq(0, bots[0]); assert_eq(0, tops[1]); assert_eq(0, bots[1]); assert_eq(0, tops[2]); assert_eq(0, bots[2]); assert_eq(0, tops[3]); assert_eq(0, bots[3]); countBt2SideEx(ltop, tops); countBt2SideEx(lbot, bots); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(ltop, 0, true), tops[0]); assert_eq(mapLF(ltop, 1, true), tops[1]); assert_eq(mapLF(ltop, 2, true), tops[2]); assert_eq(mapLF(ltop, 3, true), tops[3]); assert_eq(mapLF(lbot, 0, true), bots[0]); assert_eq(mapLF(lbot, 1, true), bots[1]); assert_eq(mapLF(lbot, 2, true), bots[2]); assert_eq(mapLF(lbot, 3, true), bots[3]); } #endif } /** * Counts the number of occurrences of all four nucleotides in the * given side from the given byte/bitpair (l->_by/l->_bp) (or the * beginning of the side if l == 0). Count for 'a' goes in arrs[0], * 'c' in arrs[1], etc. * * Note: must account for $. * * Must fill in masks */ inline TIndexOffU countBt2SideRange2( // @double-check const SideLocus& l, bool startAtLocus, TIndexOffU num, TIndexOffU* arrs, EList *masks, TIndexOffU maskOff) const { assert(!masks[0].empty()); assert_eq(masks[0].size(), masks[1].size()); assert_eq(masks[0].size(), masks[2].size()); assert_eq(masks[0].size(), masks[3].size()); ASSERT_ONLY(TIndexOffU myarrs[4] = {0, 0, 0, 0}); TIndexOffU nm = 0; // number of nucleotides tallied so far int iby = 0; // initial byte offset int ibp = 0; // initial base-pair offset if(startAtLocus) { iby = l._by; ibp = l._bp; } else { // Start at beginning } int by = iby, bp = ibp; assert_lt(bp, 4); assert_lt(by, (int)this->_eh._sideBwtSz); const uint8_t *side = l.side(this->ebwt()); while(nm < num) { int c = (side[by] >> (bp * 2)) & 3; assert_lt(maskOff + nm, masks[c].size()); masks[0][maskOff + nm] = masks[1][maskOff + nm] = masks[2][maskOff + nm] = masks[3][maskOff + nm] = false; assert_range(0, 3, c); // Note: we tally $ just like an A arrs[c]++; // tally it ASSERT_ONLY(myarrs[c]++); masks[c][maskOff + nm] = true; // not dead nm++; if(++bp == 4) { bp = 0; by++; assert_leq(by, (int)this->_eh._sideBwtSz); if(by == (int)this->_eh._sideBwtSz) { // Fell off the end of the side break; } } } WITHIN_FCHR_DOLLARA(arrs); #ifndef NDEBUG if(_sanity) { // Make sure results match up with a call to mapLFEx. TIndexOffU tops[4] = {0, 0, 0, 0}; TIndexOffU bots[4] = {0, 0, 0, 0}; TIndexOffU top = l.toBWRow(); TIndexOffU bot = top + nm; mapLFEx(top, bot, tops, bots, false); assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1); assert_eq(myarrs[1], bots[1] - tops[1]); assert_eq(myarrs[2], bots[2] - tops[2]); assert_eq(myarrs[3], bots[3] - tops[3]); } #endif return nm; } /** * Return the final character in row i (i.e. the i'th character in the * BWT transform). Note that the 'L' in the name of the function * stands for 'last', as in the literature. */ inline int rowL(const SideLocus& l) const { // Extract and return appropriate bit-pair return unpack_2b_from_8b(l.side(this->ebwt())[l._by], l._bp); } /** * Return the final character in row i (i.e. the i'th character in the * BWT transform). Note that the 'L' in the name of the function * stands for 'last', as in the literature. */ inline int rowL(TIndexOffU i) const { // Extract and return appropriate bit-pair SideLocus l; l.initFromRow(i, _eh, ebwt()); return rowL(l); } /** * Given top and bot loci, calculate counts of all four DNA chars up to * those loci. Used for more advanced backtracking-search. */ inline void mapLFRange( SideLocus& ltop, SideLocus& lbot, TIndexOffU num, // Number of elts TIndexOffU* cntsUpto, // A/C/G/T counts up to top TIndexOffU* cntsIn, // A/C/G/T counts within range EList *masks ASSERT_ONLY(, bool overrideSanity = false) ) const { assert(ltop.repOk(this->eh())); assert(lbot.repOk(this->eh())); assert_eq(num, lbot.toBWRow() - ltop.toBWRow()); assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]); assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]); assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]); assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]); countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks); assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion TIndexOffU tops[4] = {0, 0, 0, 0}; TIndexOffU bots[4] = {0, 0, 0, 0}; assert(ltop.repOk(this->eh())); assert(lbot.repOk(this->eh())); mapLFEx(ltop, lbot, tops, bots, false); for(int i = 0; i < 4; i++) { assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]); if(i == 0) { assert(cntsIn[i] == bots[i]-tops[i] || cntsIn[i] == bots[i]-tops[i]+1); } else { assert_eq(cntsIn[i], bots[i]-tops[i]); } } } #endif } /** * Given row i, return the row that the LF mapping maps i to. */ inline TIndexOffU mapLF( const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false) ) const { ASSERT_ONLY(TIndexOffU srcrow = l.toBWRow()); TIndexOffU ret; assert(l.side(this->ebwt()) != NULL); int c = rowL(l); assert_lt(c, 4); assert_geq(c, 0); ret = countBt2Side(l, c); assert_lt(ret, this->_eh._bwtLen); assert_neq(srcrow, ret); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given row i and character c, return the row that the LF mapping maps * i to on character c. */ inline TIndexOffU mapLF( const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false) ) const { TIndexOffU ret; assert_lt(c, 4); assert_geq(c, 0); ret = countBt2Side(l, c); assert_lt(ret, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given top and bot loci, calculate counts of all four DNA chars up to * those loci. Also, update a set of tops and bots for the reverse * index/direction using the idea from the bi-directional BWT paper. */ inline void mapBiLFEx( const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots, TIndexOffU *topsP, // topsP[0] = top TIndexOffU *botsP ASSERT_ONLY(, bool overrideSanity = false) ) const { #ifndef NDEBUG for(int i = 0; i < 4; i++) { assert_eq(0, tops[0]); assert_eq(0, bots[0]); } #endif countBt2SideEx(ltop, tops); countBt2SideEx(lbot, bots); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(ltop, 0, true), tops[0]); assert_eq(mapLF(ltop, 1, true), tops[1]); assert_eq(mapLF(ltop, 2, true), tops[2]); assert_eq(mapLF(ltop, 3, true), tops[3]); assert_eq(mapLF(lbot, 0, true), bots[0]); assert_eq(mapLF(lbot, 1, true), bots[1]); assert_eq(mapLF(lbot, 2, true), bots[2]); assert_eq(mapLF(lbot, 3, true), bots[3]); } #endif // bots[0..3] - tops[0..3] = # of ways to extend the suffix with an // A, C, G, T botsP[0] = topsP[0] + (bots[0] - tops[0]); topsP[1] = botsP[0]; botsP[1] = topsP[1] + (bots[1] - tops[1]); topsP[2] = botsP[1]; botsP[2] = topsP[2] + (bots[2] - tops[2]); topsP[3] = botsP[2]; botsP[3] = topsP[3] + (bots[3] - tops[3]); } /** * Given row and its locus information, proceed on the given character * and return the next row, or all-fs if we can't proceed on that * character. Returns max value if this row ends in $. */ inline TIndexOffU mapLF1( TIndexOffU row, // starting row const SideLocus& l, // locus for starting row int c // character to proceed on ASSERT_ONLY(, bool overrideSanity = false) ) const { if(rowL(l) != c || row == _zOff) return OFF_MASK; TIndexOffU ret; assert_lt(c, 4); assert_geq(c, 0); ret = countBt2Side(l, c); assert_lt(ret, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given row and its locus information, set the row to LF(row) and * return the character that was in the final column. */ inline int mapLF1( TIndexOffU& row, // starting row const SideLocus& l // locus for starting row ASSERT_ONLY(, bool overrideSanity = false) ) const { if(row == _zOff) return -1; int c = rowL(l); assert_range(0, 3, c); row = countBt2Side(l, c); assert_lt(row, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], row); } #endif return c; } #ifndef NDEBUG /// Check that in-memory Ebwt is internally consistent with respect /// to given EbwtParams; assert if not bool inMemoryRepOk(const EbwtParams& eh) const { assert_geq(_zEbwtBpOff, 0); assert_lt(_zEbwtBpOff, 4); assert_lt(_zEbwtByteOff, eh._ebwtTotSz); assert_lt(_zOff, eh._bwtLen); assert_geq(_nFrag, _nPat); return true; } /// Check that in-memory Ebwt is internally consistent; assert if /// not bool inMemoryRepOk() const { return repOk(_eh); } /// Check that Ebwt is internally consistent with respect to given /// EbwtParams; assert if not bool repOk(const EbwtParams& eh) const { assert(_eh.repOk()); if(isInMemory()) { return inMemoryRepOk(eh); } return true; } /// Check that Ebwt is internally consistent; assert if not bool repOk() const { return repOk(_eh); } #endif bool _toBigEndian; int32_t _overrideOffRate; bool _verbose; bool _passMemExc; bool _sanity; bool fw_; // true iff this is a forward index FILE *_in1; // input fd for primary index file FILE *_in2; // input fd for secondary index file string _in1Str; // filename for primary index file string _in2Str; // filename for secondary index file string _inSaStr; // filename for suffix-array file string _inBwtStr; // filename for BWT file TIndexOffU _zOff; TIndexOffU _zEbwtByteOff; TIndexOff _zEbwtBpOff; TIndexOffU _nPat; /// number of reference texts TIndexOffU _nFrag; /// number of fragments APtrWrap _plen; APtrWrap _rstarts; // starting offset of fragments / text indexes // _fchr, _ftab and _eftab are expected to be relatively small // (usually < 1MB, perhaps a few MB if _fchr is particularly large // - like, say, 11). For this reason, we don't bother with writing // them to disk through separate output streams; we APtrWrap _fchr; APtrWrap _ftab; APtrWrap _eftab; // "extended" entries for _ftab // _offs may be extremely large. E.g. for DNA w/ offRate=4 (one // offset every 16 rows), the total size of _offs is the same as // the total size of the input sequence APtrWrap _offs; // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus // is at least as large as the input sequence. APtrWrap _ebwt; bool _useMm; /// use memory-mapped files to hold the index bool useShmem_; /// use shared memory to hold large parts of the index EList _refnames; /// names of the reference sequences char *mmFile1_; char *mmFile2_; EbwtParams _eh; bool packed_; static const TIndexOffU default_bmax = OFF_MASK; static const TIndexOffU default_bmaxMultSqrt = OFF_MASK; static const TIndexOffU default_bmaxDivN = 4; static const int default_dcv = 1024; static const bool default_noDc = false; static const bool default_useBlockwise = true; static const uint32_t default_seed = 0; #ifdef BOWTIE_64BIT_INDEX static const int default_lineRate = 7; #else static const int default_lineRate = 6; #endif static const int default_offRate = 5; static const int default_offRatePlus = 0; static const int default_ftabChars = 10; static const bool default_bigEndian = false; private: ostream& log() const { return cout; // TODO: turn this into a parameter } /// Print a verbose message and flush (flushing is helpful for /// debugging) void verbose(const string& s) const { if(this->verbose()) { this->log() << s.c_str(); this->log().flush(); } } }; /** * Read reference names from an input stream 'in' for an Ebwt primary * file and store them in 'refnames'. */ void readEbwtRefnames(FILE* fin, EList& refnames); /** * Read reference names from the index with basename 'in' and store * them in 'refnames'. */ void readEbwtRefnames(const string& instr, EList& refnames); /** * Read just enough of the Ebwt's header to determine whether it's * colorspace. */ bool readEbwtColor(const string& instr); /** * Read just enough of the Ebwt's header to determine whether it's * entirely reversed. */ bool readEntireReverse(const string& instr); /////////////////////////////////////////////////////////////////////// // // Functions for building Ebwts // /////////////////////////////////////////////////////////////////////// /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. */ template TStr Ebwt::join(EList& l, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); TStr ret; TIndexOffU guessLen = 0; for(TIndexOffU i = 0; i < l.size(); i++) { guessLen += length(l[i]); } ret.resize(guessLen); TIndexOffU off = 0; for(size_t i = 0; i < l.size(); i++) { TStr& s = l[i]; assert_gt(s.length(), 0); for(size_t j = 0; j < s.size(); j++) { ret.set(s[j], off++); } } return ret; } /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. */ template TStr Ebwt::join(EList& l, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); RefReadInParams rpcp = refparams; TStr ret; TIndexOffU guessLen = sztot; ret.resize(guessLen); ASSERT_ONLY(TIndexOffU szsi = 0); TIndexOffU dstoff = 0; for(TIndexOffU i = 0; i < l.size(); i++) { // For each sequence we can pull out of istream l[i]... assert(!l[i]->eof()); bool first = true; while(!l[i]->eof()) { RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp); first = false; TIndexOffU bases = rec.len; assert_eq(rec.off, szs[szsi].off); assert_eq(rec.len, szs[szsi].len); assert_eq(rec.first, szs[szsi].first); ASSERT_ONLY(szsi++); if(bases == 0) continue; } } return ret; } /** * Join several text strings together according to the text-chunking * scheme specified in the EbwtParams. Ebwt fields calculated in this * function are written directly to disk. * * It is assumed, but not required, that the header values have already * been written to 'out1' before this function is called. * * The static member Ebwt::join just returns a joined version of a * list of strings without building any of the auxilliary arrays. */ template void Ebwt::joinToDisk( EList& l, EList& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2) { RefReadInParams rpcp = refparams; assert_gt(szs.size(), 0); assert_gt(l.size(), 0); assert_gt(sztot, 0); // Not every fragment represents a distinct sequence - many // fragments may correspond to a single sequence. Count the // number of sequences here by counting the number of "first" // fragments. this->_nPat = 0; this->_nFrag = 0; for(TIndexOffU i = 0; i < szs.size(); i++) { if(szs[i].len > 0) this->_nFrag++; if(szs[i].first && szs[i].len > 0) this->_nPat++; } assert_gt(this->_nPat, 0); assert_geq(this->_nFrag, this->_nPat); _rstarts.reset(); writeU(out1, this->_nPat, this->toBe()); // Allocate plen[] try { this->_plen.init(new TIndexOffU[this->_nPat], this->_nPat); } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::join()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // For each pattern, set plen TIndexOff npat = -1; for(TIndexOffU i = 0; i < szs.size(); i++) { if(szs[i].first && szs[i].len > 0) { if(npat >= 0) { writeU(out1, this->plen()[npat], this->toBe()); } this->plen()[++npat] = (szs[i].len + szs[i].off); } else { // edge case, but we could get here with npat == -1 // e.g. when building from a reference of all Ns if (npat < 0) npat = 0; this->plen()[npat] += (szs[i].len + szs[i].off); } } assert_eq((TIndexOffU)npat, this->_nPat-1); writeU(out1, this->plen()[npat], this->toBe()); // Write the number of fragments writeU(out1, this->_nFrag, this->toBe()); TIndexOffU seqsRead = 0; ASSERT_ONLY(TIndexOffU szsi = 0); ASSERT_ONLY(TIndexOffU entsWritten = 0); TIndexOffU dstoff = 0; // For each filebuf for(unsigned int i = 0; i < l.size(); i++) { assert(!l[i]->eof()); bool first = true; TIndexOffU patoff = 0; // For each *fragment* (not necessary an entire sequence) we // can pull out of istream l[i]... while(!l[i]->eof()) { string name; // Push a new name onto our vector _refnames.push_back(""); RefRecord rec = fastaRefReadAppend( *l[i], first, ret, dstoff, rpcp, &_refnames.back()); first = false; TIndexOffU bases = rec.len; if(rec.first && rec.len > 0) { if(_refnames.back().length() == 0) { // If name was empty, replace with an index ostringstream stm; stm << seqsRead; _refnames.back() = stm.str(); } } else { // This record didn't actually start a new sequence so // no need to add a name //assert_eq(0, _refnames.back().length()); _refnames.pop_back(); } assert_lt(szsi, szs.size()); assert_eq(rec.off, szs[szsi].off); assert_eq(rec.len, szs[szsi].len); assert_eq(rec.first, szs[szsi].first); assert(rec.first || rec.off > 0); ASSERT_ONLY(szsi++); // Increment seqsRead if this is the first fragment if(rec.first && rec.len > 0) seqsRead++; if(bases == 0) continue; assert_leq(bases, this->plen()[seqsRead-1]); // Reset the patoff if this is the first fragment if(rec.first) patoff = 0; patoff += rec.off; // add fragment's offset from end of last frag. // Adjust rpcps //uint32_t seq = seqsRead-1; ASSERT_ONLY(entsWritten++); // This is where rstarts elements are written to the output stream //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string //writeU32(out1, seq, this->toBe()); // sequence id //writeU32(out1, patoff, this->toBe()); // offset into sequence patoff += bases; } assert_gt(szsi, 0); l[i]->reset(); assert(!l[i]->eof()); #ifndef NDEBUG int c = l[i]->get(); assert_eq('>', c); assert(!l[i]->eof()); l[i]->reset(); assert(!l[i]->eof()); #endif } assert_eq(entsWritten, this->_nFrag); } /** * Build an Ebwt from a string 's' and its suffix array 'sa' (which * might actually be a suffix array *builder* that builds blocks of the * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs * arrays, is written directly to disk. This is by design: keeping * those arrays in memory needlessly increases the footprint of the * building process. Instead, we prefer to build the Ebwt directly * "to disk" and then read it back into memory later as necessary. * * It is assumed that the header values and join-related values (nPat, * plen) have already been written to 'out1' before this function * is called. When this function is finished, it will have * additionally written ebwt, zOff, fchr, ftab and eftab to the primary * file and offs to the secondary file. * * Assume DNA/RNA/any alphabet with 4 or fewer elements. * Assume occ array entries are 32 bits each. * * @param sa the suffix array to convert to a Ebwt * @param s the original string * @param out */ template void Ebwt::buildToDisk( InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut) { const EbwtParams& eh = this->_eh; assert(eh.repOk()); assert_eq(s.length()+1, sa.size()); assert_eq(s.length(), eh._len); assert_gt(eh._lineRate, 3); assert(sa.suffixItrIsReset()); TIndexOffU len = eh._len; TIndexOffU ftabLen = eh._ftabLen; TIndexOffU sideSz = eh._sideSz; TIndexOffU ebwtTotSz = eh._ebwtTotSz; TIndexOffU fchr[] = {0, 0, 0, 0, 0}; EList ftab(EBWT_CAT); TIndexOffU zOff = OFF_MASK; // Save # of occurrences of each character as we walk along the bwt TIndexOffU occ[4] = {0, 0, 0, 0}; TIndexOffU occSave[4] = {0, 0, 0, 0}; // Record rows that should "absorb" adjacent rows in the ftab. // The absorbed rows represent suffixes shorter than the ftabChars // cutoff. uint8_t absorbCnt = 0; EList absorbFtab(EBWT_CAT); try { VMSG_NL("Allocating ftab, absorbFtab"); ftab.resize(ftabLen); ftab.fillZero(); absorbFtab.resize(ftabLen); absorbFtab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating ftab[] or absorbFtab[] " << "in Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. #ifdef SIXTY4_FORMAT EList ebwtSide(EBWT_CAT); #else EList ebwtSide(EBWT_CAT); #endif try { #ifdef SIXTY4_FORMAT ebwtSide.resize(sideSz >> 3); #else ebwtSide.resize(sideSz); #endif } catch(bad_alloc &e) { cerr << "Out of memory allocating ebwtSide[] in " << "Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Points to the base offset within ebwt for the side currently // being written TIndexOffU side = 0; // Whether we're assembling a forward or a reverse bucket bool fw; TIndexOff sideCur = 0; fw = true; // Have we skipped the '$' in the last column yet? ASSERT_ONLY(bool dollarSkipped = false); TIndexOffU si = 0; // string offset (chars) ASSERT_ONLY(TIndexOffU lastSufInt = 0); ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix // array (as opposed to the padding at the // end) // Iterate over packed bwt bytes VMSG_NL("Entering Ebwt loop"); ASSERT_ONLY(TIndexOffU beforeEbwtOff = (TIndexOffU)out1.tellp()); // @double-check - pos_type, std::streampos // First integer in the suffix-array output file is the length of the // array, including $ if(saOut != NULL) { // Write length word writeU(*saOut, len+1, this->toBe()); } // First integer in the BWT output file is the length of BWT(T), including $ if(bwtOut != NULL) { // Write length word writeU(*bwtOut, len+1, this->toBe()); } while(side < ebwtTotSz) { // Sanity-check our cursor into the side buffer assert_geq(sideCur, 0); assert_lt(sideCur, (int)eh._sideBwtSz); assert_eq(0, side % sideSz); // 'side' must be on side boundary ebwtSide[sideCur] = 0; // clear assert_lt(side + sideCur, ebwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT #ifdef SIXTY4_FORMAT for(int bpi = 0; bpi < 32; bpi++, si++) #else for(int bpi = 0; bpi < 4; bpi++, si++) #endif { int bwtChar; bool count = true; if(si <= len) { // Still in the SA; extract the bwtChar TIndexOffU saElt = sa.nextSuffix(); // Write it to the optional suffix-array output file if(saOut != NULL) { writeU(*saOut, saElt, this->toBe()); } // TODO: what exactly to write to the BWT output file? How to // represent $? How to pack nucleotides into bytes/words? // (that might have triggered sa to calc next suf block) if(saElt == 0) { // Don't add the '$' in the last column to the BWT // transform; we can't encode a $ (only A C T or G) // and counting it as, say, an A, will mess up the // LR mapping bwtChar = 0; count = false; ASSERT_ONLY(dollarSkipped = true); zOff = si; // remember the SA row that // corresponds to the 0th suffix } else { bwtChar = (int)(s[saElt-1]); assert_lt(bwtChar, 4); // Update the fchr fchr[bwtChar]++; } // Update ftab if((len-saElt) >= (TIndexOffU)eh._ftabChars) { // Turn the first ftabChars characters of the // suffix into an integer index into ftab. The // leftmost (lowest index) character of the suffix // goes in the most significant bit pair if the // integer. TIndexOffU sufInt = 0; for(int i = 0; i < eh._ftabChars; i++) { sufInt <<= 2; assert_lt((TIndexOffU)i, len-saElt); sufInt |= (unsigned char)(s[saElt+i]); } // Assert that this prefix-of-suffix is greater // than or equal to the last one (true b/c the // suffix array is sorted) #ifndef NDEBUG if(lastSufInt > 0) assert_geq(sufInt, lastSufInt); lastSufInt = sufInt; #endif // Update ftab assert_lt(sufInt+1, ftabLen); ftab[sufInt+1]++; if(absorbCnt > 0) { // Absorb all short suffixes since the last // transition into this transition absorbFtab[sufInt] = absorbCnt; absorbCnt = 0; } } else { // Otherwise if suffix is fewer than ftabChars // characters long, then add it to the 'absorbCnt'; // it will be absorbed into the next transition assert_lt(absorbCnt, 255); absorbCnt++; } // Suffix array offset boundary? - update offset array if((si & eh._offMask) == si) { assert_lt((si >> eh._offRate), eh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory writeU(out2, saElt, this->toBe()); } } else { // Strayed off the end of the SA, now we're just // padding out a bucket #ifndef NDEBUG if(inSA) { // Assert that we wrote all the characters in the // string before now assert_eq(si, len+1); inSA = false; } #endif // 'A' used for padding; important that padding be // counted in the occ[] array bwtChar = 0; } if(count) occ[bwtChar]++; // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi); assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar); #endif } else { // Backward bucket: fill from most to least #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi); assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); #endif } } // end loop over bit-pairs assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); #ifdef SIXTY4_FORMAT assert_eq(0, si & 31); #else assert_eq(0, si & 3); #endif sideCur++; if(sideCur == (int)eh._sideBwtSz) { sideCur = 0; TIndexOffU *cpptr = reinterpret_cast(ebwtSide.ptr()); // Write 'A', 'C', 'G' and 'T' tallies side += sideSz; assert_leq(side, eh._ebwtTotSz); #ifdef BOWTIE_64BIT_INDEX cpptr[(sideSz >> 3)-4] = endianizeU(occSave[0], this->toBe()); cpptr[(sideSz >> 3)-3] = endianizeU(occSave[1], this->toBe()); cpptr[(sideSz >> 3)-2] = endianizeU(occSave[2], this->toBe()); cpptr[(sideSz >> 3)-1] = endianizeU(occSave[3], this->toBe()); #else cpptr[(sideSz >> 2)-4] = endianizeU(occSave[0], this->toBe()); cpptr[(sideSz >> 2)-3] = endianizeU(occSave[1], this->toBe()); cpptr[(sideSz >> 2)-2] = endianizeU(occSave[2], this->toBe()); cpptr[(sideSz >> 2)-1] = endianizeU(occSave[3], this->toBe()); #endif occSave[0] = occ[0]; occSave[1] = occ[1]; occSave[2] = occ[2]; occSave[3] = occ[3]; // Write backward side to primary file out1.write((const char *)ebwtSide.ptr(), sideSz); } } VMSG_NL("Exited Ebwt loop"); assert_neq(zOff, OFF_MASK); if(absorbCnt > 0) { // Absorb any trailing, as-yet-unabsorbed short suffixes into // the last element of ftab absorbFtab[ftabLen-1] = absorbCnt; } // Assert that our loop counter got incremented right to the end assert_eq(side, eh._ebwtTotSz); // Assert that we wrote the expected amount to out1 assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // @double-check - pos_type // assert that the last thing we did was write a forward bucket // // Write zOff to primary stream // writeU(out1, zOff, this->toBe()); // // Finish building fchr // // Exclusive prefix sum on fchr for(int i = 1; i < 4; i++) { fchr[i] += fchr[i-1]; } assert_eq(fchr[3], len); // Shift everybody up by one for(int i = 4; i >= 1; i--) { fchr[i] = fchr[i-1]; } fchr[0] = 0; if(_verbose) { for(int i = 0; i < 5; i++) cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl; } // Write fchr to primary file for(int i = 0; i < 5; i++) { writeU(out1, fchr[i], this->toBe()); } // // Finish building ftab and build eftab // // Prefix sum on ftable TIndexOffU eftabLen = 0; assert_eq(0, absorbFtab[0]); for(TIndexOffU i = 1; i < ftabLen; i++) { if(absorbFtab[i] > 0) eftabLen += 2; } assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2); eftabLen = eh._ftabChars*2; EList eftab(EBWT_CAT); try { eftab.resize(eftabLen); eftab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } TIndexOffU eftabCur = 0; for(TIndexOffU i = 1; i < ftabLen; i++) { TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1); if(absorbFtab[i] > 0) { // Skip a number of short pattern indicated by absorbFtab[i] TIndexOffU hi = lo + absorbFtab[i]; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab assert_eq(lo, Ebwt::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); assert_eq(hi, Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); } else { ftab[i] = lo; } } assert_eq(Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1); // Write ftab to primary file for(TIndexOffU i = 0; i < ftabLen; i++) { writeU(out1, ftab[i], this->toBe()); } // Write eftab to primary file for(TIndexOffU i = 0; i < eftabLen; i++) { writeU(out1, eftab[i], this->toBe()); } // Note: if you'd like to sanity-check the Ebwt, you'll have to // read it back into memory first! assert(!isInMemory()); VMSG_NL("Exiting Ebwt::buildToDisk()"); } /** * Try to find the Bowtie index specified by the user. First try the * exact path given by the user. Then try the user-provided string * appended onto the path of the "indexes" subdirectory below this * executable, then try the provided string appended onto * "$BOWTIE2_INDEXES/". */ string adjustEbwtBase(const string& cmdline, const string& ebwtFileBase, bool verbose); extern string gLastIOErrMsg; /* Checks whether a call to read() failed or not. */ inline bool is_read_err(int fdesc, ssize_t ret, size_t count){ if (ret < 0) { std::stringstream sstm; sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl; gLastIOErrMsg = sstm.str(); return true; } return false; } /* Checks whether a call to fread() failed or not. */ inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){ if (ferror(file_hd)) { gLastIOErrMsg = "Error Reading File!"; return true; } return false; } #endif /*EBWT_H_*/