diff options
author | Alexandre Mestiashvili <alex@biotec.tu-dresden.de> | 2013-02-19 17:29:57 +0100 |
---|---|---|
committer | Alexandre Mestiashvili <alex@biotec.tu-dresden.de> | 2013-02-19 17:29:57 +0100 |
commit | 9c8852ed171277357406985c54c6a339eef2f28d (patch) | |
tree | 4dcb7e514b9e2f18478a70c93553edf190bbf94d | |
parent | 2507f33d9874eb98ecc841c9a6e3d0ab1feb1a04 (diff) |
Imported Upstream version 2.0.6
-rw-r--r-- | Makefile | 25 | ||||
-rw-r--r-- | NEWS | 14 | ||||
-rw-r--r-- | VERSION | 2 | ||||
-rw-r--r-- | aligner_driver.cpp | 11 | ||||
-rw-r--r-- | aligner_result.cpp | 10 | ||||
-rw-r--r-- | aligner_result.h | 8 | ||||
-rw-r--r-- | aligner_seed2.h | 543 | ||||
-rw-r--r-- | aligner_sw.cpp | 4 | ||||
-rw-r--r-- | aligner_sw_driver.cpp | 8 | ||||
-rwxr-xr-x | bowtie2 | 92 | ||||
-rw-r--r-- | ds.h | 7 | ||||
-rw-r--r-- | ival_list.h | 4 | ||||
-rw-r--r-- | pat.cpp | 31 | ||||
-rw-r--r-- | ref_coord.h | 36 | ||||
-rw-r--r-- | spinlock.h | 4 | ||||
-rw-r--r-- | unique.h | 6 |
16 files changed, 628 insertions, 177 deletions
@@ -34,14 +34,18 @@ BOWTIE_SHARED_MEM = 0 # Detect Cygwin or MinGW WINDOWS = 0 +CYGWIN = 0 +MINGW = 0 ifneq (,$(findstring CYGWIN,$(shell uname))) -WINDOWS = 1 +WINDOWS = 1 +CYGWIN = 1 # POSIX memory-mapped files not currently supported on Windows BOWTIE_MM = 0 BOWTIE_SHARED_MEM = 0 else ifneq (,$(findstring MINGW,$(shell uname))) WINDOWS = 1 +MINGW = 1 # POSIX memory-mapped files not currently supported on Windows BOWTIE_MM = 0 BOWTIE_SHARED_MEM = 0 @@ -66,19 +70,22 @@ PTHREAD_LIB = PTHREAD_DEF = ifeq (1,$(BOWTIE_PTHREADS)) PTHREAD_DEF = -DBOWTIE_PTHREADS -ifeq (1,$(WINDOWS)) -# pthreads for windows forces us to be specific about the library +PTHREAD_LIB = -lpthread +ifeq (1,$(MINGW)) +# pthreads for windows under mingw forces us to be specific about the library PTHREAD_LIB = -lpthreadGC2 PTHREAD_PKG = pthreadGC2.dll -else -# There's also -pthread, but that only seems to work on Linux -PTHREAD_LIB = -lpthread endif endif LIBS = SEARCH_LIBS = $(PTHREAD_LIB) BUILD_LIBS = +INSPECT_LIBS = +ifeq (1,$(MINGW)) +BUILD_LIBS = $(PTHREAD_LIB) +INSPECT_LIBS = $(PTHREAD_LIB) +endif SHARED_CPPS = ccnt_lut.cpp ref_read.cpp alphabet.cpp shmem.cpp \ edit.cpp bt2_idx.cpp bt2_io.cpp bt2_util.cpp \ @@ -205,7 +212,7 @@ bowtie2-build: bt2_build.cpp $(SHARED_CPPS) $(HEADERS) -o $@ $< \ $(SHARED_CPPS) $(BUILD_CPPS_MAIN) \ $(LIBS) $(BUILD_LIBS) - + bowtie2-build-debug: bt2_build.cpp $(SHARED_CPPS) $(HEADERS) $(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \ $(DEFS) -DBOWTIE2 -Wall \ @@ -246,7 +253,7 @@ bowtie2-inspect: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS) $(INC) -I . \ -o $@ $< \ $(SHARED_CPPS) \ - $(LIBS) + $(LIBS) $(INSPECT_LIBS) bowtie2-inspect-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS) $(CXX) $(DEBUG_FLAGS) \ @@ -255,7 +262,7 @@ bowtie2-inspect-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS) $(INC) -I . \ -o $@ $< \ $(SHARED_CPPS) \ - $(LIBS) + $(LIBS) $(INSPECT_LIBS) .PHONY: bowtie2-src bowtie2-src: $(SRC_PKG_LIST) @@ -3,7 +3,7 @@ Bowtie 2 NEWS Bowtie 2 is now available for download from the project website, http://bowtie-bio.sf.net/bowtie2. 2.0.0-beta1 is the first version released to -the public and 2.0.5 is the latest version. Bowtie 2 is licensed under +the public and 2.0.6 is the latest version. Bowtie 2 is licensed under the GPLv3 license. See `LICENSE' file for details. Reporting Issues @@ -16,6 +16,18 @@ Please report any issues using the Sourceforge bug tracker: Version Release History ======================= +Version 2.0.6 - January 27, 2013 + * Fixed issue whereby spurious output would be written in --no-unal mode. + * Fixed issue whereby multiple input files combined with --reorder would + cause truncated output and a memory spike. + * Fixed spinlock datatype for Win64 API (LLP64 data model) which made it + crash when compiled under Windows 7 x64. + * Fixed bowtie2 wrapper to hadle filename/paths operations in a more + platform independent manner. + * Added pthread as a default library option under cygwin, and pthreadGC + for MinGW. + * Fixed some minor issue that made MinGW compilation fail. + Version 2.0.5 - January 4, 2013 * Fixed an issue that would cause excessive memory allocation when aligning to very repetitive genomes. @@ -1 +1 @@ -2.0.5
\ No newline at end of file +2.0.6 diff --git a/aligner_driver.cpp b/aligner_driver.cpp index e4227d6..b468780 100644 --- a/aligner_driver.cpp +++ b/aligner_driver.cpp @@ -132,8 +132,13 @@ int AlignerDriver::go( //cerr << iter << ". DESCENT_DRIVER_MEM" << endl; break; } else if(ret == DESCENT_DRIVER_STRATA) { + // DESCENT_DRIVER_STRATA is returned by DescentDriver.advance() + // when it has finished with a "non-empty" stratum: a stratum + // in which at least one alignment was found. Here we report + // the alignments in an arbitrary order. AlnRes res; - //cerr << iter << ". DESCENT_DRIVER_STRATA" << endl; + // Initialize alignment selector with the DescentDriver's + // alignment sink alsel_.init( dr1_.query(), dr1_.sink(), @@ -162,8 +167,10 @@ int AlignerDriver::go( raw_refbuf_, raw_destU32_, raw_matches_)); + // Get reference interval involved in alignment Interval refival(res.refid(), 0, res.fw(), res.reflen()); assert_gt(res.refExtent(), 0); + // Does alignment falls off end of reference? if(gReportOverhangs && !refival.containsIgnoreOrient(res.refival())) { @@ -182,7 +189,7 @@ int AlignerDriver::go( if(red1_.overlap(res)) { continue; // yes, redundant } - red1_.add(res); + red1_.add(res); // so we find subsequent redundancies // Report an unpaired alignment assert(!sink.state().doneWithMate(true)); assert(!sink.maxed()); diff --git a/aligner_result.cpp b/aligner_result.cpp index a03abb6..d554208 100644 --- a/aligner_result.cpp +++ b/aligner_result.cpp @@ -36,8 +36,8 @@ void AlnRes::reset() { ned_.clear(); aed_.clear(); score_.invalidate(); - refcoord_.invalidate(); - refival_.invalidate(); + refcoord_.reset(); + refival_.reset(); shapeSet_ = false; rdlen_ = 0; reflen_ = 0; @@ -61,8 +61,8 @@ void AlnRes::reset() { nuc5p_ = 0; nuc3p_ = 0; fraglenSet_ = false; - assert(!refcoord_.valid()); - assert(!refival_.valid()); + assert(!refcoord_.inited()); + assert(!refival_.inited()); } /** @@ -442,7 +442,7 @@ bool AlnRes::matchesRef( { assert(!empty()); assert(repOk()); - assert(refcoord_.valid()); + assert(refcoord_.inited()); bool fw = refcoord_.fw(); // Adjust reference string length according to edits size_t refallen = refNucExtent(); diff --git a/aligner_result.h b/aligner_result.h index a5b08ab..b91c419 100644 --- a/aligner_result.h +++ b/aligner_result.h @@ -836,8 +836,8 @@ public: if(!VALID_AL_SCORE(score_)) { assert(ned_.empty()); assert(aed_.empty()); - assert(!refcoord_.valid()); - assert(!refival_.valid()); + assert(!refcoord_.inited()); + assert(!refival_.inited()); return true; } else { return false; @@ -1081,8 +1081,8 @@ public: assert(refival_.repOk()); assert(VALID_AL_SCORE(score_) || ned_.empty()); assert(VALID_AL_SCORE(score_) || aed_.empty()); - assert(empty() || refcoord_.valid()); - assert(empty() || refival_.valid()); + assert(empty() || refcoord_.inited()); + assert(empty() || refival_.inited()); assert_geq(rdexrows_, rdextent_); assert(empty() || rdextent_ > 0); assert(empty() || rfextent_ > 0); diff --git a/aligner_seed2.h b/aligner_seed2.h index 0b095ca..0dd5027 100644 --- a/aligner_seed2.h +++ b/aligner_seed2.h @@ -1412,6 +1412,74 @@ struct DescentAlignment { }; /** + * A partial alignment result from a Descent where the reference offset has + * been resolved. + */ +struct DescentPartialResolvedAlignment { + + DescentPartialResolvedAlignment() { reset(); } + + /** + * Reset DescentAlignment to be uninitialized. + */ + void reset() { + topf = botf = 0; + pen = 0; + fw = false; + ei = en = 0; + refcoord.reset(); + } + + /** + * Initialize this DescentAlignment. + */ + void init( + TScore pen_, + bool fw_, + TIndexOff topf_, + TIndexOff botf_, + size_t ei_, + size_t en_, + const Coord& refcoord_) + { + assert_gt(botf_, topf_); + pen = pen_; + fw = fw_; + topf = topf_; + botf = botf_; + ei = ei_; + en = en_; + refcoord = refcoord_; + } + + /** + * Return true iff DescentAlignment is initialized. + */ + bool inited() const { + return botf > topf; + } + + /** + * Return the number of elements in this range. + */ + size_t size() const { + return botf - topf; + } + + TScore pen; // score + + bool fw; // forward or revcomp aligned? + + TIndexOff topf; // top in forward index + TIndexOff botf; // bot in forward index + + size_t ei; // First edit in DescentAlignmentSink::edits_ involved in aln + size_t en; // # edits in DescentAlignmentSink::edits_ involved in aln + + Coord refcoord; // reference coord of leftmost ref char involved +}; + +/** * Class that accepts alignments found during descent and maintains the state * required to dispense them to consumers in an appropriate order. * @@ -1510,11 +1578,12 @@ public: * 'al' and 'off', information about the element in terms of the range it's * part of and its offset into that range. */ - void elt(size_t i, DescentAlignment& al, size_t& off) const { + void elt(size_t i, DescentAlignment& al, size_t& ri, size_t& off) const { assert_lt(i, nelt()); for(size_t j = 0; j < als_.size(); j++) { if(i < als_[j].size()) { al = als_[j]; + ri = j; off = i; return; } @@ -1531,15 +1600,17 @@ public: } /** - * Return the number of alignment strata where (a) we found an alignment, - * and (b) the penalty is better than the penalty associated with the best - * heap node, which is passed in as 'best'. + * Return true iff (a) we found an alignment since the sink was initialized + * or since the last time advanceStratum() was called, and (b) the penalty + * associated with the current-best task on the heap ('best') is worse + * (higher) than the penalty associated with the alignments found most + * recently (worstPen_). */ - size_t stratumDone(TAlScore best) const { - if(nelt_ > 0 && best > worstPen_) { - return 1; + bool stratumDone(TAlScore bestPen) const { + if(nelt_ > 0 && bestPen > worstPen_) { + return true; } - return 0; + return false; } /** @@ -1597,6 +1668,144 @@ protected: }; /** + * Class that aggregates partial alignments taken from a snapshot of the + * DescentDriver heap. + */ +class DescentPartialResolvedAlignmentSink { + +public: + + /** + * Reset to uninitialized state. + */ + void reset() { + edits_.clear(); + als_.clear(); + nelt_ = 0; + bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max(); + } + + /** + * Return the total size occupued by the Descent driver and all its + * constituent parts. + */ + size_t totalSizeBytes() const { + return edits_.totalSizeBytes() + + als_.totalSizeBytes() + + sizeof(size_t); + } + + /** + * Return the total capacity of the Descent driver and all its constituent + * parts. + */ + size_t totalCapacityBytes() const { + return edits_.totalCapacityBytes() + + als_.totalCapacityBytes() + + sizeof(size_t); + } + + /** + * Return the number of SA ranges involved in hits. + */ + size_t nrange() const { + return als_.size(); + } + + /** + * Return the number of SA elements involved in hits. + */ + size_t nelt() const { + return nelt_; + } + + /** + * The caller provides 'i', which is an offset of a particular element in + * one of the SA ranges in the current stratum. This function returns, in + * 'al' and 'off', information about the element in terms of the range it's + * part of and its offset into that range. + */ + void elt(size_t i, DescentPartialResolvedAlignment& al, size_t& ri, size_t& off) const { + assert_lt(i, nelt()); + for(size_t j = 0; j < als_.size(); j++) { + if(i < als_[j].size()) { + al = als_[j]; + ri = j; + off = i; + return; + } + i -= als_[j].size(); + } + assert(false); + } + + /** + * Get a particular alignment. + */ + const DescentPartialResolvedAlignment& operator[](size_t i) const { + return als_[i]; + } + + /** + * Return true iff (a) we found an alignment since the sink was initialized + * or since the last time advanceStratum() was called, and (b) the penalty + * associated with the current-best task on the heap ('best') is worse + * (higher) than the penalty associated with the alignments found most + * recently (worstPen_). + */ + bool stratumDone(TAlScore bestPen) const { + if(nelt_ > 0 && bestPen > worstPen_) { + return true; + } + return false; + } + + /** + * The alignment consumer calls this to indicate that they are done with + * all the alignments in the current best non-empty stratum. We can + * therefore mark all those alignments as "reported" and start collecting + * results for the next stratum. + */ + void advanceStratum() { + assert_gt(nelt_, 0); + edits_.clear(); + als_.clear(); + nelt_ = 0; + bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max(); + } + +#ifndef NDEBUG + /** + * Check that partial alignment sink is internally consistent. + */ + bool repOk() const { + assert_geq(nelt_, als_.size()); + //for(size_t i = 1; i < als_.size(); i++) { + // assert_geq(als_[i].pen, als_[i-1].pen); + //} + assert(bestPen_ == std::numeric_limits<TAlScore>::max() || worstPen_ >= bestPen_); + return true; + } +#endif + + TAlScore bestPenalty() const { return bestPen_; } + TAlScore worstPenalty() const { return worstPen_; } + + size_t editsSize() const { return edits_.size(); } + size_t alsSize() const { return als_.size(); } + + const EList<Edit>& edits() const { return edits_; } + +protected: + + EList<Edit> edits_; + EList<DescentPartialResolvedAlignment> als_; + size_t nelt_; + TAlScore bestPen_; // best (smallest) penalty among as-yet-unreported alns + TAlScore worstPen_; // worst (greatest) penalty among as-yet-unreported alns +}; + +/** * Abstract parent for classes that select descent roots and descent * configurations given information about the read. */ @@ -1915,6 +2124,9 @@ public: RandomSource& rnd, // pseudo-random generator for sampling rows WalkMetrics& met) { + // We're going to sample from space of *alignments*, not ranges. So + // when we extract a sample, we'll have to do a little extra work to + // convert it to a <range, offset> coordinate. rnd_.init( sink.nelt(), // # elements to choose from true); // without replacement @@ -1957,59 +2169,49 @@ public: WalkMetrics& met, PerReadMetrics& prm) { + // Sample one alignment randomly from pool of remaining alignments size_t ri = (size_t)rnd_.next(rnd); - DescentAlignment al; size_t off = 0; - dr.sink().elt(ri, al, off); + DescentAlignment al; + size_t rangei = 0; + // Convert random alignment index into a <range, offset> coordinate + dr.sink().elt(ri, al, rangei, off); assert_lt(off, al.size()); Coord refcoord; WalkResult wr; - size_t ri_left = ri; uint32_t tidx = 0, toff = 0, tlen = 0; - for(size_t i = 0; i < gws_.size(); i++) { - if(ri_left < sas_[i].size()) { - gws_[i].advanceElement( - (uint32_t)ri_left, - ebwtFw, // forward Bowtie index for walking left - ref, // bitpair-encoded reference - sas_[i], // SA range with offsets - gwstate_, // GroupWalk state; scratch space - wr, // put the result here - met, // metrics - prm); // per-read metrics - assert_neq(0xffffffff, wr.toff); - bool straddled = false; - ebwtFw.joinedToTextOff( - wr.elt.len, - wr.toff, - tidx, - toff, - tlen, - true, // reject straddlers? - straddled); // straddled? - if(tidx == 0xffffffff) { - // The seed hit straddled a reference boundary so the seed - // hit isn't valid - return false; - } - // Coordinate of the seed hit w/r/t the pasted reference string - refcoord.init(tidx, (int64_t)toff, dr.sink()[i].fw); - break; - } - ri_left -= sas_[i].size(); + gws_[rangei].advanceElement( + (uint32_t)off, + ebwtFw, // forward Bowtie index for walking left + ref, // bitpair-encoded reference + sas_[rangei], // SA range with offsets + gwstate_, // GroupWalk state; scratch space + wr, // put the result here + met, // metrics + prm); // per-read metrics + assert_neq(0xffffffff, wr.toff); + bool straddled = false; + ebwtFw.joinedToTextOff( + wr.elt.len, + wr.toff, + tidx, + toff, + tlen, + true, // reject straddlers? + straddled); // straddled? + if(tidx == 0xffffffff) { + // The seed hit straddled a reference boundary so the seed + // hit isn't valid + return false; } + // Coordinate of the seed hit w/r/t the pasted reference string + refcoord.init(tidx, (int64_t)toff, dr.sink()[rangei].fw); const EList<Edit>& edits = dr.sink().edits(); size_t ns = 0, ngap = 0, nrefn = 0; for(size_t i = al.ei; i < al.ei + al.en; i++) { - if(edits[i].qchr == 'N' || edits[i].chr == 'N') { - ns++; - } - if(edits[i].chr == 'N') { - nrefn++; - } - if(edits[i].isGap()) { - ngap++; - } + if(edits[i].qchr == 'N' || edits[i].chr == 'N') ns++; + if(edits[i].chr == 'N') nrefn++; + if(edits[i].isGap()) ngap++; } AlnScore asc( -dr.sink().bestPenalty(), // numeric score @@ -2018,26 +2220,26 @@ public: rs.init( dr.query().length(), // # chars after hard trimming asc, // alignment score - &dr.sink().edits(), - al.ei, - al.en, - NULL, - 0, - 0, - refcoord, // leftmost ref pos of 1st al char - tlen, // length of reference aligned to - -1, - -1, - -1, - dr.minScore(), - -1, // nuc5p - -1, // nuc3p - false, // soft pre-trimming? - 0, // 5p pre-trimming - 0, // 3p pre-trimming - false, // soft trimming? - 0, // 5p trimming - 0); // 3p trimming + &dr.sink().edits(), // nucleotide edits array + al.ei, // nucleotide edits first pos + al.en, // nucleotide edits last pos + NULL, // ambig base array + 0, // ambig base first pos + 0, // ambig base last pos + refcoord, // coord of leftmost aligned char in ref + tlen, // length of reference aligned to + -1, // # seed mms allowed + -1, // seed length + -1, // seed interval + dr.minScore(), // minimum score for valid alignment + -1, // nuc5p (for colorspace) + -1, // nuc3p (for colorspace) + false, // soft pre-trimming? + 0, // 5p pre-trimming + 0, // 3p pre-trimming + false, // soft trimming? + 0, // 5p trimming + 0); // 3p trimming rs.setRefNs(nrefn); return true; } @@ -2080,4 +2282,203 @@ protected: GroupWalkState gwstate_; }; +/** + * Selects and prioritizes partial alignments from the heap of the + * DescentDriver. We assume that the heap is no longer changing (i.e. that the + * DescentDriver is done). Usually, the user will then attempt to extend the + * partial alignments into full alignments. This can happen incrementally; + * that is, the user might ask for the partial alignments one "batch" at a + * time, and the selector will only do as much work is necessary to supply each + * requesteded batch. + * + * The actual work done here includes: (a) scanning the heap for high-priority + * partial alignments, (b) setting up the rnd_, offs_, sas_, gws_, and gwstate_ + * fields and resolving offsets of partial alignments, (c) packaging and + * delivering batches of results to the caller. + * + * How to prioritize partial alignments? One idea is to use the same + * penalty-based prioritization used in the heap. This has pros: (a) maintains + * the guarantee that we're visiting alignments in best-to-worst order in + * end-to-end alignment mode, (b) the heap is already prioritized this way, so + * it's easier for us to compile high-priority partial alignments. But the con + * is that it doesn't take depth into account, which could mean that we're + * extending a lot of very short partial alignments first. + * + * A problem we should keep in mind is that some + */ +class DescentPartialAlignmentSelector { + +public: + + DescentPartialAlignmentSelector() : gwstate_(GW_CAT) { reset(); } + + /** + * Initialize a new selector w/r/t a read, index and heap of partial + * alignments. + */ + void init( + const Read& q, // read + const EHeap<TDescentPair>& heap, // the heap w/ the partial alns + TAlScore depthBonus, // use depth when prioritizing + size_t nbatch, // # of alignments in a batch + const Ebwt& ebwtFw, // forward Bowtie index for walk-left + const BitPairReference& ref, // bitpair-encoded reference + RandomSource& rnd, // pseudo-randoms for sampling rows + WalkMetrics& met) // metrics re: offset resolution + { + // Make our internal heap + if(depthBonus > 0) { + heap_.clear(); + for(size_t i = 0; i < heap.size(); i++) { + TDescentPair p = heap[i]; + p.first.pen += depthBonus * p.first.depth; + heap_.insert(p); + } + } else { + heap_ = heap; + } +#if 0 + // We're going to sample from space of *alignments*, not ranges. So + // when we extract a sample, we'll have to do a little extra work to + // convert it to a <range, offset> coordinate. + rnd_.init( + sink.nelt(), // # elements to choose from + true); // without replacement + offs_.resize(sink.nelt()); + offs_.fill(std::numeric_limits<TIndexOff>::max()); + sas_.resize(sink.nrange()); + gws_.resize(sink.nrange()); + size_t ei = 0; + for(size_t i = 0; i < sas_.size(); i++) { + size_t en = sink[i].botf - sink[i].topf; + sas_[i].init(sink[i].topf, q.length(), EListSlice<TIndexOff, 16>(offs_, ei, en)); + gws_[i].init(ebwtFw, ref, sas_[i], rnd, met); + ei += en; + } +#endif + } + + /** + * + */ + void compileBatch() { + } + + /** + * Reset the selector. + */ + void reset() { + heap_.clear(); + } + + /** + * Return true iff the selector is currently initialized. + */ + bool inited() const { + return !heap_.empty(); + } + + /** + * Get next alignment and convert it to an AlnRes. + */ + bool next( + const DescentDriver& dr, + const Ebwt& ebwtFw, // forward Bowtie index for walking left + const BitPairReference& ref, // bitpair-encoded reference + RandomSource& rnd, + AlnRes& rs, + WalkMetrics& met, + PerReadMetrics& prm) + { + // Sample one alignment randomly from pool of remaining alignments + size_t ri = (size_t)rnd_.next(rnd); + size_t off = 0; + DescentAlignment al; + size_t rangei = 0; + // Convert random alignment index into a <range, offset> coordinate + dr.sink().elt(ri, al, rangei, off); + assert_lt(off, al.size()); + Coord refcoord; + WalkResult wr; + uint32_t tidx = 0, toff = 0, tlen = 0; + gws_[rangei].advanceElement( + (uint32_t)off, + ebwtFw, // forward Bowtie index for walking left + ref, // bitpair-encoded reference + sas_[rangei], // SA range with offsets + gwstate_, // GroupWalk state; scratch space + wr, // put the result here + met, // metrics + prm); // per-read metrics + assert_neq(0xffffffff, wr.toff); + bool straddled = false; + ebwtFw.joinedToTextOff( + wr.elt.len, + wr.toff, + tidx, + toff, + tlen, + true, // reject straddlers? + straddled); // straddled? + if(tidx == 0xffffffff) { + // The seed hit straddled a reference boundary so the seed + // hit isn't valid + return false; + } + // Coordinate of the seed hit w/r/t the pasted reference string + refcoord.init(tidx, (int64_t)toff, dr.sink()[rangei].fw); + const EList<Edit>& edits = dr.sink().edits(); + size_t ns = 0, ngap = 0, nrefn = 0; + for(size_t i = al.ei; i < al.ei + al.en; i++) { + if(edits[i].qchr == 'N' || edits[i].chr == 'N') ns++; + if(edits[i].chr == 'N') nrefn++; + if(edits[i].isGap()) ngap++; + } + return true; + } + + /** + * Return true iff all elements have been reported. + */ + bool done() const { + return rnd_.done(); + } + + /** + * Return the total size occupued by the Descent driver and all its + * constituent parts. + */ + size_t totalSizeBytes() const { + return heap_.totalSizeBytes() + + rnd_.totalSizeBytes() + + offs_.totalSizeBytes() + + sas_.totalSizeBytes() + + gws_.totalSizeBytes(); + } + + /** + * Return the total capacity of the Descent driver and all its constituent + * parts. + */ + size_t totalCapacityBytes() const { + return heap_.totalCapacityBytes() + + rnd_.totalCapacityBytes() + + offs_.totalCapacityBytes() + + sas_.totalCapacityBytes() + + gws_.totalCapacityBytes(); + } + +protected: + + // This class's working heap. This might simply be a copy of the original + // heap, or it might be re-prioritized in some way. + EHeap<TDescentPair> heap_; + + Random1toN rnd_; + EList<TIndexOff, 16> offs_; + EList<SARangeWithOffs<EListSlice<TIndexOff, 16> > > sas_; + EList<GroupWalk2S<EListSlice<TIndexOff, 16>, 16> > gws_; + GroupWalkState gwstate_; +}; + #endif diff --git a/aligner_sw.cpp b/aligner_sw.cpp index ea3b178..1a8e77f 100644 --- a/aligner_sw.cpp +++ b/aligner_sw.cpp @@ -159,10 +159,10 @@ void SwAligner::initRef( const size_t rflen = (size_t)(rff - rfi); // Figure the number of Ns we're going to add to either side size_t leftNs = - (rfi >= 0 ? 0 : (size_t)std::abs(rfi)); + (rfi >= 0 ? 0 : (size_t)std::abs(static_cast<long>(rfi))); leftNs = min(leftNs, rflen); size_t rightNs = - (rff <= (TRefOff)reflen ? 0 : (size_t)std::abs(rff - (TRefOff)reflen)); + (rff <= (TRefOff)reflen ? 0 : (size_t)std::abs(static_cast<long>(rff - reflen))); rightNs = min(rightNs, rflen); // rflenInner = length of just the portion that doesn't overhang ref ends assert_geq(rflen, leftNs + rightNs); diff --git a/aligner_sw_driver.cpp b/aligner_sw_driver.cpp index 2d0c7b0..63965de 100644 --- a/aligner_sw_driver.cpp +++ b/aligner_sw_driver.cpp @@ -872,7 +872,7 @@ int SwDriver::extendSeeds( if(eeMode && eehits_[i].score < minsc) { return EXTEND_PERFECT_SCORE; } - bool small = satpos_[i].sat.size() < nsm; + bool is_small = satpos_[i].sat.size() < nsm; bool fw = satpos_[i].pos.fw; uint32_t rdoff = satpos_[i].pos.rdoff; uint32_t seedhitlen = satpos_[i].pos.seedlen; @@ -887,7 +887,7 @@ int SwDriver::extendSeeds( // range is large, just investigate one and move on - we might come // back to this range later. size_t riter = 0; - while(!rands_[i].done() && (first || small || eeMode)) { + while(!rands_[i].done() && (first || is_small || eeMode)) { assert(!gws_[i].done()); riter++; if(minsc == perfectScore) { @@ -1560,7 +1560,7 @@ int SwDriver::extendSeedsPaired( if(eeMode && eehits_[i].score < minsc) { return EXTEND_PERFECT_SCORE; } - bool small = satpos_[i].sat.size() < nsm; + bool is_small = satpos_[i].sat.size() < nsm; bool fw = satpos_[i].pos.fw; uint32_t rdoff = satpos_[i].pos.rdoff; uint32_t seedhitlen = satpos_[i].pos.seedlen; @@ -1574,7 +1574,7 @@ int SwDriver::extendSeedsPaired( // If the range is small, investigate all elements now. If the // range is large, just investigate one and move on - we might come // back to this range later. - while(!rands_[i].done() && (first || small || eeMode)) { + while(!rands_[i].done() && (first || is_small || eeMode)) { if(minsc == perfectScore) { if(!eeMode || eehits_[i].score < perfectScore) { return EXTEND_PERFECT_SCORE; @@ -31,11 +31,20 @@ use strict; use warnings; use Getopt::Long; -use FindBin qw($Bin); +use File::Spec; use POSIX; -my %signo = (); -my @signame = (); +my ($vol,$script_path,$prog) + = File::Spec->splitpath(File::Spec->rel2abs( __FILE__ )); +my $os_is_nix = ($^O eq "linux") || ($^O eq "darwin"); +my $align_bin = $os_is_nix ? 'bowtie2-align' : 'bowtie2-align.exe'; +my $build_bin = $os_is_nix ? 'bowtie2-build' : 'bowtie2-build.exe'; +my $align_prog = File::Spec->catpath($vol,$script_path,$align_bin); +my $build_prog = File::Spec->catpath($vol,$script_path,$build_bin); +my %signo = (); +my @signame = (); + + { # Get signal info use Config; @@ -47,14 +56,14 @@ my @signame = (); } } -(-x "$Bin/bowtie2-align.exe") || (-x "$Bin/bowtie2-align") || - die "Error: Expected bowtie2 to be in same directory with bowtie2-align:\n$Bin"; +(-x "$align_prog") || + die "Error: Expected bowtie2 to be in same directory with bowtie2-align:\n$script_path"; # Get description of arguments from Bowtie 2 so that we can distinguish Bowtie # 2 args from wrapper args sub getBt2Desc($) { my $d = shift; - my $cmd = "$Bin/bowtie2-align --wrapper basic-0 --arg-desc"; + my $cmd = "$align_prog --wrapper basic-0 --arg-desc"; open(my $fh, "$cmd |") || die "Failed to run command '$cmd'"; while(readline $fh) { chomp; @@ -175,7 +184,9 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) { # If the user asked us to redirect some reads to files, or to suppress # unaligned reads, then we need to capture the output from Bowtie 2 and pass it # through this wrapper. +my $passthru = 0; if(scalar(keys %read_fns) > 0 || $no_unal) { + $passthru = 1; push @bt2_args, "--passthrough"; $cap_out = "-"; for(my $i = 0; $i < scalar(@bt2_args); $i++) { @@ -335,7 +346,7 @@ if(defined($ref_str)) { print {$ofh} ">1\n$ref_str\n"; close($ofh); push @to_delete, $ofn; - system("$Bin/bowtie2-build $ofn $ofn") == 0 || + system("$build_bin $ofn $ofn") == 0 || die "Error: bowtie2-build returned non-0 exit level"; push @bt2_args, ("--index", "$ofn"); push @to_delete, ("$ofn.1.bt2", "$ofn.2.bt2", "$ofn.3.bt2", "$ofn.4.bt2", @@ -350,7 +361,7 @@ if($verbose) { my $debug_str = ($debug ? "-debug" : ""); # Construct command invoking bowtie2-align -my $cmd = "$Bin/bowtie2-align$debug_str --wrapper basic-0 ".join(" ", @bt2_args); +my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args); # Possibly add read input on an anonymous pipe $cmd = "$readpipe $cmd" if defined($readpipe); @@ -401,7 +412,6 @@ if(defined($cap_out)) { } } } - my $passthru = scalar(keys %read_fns) > 0; while(<BT>) { chomp; my $filt = 0; @@ -411,38 +421,44 @@ if(defined($cap_out)) { my $tab2_i = index($_, "\t", $tab1_i); my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i); my $unal = ($fl & 4) != 0; + $filt = 1 if $no_unal && $unal; if($passthru) { - my $mate1 = (($fl & 64) != 0); - my $mate2 = (($fl & 128) != 0); - my $unp = !$mate1 && !$mate2; - my $pair = !$unp; - # Next line is read with some whitespace escaped - my $l = <BT>; - chomp($l); - $l =~ s/%(..)/chr(hex($1))/eg; - if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) { - if($unal) { - # Failed to align - print {$read_fhs{un}} $l if defined($read_fhs{un}); - } else { - # Aligned - print {$read_fhs{al}} $l if defined($read_fhs{al}); + if($filt) { + # Next line is read with some whitespace escaped, which we + # ignore b/c the record is filtered out by --no-unal + my $l = <BT>; + } else { + my $mate1 = (($fl & 64) != 0); + my $mate2 = (($fl & 128) != 0); + my $unp = !$mate1 && !$mate2; + my $pair = !$unp; + # Next line is read with some whitespace escaped + my $l = <BT>; + chomp($l); + $l =~ s/%(..)/chr(hex($1))/eg; + if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) { + if($unal) { + # Failed to align + print {$read_fhs{un}} $l if defined($read_fhs{un}); + } else { + # Aligned + print {$read_fhs{al}} $l if defined($read_fhs{al}); + } } - } - if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) { - my $conc = (($fl & 2) != 0); - if ($conc && $mate1) { - print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"}); - } elsif($conc && $mate2) { - print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"}); - } elsif(!$conc && $mate1) { - print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"}); - } elsif(!$conc && $mate2) { - print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"}); + if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) { + my $conc = (($fl & 2) != 0); + if ($conc && $mate1) { + print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"}); + } elsif($conc && $mate2) { + print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"}); + } elsif(!$conc && $mate1) { + print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"}); + } elsif(!$conc && $mate2) { + print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"}); + } } } } - $filt = 1 if $no_unal && $unal; } print {$ofh} "$_\n" if !$filt; } @@ -466,6 +482,6 @@ if ($ret == -1) { printf STDERR "bowtie2-align died with signal %d (%s) $ad\n", ($ret & 127), $signm; exit 1; } elsif($ret != 0) { - printf STDERR "bowtie2-align exited with value %d\n", $ret >> 8; + printf STDERR "bowtie2-align exited with value %d\n", ($ret >> 8); } -exit $ret >> 8; +exit ($ret >> 8); @@ -2948,6 +2948,13 @@ public: return l_.empty(); } + /** + * Return element at offset i. + */ + const T& operator[](size_t i) const { + return l_[i]; + } + #ifndef NDEBUG /** * Check that heap property holds. diff --git a/ival_list.h b/ival_list.h index 354e585..0fc40a5 100644 --- a/ival_list.h +++ b/ival_list.h @@ -149,7 +149,7 @@ protected: Coord dn = std::max(sorted_[i-1].downstream(), sorted_[i].downstream()); sorted_[i].setUpstream(up); sorted_[i].setLength(dn.off() - up.off()); - sorted_[i-1].invalidate(); + sorted_[i-1].reset(); } } sorted_.sort(); @@ -157,7 +157,7 @@ protected: sorted_.resize(sorted_.size()-nmerged); #ifndef NDEBUG for(size_t i = 0; i < sorted_.size(); i++) { - assert(sorted_[i].valid()); + assert(sorted_[i].inited()); } #endif } @@ -514,9 +514,6 @@ bool VectorPatternSource::nextReadImpl( // Let Strings begin at the beginning of the respective bufs r.reset(); lock(); - readCnt_++; - rdid = readCnt_; - endid = readCnt_; if(cur_ >= v_.size()) { unlock(); // Clear all the Strings, as a signal to the caller that @@ -538,6 +535,8 @@ bool VectorPatternSource::nextReadImpl( r.name = os.str(); cur_++; done = cur_ == v_.size(); + rdid = endid = readCnt_; + readCnt_++; unlock(); success = true; return true; @@ -564,8 +563,6 @@ bool VectorPatternSource::nextReadPairImpl( cur_ <<= 1; } lock(); - readCnt_++; - rdid = endid = readCnt_; if(cur_ >= v_.size()-1) { unlock(); // Clear all the Strings, as a signal to the caller that @@ -595,6 +592,8 @@ bool VectorPatternSource::nextReadPairImpl( ra.color = rb.color = gColor; cur_++; done = cur_ >= v_.size()-1; + rdid = endid = readCnt_; + readCnt_++; unlock(); success = true; return true; @@ -696,8 +695,6 @@ bool FastaPatternSource::read( r.reset(); r.color = gColor; // Pick off the first carat - readCnt_++; - rdid = endid = readCnt_-1; c = fb_.get(); if(c < 0) { bail(r); success = false; done = true; return success; @@ -752,6 +749,8 @@ bool FastaPatternSource::read( // Empty sequences! cerr << "Warning: skipping empty FASTA read with name '" << r.name << "'" << endl; fb_.resetLastN(); + rdid = endid = readCnt_; + readCnt_++; success = true; done = false; return success; } assert_neq('>', c); @@ -803,6 +802,8 @@ bool FastaPatternSource::read( assert_gt(r.name.length(), 0); r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen()); fb_.resetLastN(); + rdid = endid = readCnt_; + readCnt_++; return success; } @@ -816,8 +817,6 @@ bool FastqPatternSource::read( { int c; int dstLen = 0; - readCnt_++; - rdid = endid = readCnt_-1; success = true; done = false; r.reset(); @@ -948,6 +947,8 @@ bool FastqPatternSource::read( assert_eq('@', pk); fb_.get(); fb_.resetLastN(); + rdid = endid = readCnt_; + readCnt_++; return success; } @@ -1108,6 +1109,8 @@ bool FastqPatternSource::read( } r.trimmed3 = gTrim3; r.trimmed5 = mytrim5; + rdid = endid = readCnt_; + readCnt_++; return success; } @@ -1121,8 +1124,6 @@ bool TabbedPatternSource::read( { r.reset(); r.color = gColor; - readCnt_++; - rdid = endid = readCnt_-1; success = true; done = false; // fb_ is about to dish out the first character of the @@ -1166,6 +1167,8 @@ bool TabbedPatternSource::read( assert_neq('\n', fb_.peek()); r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen()); fb_.resetLastN(); + rdid = endid = readCnt_; + readCnt_++; return true; } @@ -1179,8 +1182,6 @@ bool TabbedPatternSource::readPair( bool& done, bool& paired) { - readCnt_++; - rdid = endid = readCnt_-1; success = true; done = false; @@ -1242,6 +1243,8 @@ bool TabbedPatternSource::readPair( success = true; done = false; paired = false; + rdid = endid = readCnt_; + readCnt_++; return success; } paired = true; @@ -1292,6 +1295,8 @@ bool TabbedPatternSource::readPair( fb_.resetLastN(); rb.trimmed3 = gTrim3; rb.trimmed5 = mytrim5_2; + rdid = endid = readCnt_; + readCnt_++; return true; } diff --git a/ref_coord.h b/ref_coord.h index 4cd140e..14a55ea 100644 --- a/ref_coord.h +++ b/ref_coord.h @@ -36,7 +36,7 @@ class Coord { public: - Coord() { invalidate(); } + Coord() { reset(); } Coord(const Coord& c) { init(c); } @@ -64,8 +64,8 @@ public: * Return true iff this Coord is identical to the given Coord. */ bool operator==(const Coord& o) const { - assert(valid()); - assert(o.valid()); + assert(inited()); + assert(o.inited()); return ref_ == o.ref_ && off_ == o.off_ && fw() == o.fw(); } @@ -75,8 +75,6 @@ public: * less, or (c) its offset is less. */ bool operator<(const Coord& o) const { - //assert(valid()); - //assert(o.valid()); if(ref_ < o.ref_) return true; if(ref_ > o.ref_) return false; if(orient_ < o.orient_) return true; @@ -99,8 +97,6 @@ public: * orientation is greater, or (c) its offset is greater. */ bool operator>(const Coord& o) const { - //assert(valid()); - //assert(o.valid()); if(ref_ > o.ref_) return true; if(ref_ < o.ref_) return false; if(orient_ > o.orient_) return true; @@ -118,19 +114,19 @@ public: } /** - * Make this coord invalid. + * Reset this coord to uninitialized state. */ - void invalidate() { + void reset() { ref_ = std::numeric_limits<TRefId>::max(); off_ = std::numeric_limits<TRefOff>::max(); orient_ = -1; } /** - * Return true iff this Coord is valid (i.e. ref and off have both - * been set since the last call to invalidate()). + * Return true iff this Coord is initialized (i.e. ref and off have both + * been set since the last call to reset()). */ - bool valid() const { + bool inited() const { if(ref_ != std::numeric_limits<TRefId>::max() && off_ != std::numeric_limits<TRefOff>::max()) { @@ -144,7 +140,7 @@ public: * Get orientation of the Coord. */ bool fw() const { - assert(valid()); + assert(inited()); assert(orient_ == 0 || orient_ == 1); return orient_ == 1; } @@ -197,7 +193,7 @@ class Interval { public: - Interval() { invalidate(); } + Interval() { reset(); } explicit Interval(const Coord& upstream, TRefOff len) { init(upstream, len); @@ -232,18 +228,18 @@ public: } /** - * Make this coord invalid. + * Reset this interval to uninitialized state. */ - void invalidate() { - upstream_.invalidate(); + void reset() { + upstream_.reset(); len_ = 0; } /** - * Return true iff this Interval is valid. + * Return true iff this Interval is initialized. */ - bool valid() const { - if(upstream_.valid()) { + bool inited() const { + if(upstream_.inited()) { assert_gt(len_, 0); return true; } else { @@ -36,7 +36,7 @@ #if defined(__GNUC__) #if defined(__x86_64__) || defined(__i386__) -#ifdef TRY_SPINLOCK +#ifndef NO_TRY_SPINLOCK #define USE_SPINLOCK #endif #endif @@ -45,7 +45,7 @@ #ifdef USE_SPINLOCK #if defined(__x86_64__) -#define SPINLOCK_WORD long +#define SPINLOCK_WORD long long #else #define SPINLOCK_WORD int #endif @@ -231,7 +231,7 @@ public: } else { secbest = s.paired() ? s.secbestPaired().score() : s.secbest(mate1).score(); - TAlScore bestdiff = abs(abs(best)-abs(secbest)); + TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest))); if(bestdiff >= diff * (double)0.9f) { if(bestOver == diff) { ret = 39; @@ -341,7 +341,7 @@ public: } else { secbest = s.paired() ? s.secbestPaired().score() : s.secbest(mate1).score(); - TAlScore bestdiff = abs(abs(best)-abs(secbest)); + TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest))); if (bestdiff >= diff * (double)0.9f) ret = 40; else if(bestdiff >= diff * (double)0.8f) ret = 39; else if(bestdiff >= diff * (double)0.7f) ret = 38; @@ -464,7 +464,7 @@ public: } } else { secbest = s.secbest(mate1).score(); - TAlScore bestdiff = abs(abs(best)-abs(secbest)); + TAlScore bestdiff = abs(abs(static_cast<long>(best))-abs(static_cast<long>(secbest))); if(bestdiff >= diff * 0.1666 * 5) { ret = 6; } else if(bestdiff >= diff * 0.1666 * 4) { |