diff options
author | Alexandre Mestiashvili <alex@biotec.tu-dresden.de> | 2012-02-21 15:13:07 +0000 |
---|---|---|
committer | Alexandre Mestiashvili <alex@biotec.tu-dresden.de> | 2012-02-21 15:13:07 +0000 |
commit | 4b732fc3aa3fa95ee87904aff06325a38a804fb6 (patch) | |
tree | b6c5b60a298acbd0c4b5e04a41f8958ccfce6b73 /aligner_metrics.h |
Imported Upstream version 2.0.0-beta5
Diffstat (limited to 'aligner_metrics.h')
-rw-r--r-- | aligner_metrics.h | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/aligner_metrics.h b/aligner_metrics.h new file mode 100644 index 0000000..9bd8bb7 --- /dev/null +++ b/aligner_metrics.h @@ -0,0 +1,352 @@ +/* + * Copyright 2011, Ben Langmead <blangmea@jhsph.edu> + * + * This file is part of Bowtie 2. + * + * Bowtie 2 is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Bowtie 2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef ALIGNER_METRICS_H_ +#define ALIGNER_METRICS_H_ + +#include <math.h> +#include <iostream> +#include "alphabet.h" +#include "timer.h" +#include "sstring.h" + +using namespace std; + +/** + * Borrowed from http://www.johndcook.com/standard_deviation.html, + * which in turn is borrowed from Knuth. + */ +class RunningStat { +public: + RunningStat() : m_n(0), m_tot(0.0) { } + + void clear() { + m_n = 0; + m_tot = 0.0; + } + + void push(float x) { + m_n++; + m_tot += x; + // See Knuth TAOCP vol 2, 3rd edition, page 232 + if (m_n == 1) { + m_oldM = m_newM = x; + m_oldS = 0.0; + } else { + m_newM = m_oldM + (x - m_oldM)/m_n; + m_newS = m_oldS + (x - m_oldM)*(x - m_newM); + // set up for next iteration + m_oldM = m_newM; + m_oldS = m_newS; + } + } + + int num() const { + return m_n; + } + + double tot() const { + return m_tot; + } + + double mean() const { + return (m_n > 0) ? m_newM : 0.0; + } + + double variance() const { + return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 ); + } + + double stddev() const { + return sqrt(variance()); + } + +private: + int m_n; + double m_tot; + double m_oldM, m_newM, m_oldS, m_newS; +}; + +/** + * Encapsulates a set of metrics that we would like an aligner to keep + * track of, so that we can possibly use it to diagnose performance + * issues. + */ +class AlignerMetrics { + +public: + + AlignerMetrics() : + curBacktracks_(0), + curBwtOps_(0), + first_(true), + curIsLowEntropy_(false), + curIsHomoPoly_(false), + curHadRanges_(false), + curNumNs_(0), + reads_(0), + homoReads_(0), + lowEntReads_(0), + hiEntReads_(0), + alignedReads_(0), + unalignedReads_(0), + threeOrMoreNReads_(0), + lessThanThreeNRreads_(0), + bwtOpsPerRead_(), + backtracksPerRead_(), + bwtOpsPerHomoRead_(), + backtracksPerHomoRead_(), + bwtOpsPerLoEntRead_(), + backtracksPerLoEntRead_(), + bwtOpsPerHiEntRead_(), + backtracksPerHiEntRead_(), + bwtOpsPerAlignedRead_(), + backtracksPerAlignedRead_(), + bwtOpsPerUnalignedRead_(), + backtracksPerUnalignedRead_(), + bwtOpsPer0nRead_(), + backtracksPer0nRead_(), + bwtOpsPer1nRead_(), + backtracksPer1nRead_(), + bwtOpsPer2nRead_(), + backtracksPer2nRead_(), + bwtOpsPer3orMoreNRead_(), + backtracksPer3orMoreNRead_(), + timer_(cout, "", false) + { } + + void printSummary() { + if(!first_) { + finishRead(); + } + cout << "AlignerMetrics:" << endl; + cout << " # Reads: " << reads_ << endl; + float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0f); + hopct *= 100.0f; + cout << " % homo-polymeric: " << (hopct) << endl; + float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0f); + lopct *= 100.0f; + cout << " % low-entropy: " << (lopct) << endl; + float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0f); + unpct *= 100.0f; + cout << " % unaligned: " << (unpct) << endl; + float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0f); + npct *= 100.0f; + cout << " % with 3 or more Ns: " << (npct) << endl; + cout << endl; + cout << " Total BWT ops: avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl; + cout << " Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl; + time_t elapsed = timer_.elapsed(); + cout << " BWT ops per second: " << (bwtOpsPerRead_.tot()/elapsed) << endl; + cout << " Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl; + cout << endl; + cout << " Homo-poly:" << endl; + cout << " BWT ops: avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl; + cout << " Low-entropy:" << endl; + cout << " BWT ops: avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl; + cout << " High-entropy:" << endl; + cout << " BWT ops: avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl; + cout << endl; + cout << " Unaligned:" << endl; + cout << " BWT ops: avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl; + cout << " Aligned:" << endl; + cout << " BWT ops: avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl; + cout << endl; + cout << " 0 Ns:" << endl; + cout << " BWT ops: avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl; + cout << " 1 N:" << endl; + cout << " BWT ops: avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl; + cout << " 2 Ns:" << endl; + cout << " BWT ops: avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl; + cout << " >2 Ns:" << endl; + cout << " BWT ops: avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl; + cout << " Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl; + cout << endl; + } + + /** + * + */ + void nextRead(const BTDnaString& read) { + if(!first_) { + finishRead(); + } + first_ = false; + //float ent = entropyDna5(read); + float ent = 0.0f; + curIsLowEntropy_ = (ent < 0.75f); + curIsHomoPoly_ = (ent < 0.001f); + curHadRanges_ = false; + curBwtOps_ = 0; + curBacktracks_ = 0; + // Count Ns + curNumNs_ = 0; + const size_t len = read.length(); + for(size_t i = 0; i < len; i++) { + if((int)read[i] == 4) curNumNs_++; + } + } + + /** + * + */ + void setReadHasRange() { + curHadRanges_ = true; + } + + /** + * Commit the running statistics for this read to + */ + void finishRead() { + reads_++; + if(curIsHomoPoly_) homoReads_++; + else if(curIsLowEntropy_) lowEntReads_++; + else hiEntReads_++; + if(curHadRanges_) alignedReads_++; + else unalignedReads_++; + bwtOpsPerRead_.push((float)curBwtOps_); + backtracksPerRead_.push((float)curBacktracks_); + // Drill down by entropy + if(curIsHomoPoly_) { + bwtOpsPerHomoRead_.push((float)curBwtOps_); + backtracksPerHomoRead_.push((float)curBacktracks_); + } else if(curIsLowEntropy_) { + bwtOpsPerLoEntRead_.push((float)curBwtOps_); + backtracksPerLoEntRead_.push((float)curBacktracks_); + } else { + bwtOpsPerHiEntRead_.push((float)curBwtOps_); + backtracksPerHiEntRead_.push((float)curBacktracks_); + } + // Drill down by whether it aligned + if(curHadRanges_) { + bwtOpsPerAlignedRead_.push((float)curBwtOps_); + backtracksPerAlignedRead_.push((float)curBacktracks_); + } else { + bwtOpsPerUnalignedRead_.push((float)curBwtOps_); + backtracksPerUnalignedRead_.push((float)curBacktracks_); + } + if(curNumNs_ == 0) { + lessThanThreeNRreads_++; + bwtOpsPer0nRead_.push((float)curBwtOps_); + backtracksPer0nRead_.push((float)curBacktracks_); + } else if(curNumNs_ == 1) { + lessThanThreeNRreads_++; + bwtOpsPer1nRead_.push((float)curBwtOps_); + backtracksPer1nRead_.push((float)curBacktracks_); + } else if(curNumNs_ == 2) { + lessThanThreeNRreads_++; + bwtOpsPer2nRead_.push((float)curBwtOps_); + backtracksPer2nRead_.push((float)curBacktracks_); + } else { + threeOrMoreNReads_++; + bwtOpsPer3orMoreNRead_.push((float)curBwtOps_); + backtracksPer3orMoreNRead_.push((float)curBacktracks_); + } + } + + // Running-total of the number of backtracks and BWT ops for the + // current read + uint32_t curBacktracks_; + uint32_t curBwtOps_; + +protected: + + bool first_; + + // true iff the current read is low entropy + bool curIsLowEntropy_; + // true if current read is all 1 char (or very close) + bool curIsHomoPoly_; + // true iff the current read has had one or more ranges reported + bool curHadRanges_; + // number of Ns in current read + int curNumNs_; + + // # reads + uint32_t reads_; + // # homo-poly reads + uint32_t homoReads_; + // # low-entropy reads + uint32_t lowEntReads_; + // # high-entropy reads + uint32_t hiEntReads_; + // # reads with alignments + uint32_t alignedReads_; + // # reads without alignments + uint32_t unalignedReads_; + // # reads with 3 or more Ns + uint32_t threeOrMoreNReads_; + // # reads with < 3 Ns + uint32_t lessThanThreeNRreads_; + + // Distribution of BWT operations per read + RunningStat bwtOpsPerRead_; + RunningStat backtracksPerRead_; + + // Distribution of BWT operations per homo-poly read + RunningStat bwtOpsPerHomoRead_; + RunningStat backtracksPerHomoRead_; + + // Distribution of BWT operations per low-entropy read + RunningStat bwtOpsPerLoEntRead_; + RunningStat backtracksPerLoEntRead_; + + // Distribution of BWT operations per high-entropy read + RunningStat bwtOpsPerHiEntRead_; + RunningStat backtracksPerHiEntRead_; + + // Distribution of BWT operations per read that "aligned" (for + // which a range was arrived at - range may not have necessarily + // lead to an alignment) + RunningStat bwtOpsPerAlignedRead_; + RunningStat backtracksPerAlignedRead_; + + // Distribution of BWT operations per read that didn't align + RunningStat bwtOpsPerUnalignedRead_; + RunningStat backtracksPerUnalignedRead_; + + // Distribution of BWT operations/backtracks per read with no Ns + RunningStat bwtOpsPer0nRead_; + RunningStat backtracksPer0nRead_; + + // Distribution of BWT operations/backtracks per read with one N + RunningStat bwtOpsPer1nRead_; + RunningStat backtracksPer1nRead_; + + // Distribution of BWT operations/backtracks per read with two Ns + RunningStat bwtOpsPer2nRead_; + RunningStat backtracksPer2nRead_; + + // Distribution of BWT operations/backtracks per read with three or + // more Ns + RunningStat bwtOpsPer3orMoreNRead_; + RunningStat backtracksPer3orMoreNRead_; + + Timer timer_; +}; + +#endif /* ALIGNER_METRICS_H_ */ |