/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #include #include "scoring.h" using namespace std; /** * Return true iff a read of length 'rdlen' passes the score filter, i.e., * has enough characters to rise above the minimum score threshold. */ bool Scoring::scoreFilter( int64_t minsc, size_t rdlen) const { int64_t sc = (int64_t)(rdlen * match(30)); return sc >= minsc; } /** * Given the score floor for valid alignments and the length of the read, * calculate the maximum possible number of read gaps that could occur in a * valid alignment. */ int Scoring::maxReadGaps( int64_t minsc, size_t rdlen) const { // Score if all characters match. TODO: remove assumption that match bonus // is independent of quality value. int64_t sc = (int64_t)(rdlen * match(30)); assert_geq(sc, minsc); // Now convert matches to read gaps until sc calls below minsc bool first = true; int num = 0; while(sc >= minsc) { if(first) { first = false; // Subtract both penalties sc -= readGapOpen(); } else { // Subtract just the extension penalty sc -= readGapExtend(); } num++; } assert_gt(num, 0); return num-1; } /** * Given the score floor for valid alignments and the length of the read, * calculate the maximum possible number of reference gaps that could occur * in a valid alignment. */ int Scoring::maxRefGaps( int64_t minsc, size_t rdlen) const { // Score if all characters match. TODO: remove assumption that match bonus // is independent of quality value. int64_t sc = (int64_t)(rdlen * match(30)); assert_geq(sc, minsc); // Now convert matches to read gaps until sc calls below minsc bool first = true; int num = 0; while(sc >= minsc) { sc -= match(30); if(first) { first = false; // Subtract both penalties sc -= refGapOpen(); } else { // Subtract just the extension penalty sc -= refGapExtend(); } num++; } assert_gt(num, 0); return num-1; } /** * Given a read sequence, return true iff the read passes the N filter. * The N filter rejects reads with more than the number of Ns. */ bool Scoring::nFilter(const BTDnaString& rd, size_t& ns) const { size_t rdlen = rd.length(); size_t maxns = nCeil.f((double)rdlen); assert_geq(rd.length(), 0); for(size_t i = 0; i < rdlen; i++) { if(rd[i] == 4) { ns++; if(ns > maxns) { return false; // doesn't pass } } } return true; // passes } /** * Given a read sequence, return true iff the read passes the N filter. * The N filter rejects reads with more than the number of Ns. * * For paired-end reads, there is a question of how to apply the filter. * The filter could be applied to both mates separately, which might then * prevent paired-end alignment. Or the filter could be applied to the * reads as though they're concatenated together. The latter approach has * pros and cons. The pro is that we can use paired-end information to * recover alignments for mates that would not have passed the N filter on * their own. The con is that we might not want to do that, since the * non-N portion of the bad mate might contain particularly unreliable * information. */ void Scoring::nFilterPair( const BTDnaString* rd1, // mate 1 const BTDnaString* rd2, // mate 2 size_t& ns1, // # Ns in mate 1 size_t& ns2, // # Ns in mate 2 bool& filt1, // true -> mate 1 rejected by filter bool& filt2) // true -> mate 2 rejected by filter const { // Both fail to pass by default filt1 = filt2 = false; if(rd1 != NULL && rd2 != NULL && ncatpair) { size_t rdlen1 = rd1->length(); size_t rdlen2 = rd2->length(); size_t maxns = nCeil.f((double)(rdlen1 + rdlen2)); for(size_t i = 0; i < rdlen1; i++) { if((*rd1)[i] == 4) ns1++; if(ns1 > maxns) { // doesn't pass return; } } for(size_t i = 0; i < rdlen2; i++) { if((*rd2)[i] == 4) ns2++; if(ns2 > maxns) { // doesn't pass return; } } // Both pass filt1 = filt2 = true; } else { if(rd1 != NULL) filt1 = nFilter(*rd1, ns1); if(rd2 != NULL) filt2 = nFilter(*rd2, ns2); } } #ifdef SCORING_MAIN int main() { { cout << "Case 1: Simple 1 ... "; Scoring sc = Scoring::base1(); assert_eq(COST_MODEL_CONSTANT, sc.matchType); assert_eq(0, sc.maxRefGaps(0, 10)); // 10 - 1 - 15 = -6 assert_eq(0, sc.maxRefGaps(0, 11)); // 11 - 1 - 15 = -5 assert_eq(0, sc.maxRefGaps(0, 12)); // 12 - 1 - 15 = -4 assert_eq(0, sc.maxRefGaps(0, 13)); // 13 - 1 - 15 = -3 assert_eq(0, sc.maxRefGaps(0, 14)); // 14 - 1 - 15 = -2 assert_eq(0, sc.maxRefGaps(0, 15)); // 15 - 1 - 15 = -1 assert_eq(1, sc.maxRefGaps(0, 16)); // 16 - 1 - 15 = 0 assert_eq(1, sc.maxRefGaps(0, 17)); // 17 - 2 - 19 = -4 assert_eq(1, sc.maxRefGaps(0, 18)); // 18 - 2 - 19 = -3 assert_eq(1, sc.maxRefGaps(0, 19)); // 19 - 2 - 19 = -2 assert_eq(1, sc.maxRefGaps(0, 20)); // 20 - 2 - 19 = -1 assert_eq(2, sc.maxRefGaps(0, 21)); // 21 - 2 - 19 = 0 assert_eq(0, sc.maxReadGaps(0, 10)); // 10 - 0 - 15 = -5 assert_eq(0, sc.maxReadGaps(0, 11)); // 11 - 0 - 15 = -4 assert_eq(0, sc.maxReadGaps(0, 12)); // 12 - 0 - 15 = -3 assert_eq(0, sc.maxReadGaps(0, 13)); // 13 - 0 - 15 = -2 assert_eq(0, sc.maxReadGaps(0, 14)); // 14 - 0 - 15 = -1 assert_eq(1, sc.maxReadGaps(0, 15)); // 15 - 0 - 15 = 0 assert_eq(1, sc.maxReadGaps(0, 16)); // 16 - 0 - 19 = -3 assert_eq(1, sc.maxReadGaps(0, 17)); // 17 - 0 - 19 = -2 assert_eq(1, sc.maxReadGaps(0, 18)); // 18 - 0 - 19 = -1 assert_eq(2, sc.maxReadGaps(0, 19)); // 19 - 0 - 19 = 0 assert_eq(2, sc.maxReadGaps(0, 20)); // 20 - 0 - 23 = -3 assert_eq(2, sc.maxReadGaps(0, 21)); // 21 - 0 - 23 = -2 // N ceiling: const=2, linear=0.1 assert_eq(1, sc.nCeil(1)); assert_eq(2, sc.nCeil(3)); assert_eq(2, sc.nCeil(5)); assert_eq(2, sc.nCeil(7)); assert_eq(2, sc.nCeil(9)); assert_eq(3, sc.nCeil(10)); for(int i = 0; i < 30; i++) { assert_eq(3, sc.n(i)); assert_eq(3, sc.mm(i)); } assert_eq(5, sc.gapbar); cout << "PASSED" << endl; } { cout << "Case 2: Simple 2 ... "; Scoring sc( 4, // reward for a match COST_MODEL_QUAL, // how to penalize mismatches 0, // constant if mm pelanty is a constant -3.0f, // constant coeff for minimum score -3.0f, // linear coeff for minimum score DEFAULT_FLOOR_CONST, // constant coeff for score floor DEFAULT_FLOOR_LINEAR, // linear coeff for score floor 3.0f, // max # ref Ns allowed in alignment; const coeff 0.4f, // max # ref Ns allowed in alignment; linear coeff COST_MODEL_QUAL, // how to penalize Ns in the read 0, // constant if N pelanty is a constant true, // whether to concatenate mates before N filtering 25, // constant coeff for cost of gap in the read 25, // constant coeff for cost of gap in the ref 10, // coeff of linear term for cost of gap in read 10, // coeff of linear term for cost of gap in ref 5, // 5 rows @ top/bot diagonal-entrance-only -1, // no restriction on row false // score prioritized over row ); assert_eq(COST_MODEL_CONSTANT, sc.matchType); assert_eq(4, sc.matchConst); assert_eq(COST_MODEL_QUAL, sc.mmcostType); assert_eq(COST_MODEL_QUAL, sc.npenType); assert_eq(0, sc.maxRefGaps(0, 8)); // 32 - 4 - 35 = -7 assert_eq(0, sc.maxRefGaps(0, 9)); // 36 - 4 - 35 = -3 assert_eq(1, sc.maxRefGaps(0, 10)); // 40 - 4 - 35 = 1 assert_eq(1, sc.maxRefGaps(0, 11)); // 44 - 8 - 45 = -9 assert_eq(1, sc.maxRefGaps(0, 12)); // 48 - 8 - 45 = -5 assert_eq(1, sc.maxRefGaps(0, 13)); // 52 - 8 - 45 = -1 assert_eq(2, sc.maxRefGaps(0, 14)); // 56 - 8 - 45 = 3 assert_eq(0, sc.maxReadGaps(0, 8)); // 32 - 0 - 35 = -3 assert_eq(1, sc.maxReadGaps(0, 9)); // 36 - 0 - 35 = 1 assert_eq(1, sc.maxReadGaps(0, 10)); // 40 - 0 - 45 = -5 assert_eq(1, sc.maxReadGaps(0, 11)); // 44 - 0 - 45 = -1 assert_eq(2, sc.maxReadGaps(0, 12)); // 48 - 0 - 45 = 3 assert_eq(2, sc.maxReadGaps(0, 13)); // 52 - 0 - 55 = -3 assert_eq(3, sc.maxReadGaps(0, 14)); // 56 - 0 - 55 = 1 // N ceiling: const=3, linear=0.4 assert_eq(1, sc.nCeil(1)); assert_eq(2, sc.nCeil(2)); assert_eq(3, sc.nCeil(3)); assert_eq(4, sc.nCeil(4)); assert_eq(5, sc.nCeil(5)); assert_eq(5, sc.nCeil(6)); assert_eq(5, sc.nCeil(7)); assert_eq(6, sc.nCeil(8)); assert_eq(6, sc.nCeil(9)); for(int i = 0; i < 256; i++) { assert_eq(i, sc.n(i)); assert_eq(i, sc.mm(i)); } assert_eq(5, sc.gapbar); cout << "PASSED" << endl; } } #endif /*def SCORING_MAIN*/