diff options
author | Aaron M. Ucko <ucko@debian.org> | 2005-12-08 22:58:31 +0000 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2005-12-08 22:58:31 +0000 |
commit | 4b1edc60532e16ec7d0255e1c3552c2e6a33737e (patch) | |
tree | acef5e2808d7dbc8a9c41da8ef71b1436f51718e /algo | |
parent | 23f1d44b259776e399f7fe0fe883c7e9445e192b (diff) |
Load /tmp/.../ncbi-tools6-6.1.20051206 into
branches/upstream/current.
Diffstat (limited to 'algo')
95 files changed, 9022 insertions, 4963 deletions
diff --git a/algo/blast/api/blast_api.c b/algo/blast/api/blast_api.c index d2792239..1dc5e16e 100644 --- a/algo/blast/api/blast_api.c +++ b/algo/blast/api/blast_api.c @@ -1,4 +1,4 @@ -/* $Id: blast_api.c,v 1.18 2005/08/22 19:24:02 madden Exp $ +/* $Id: blast_api.c,v 1.20 2005/09/19 15:40:03 camacho Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -429,7 +429,6 @@ Blast_RunSearch(SeqLoc* query_seqloc, BlastTabularFormatData* tf_data, BlastHSPResults **results, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns) { Int2 status = 0; @@ -439,6 +438,7 @@ Blast_RunSearch(SeqLoc* query_seqloc, BlastSeqLoc* lookup_segments = NULL; BlastScoreBlk* sbp = NULL; LookupTableWrap* lookup_wrap = NULL; + BlastMaskLoc* mask_loc = NULL; BlastHSPStream* hsp_stream = NULL; const EBlastProgramType kProgram = options->program; const Boolean kRpsBlast = @@ -447,7 +447,6 @@ Blast_RunSearch(SeqLoc* query_seqloc, BlastRPSInfo* rps_info = NULL; Nlm_MemMapPtr rps_mmap = NULL; Nlm_MemMapPtr rps_pssm_mmap = NULL; - BlastMaskInformation mask_info; const QuerySetUpOptions* query_options = options->query_options; const LookupTableOptions* lookup_options = options->lookup_options; const BlastScoringOptions* score_options = options->score_options; @@ -468,12 +467,9 @@ Blast_RunSearch(SeqLoc* query_seqloc, if (options->program == eBlastTypeBlastn) { SeqLoc* dust_mask = NULL; /* Dust mask locations */ - Blast_FindDustSeqLoc(query_seqloc, options, &dust_mask); - /* Combine dust mask with lower case mask */ - if (dust_mask) - masking_locs = ValNodeLink(&masking_locs, dust_mask); + ValNodeLink(&masking_locs, dust_mask); } if (kRpsBlast) { @@ -495,22 +491,17 @@ Blast_RunSearch(SeqLoc* query_seqloc, } status = - BLAST_MainSetUp(kProgram, query_options, score_options, - hit_options, query, query_info, scale_factor, - &lookup_segments, &mask_info, &sbp, - &extra_returns->error); - - if (mask_at_hash) - *mask_at_hash = 
mask_info.mask_at_hash; + BLAST_MainSetUp(kProgram, query_options, score_options, query, + query_info, scale_factor, &lookup_segments, &mask_loc, + &sbp, &extra_returns->error); if (filter_out) { *filter_out = - BlastMaskLocToSeqLoc(kProgram, mask_info.filter_slp, - query_seqloc); + BlastMaskLocToSeqLoc(kProgram, mask_loc, query_seqloc); } /* Mask locations in BlastMaskLoc form are no longer needed. */ - BlastMaskLocFree(mask_info.filter_slp); + BlastMaskLocFree(mask_loc); if (status) return status; @@ -559,7 +550,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, BlastTabularFormatData* tf_data, SeqAlign **seqalign_out, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns) { BlastSeqSrc *seq_src = NULL; @@ -597,7 +587,7 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, status = Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data, - &results, filter_out, mask_at_hash, extra_returns); + &results, filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ @@ -709,7 +699,7 @@ PHIBlastRunSearch(SeqLoc* query_seqloc, char* db_name, SeqLoc* masking_locs, PHI BLAST, so pass NULL in corresponding arguments. 
*/ status = Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, NULL, - &results, filter_out, NULL, extra_returns); + &results, filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ @@ -738,7 +728,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, BlastTabularFormatData* tf_data, SeqAlign **seqalign_out, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns) { BlastSeqSrc *seq_src = NULL; @@ -766,7 +755,7 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, status = Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data, - &results, filter_out, mask_at_hash, extra_returns); + &results, filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ diff --git a/algo/blast/api/blast_api.h b/algo/blast/api/blast_api.h index 8e76ee8c..47d9b9d6 100644 --- a/algo/blast/api/blast_api.h +++ b/algo/blast/api/blast_api.h @@ -1,4 +1,4 @@ -/* $Id: blast_api.h,v 1.4 2005/04/27 19:59:26 dondosha Exp $ +/* $Id: blast_api.h,v 1.5 2005/08/29 14:44:19 camacho Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -61,8 +61,6 @@ extern "C" { * @param tf_data Structure to use for on-the-fly tabular formatting [in] * @param seqalign_out All results in Seq-align form. [out] * @param filter_out Filtering locations [out] - * @param mask_at_hash Was filtering performed only for lookup table, but not - * for extension? 
[out] * @param extra_returns Additional information about the search [out] */ Int2 @@ -72,7 +70,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, BlastTabularFormatData* tf_data, SeqAlign **seqalign_out, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns); /** Compares a list of SeqLoc's against another list of SeqLoc's, @@ -84,8 +81,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, * @param tf_data Structure to use for on-the-fly tabular formatting [in] * @param seqalign_out All results in Seq-align form. [out] * @param filter_out Filtering locations [out] - * @param mask_at_hash Was filtering performed only for lookup table, but not - * for extension? [out] * @param extra_returns Additional information about the search [out] */ Int2 @@ -96,7 +91,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, BlastTabularFormatData* tf_data, SeqAlign **seqalign_out, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns); /** Compare a list of query SeqLoc's against a source of subject sequences. @@ -108,7 +102,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, * @param results Search results [out] * @param filter_out Query locations that were masked (filtered) during the * search [out] - * @param mask_at_hash Was masking performed only for in the lookup table? [out] * @param extra_returns Additional search statistits [out] * @return 0 on success, -1 on failure. */ @@ -120,7 +113,6 @@ Blast_RunSearch(SeqLoc* query_seqloc, BlastTabularFormatData* tf_data, BlastHSPResults **results, SeqLoc** filter_out, - Boolean* mask_at_hash, Blast_SummaryReturn* extra_returns); /** Run a PHI BLAST search for a query SeqLoc against a database. Return results @@ -135,8 +127,6 @@ Blast_RunSearch(SeqLoc* query_seqloc, * ValNode data points to a Seq-align. [out] * @param filter_out Query locations that were masked (filtered) during the * search. 
[out] - * NB: masking at hash is not applicable to PHI BLAST, - * so there is no mask_at_hash output argument. * @param extra_returns Additional search statistits [out] * @return 0 on success, -1 on failure. */ diff --git a/algo/blast/api/blast_format.c b/algo/blast/api/blast_format.c index b38f2f9c..02c32543 100644 --- a/algo/blast/api/blast_format.c +++ b/algo/blast/api/blast_format.c @@ -1,4 +1,4 @@ -/* $Id: blast_format.c,v 1.95 2005/08/08 15:50:20 dondosha Exp $ +/* $Id: blast_format.c,v 1.96 2005/11/22 13:31:05 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -31,7 +31,7 @@ * Formatting of BLAST results (SeqAlign) */ -static char const rcsid[] = "$Id: blast_format.c,v 1.95 2005/08/08 15:50:20 dondosha Exp $"; +static char const rcsid[] = "$Id: blast_format.c,v 1.96 2005/11/22 13:31:05 madden Exp $"; #include <algo/blast/api/blast_format.h> #include <algo/blast/api/blast_seq.h> @@ -1182,7 +1182,7 @@ Blast_SeqIdGetDefLine(SeqId* sip, char** buffer_ptr, Boolean ncbi_gi, Boolean accession_only) { char* seqid_buffer = NULL; - Int4 gi; + Int4 gi = 0; Boolean numeric_id_type = FALSE; *buffer_ptr = NULL; diff --git a/algo/blast/api/blast_options_api.c b/algo/blast/api/blast_options_api.c index 715f152a..7eb9a47b 100644 --- a/algo/blast/api/blast_options_api.c +++ b/algo/blast/api/blast_options_api.c @@ -1,4 +1,4 @@ -/* $Id: blast_options_api.c,v 1.9 2005/08/08 15:48:22 dondosha Exp $ +/* $Id: blast_options_api.c,v 1.15 2005/10/31 14:14:29 madden Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -145,6 +145,52 @@ Int2 SBlastOptionsSetWordSize(SBlastOptions* options, Int4 word_size) return -1; } +Int2 SBlastOptionsSetThreshold(SBlastOptions* options, Int4 threshold) +{ + + if (!options || !options->lookup_options || !options->score_options) + return -1; + + if (threshold < 0) + return -2; + + if (Blast_QueryIsNucleotide(options->program) == 
TRUE && Blast_QueryIsTranslated(options->program) == FALSE) + return 0; + + if (threshold == 0) + { + Int2 status=0; + if ((status=BLAST_GetSuggestedThreshold(options->program, options->score_options->matrix, &threshold)) != 0) + return status; + } + + options->lookup_options->threshold = threshold; + + return 0; +} + +Int2 SBlastOptionsSetWindowSize(SBlastOptions* options, Int4 window_size) +{ + + if (!options || !options->score_options || !options->word_options) + return -1; + + if (window_size < 0) + return -2; + + if (Blast_QueryIsNucleotide(options->program) == TRUE && Blast_QueryIsTranslated(options->program) == FALSE) + return 0; + + if (window_size == 0) + { + Int2 status=0; + if ((status=BLAST_GetSuggestedWindowSize(options->program, options->score_options->matrix, &window_size)) != 0) + return status; + } + + options->word_options->window_size = window_size; +} + Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length, Int4 template_type) { @@ -167,9 +213,59 @@ Int2 SBlastOptionsSetMatrixAndGapCosts(SBlastOptions* options, if (!matrix_name || !options || !options->score_options) return -1; + /* Reward penalty do not apply to blastn. 
*/ + if (options->program == eBlastTypeBlastn) + return 0; + status = BlastScoringOptionsSetMatrix(options->score_options, matrix_name); + if (status != 0) + return status; + + if (gap_open < 0 || gap_extend < 0) + { + Int4 gap_open_priv = 0; + Int4 gap_extend_priv = 0; + + BLAST_GetProteinGapExistenceExtendParams(matrix_name, &gap_open_priv, &gap_extend_priv); + if (gap_open < 0) + gap_open = gap_open_priv; + if (gap_extend < 0) + gap_extend = gap_extend_priv; + } + + options->score_options->gap_open = gap_open; + options->score_options->gap_extend = gap_extend; + + return status; +} + +Int2 SBlastOptionsSetRewardPenaltyAndGapCosts(SBlastOptions* options, + Int4 reward, Int4 penalty, + Int4 gap_open, Int4 gap_extend) +{ + Int2 status = 0; + + if (reward <= 0 || penalty >= 0 || !options || !options->score_options) + return -1; + + /* Reward penalty only apply to blastn. */ + if (options->program != eBlastTypeBlastn) + return 0; + + if (gap_open < 0 || gap_extend < 0) + { + Int4 gap_open_priv = BLAST_GAP_OPEN_NUCL; + Int4 gap_extend_priv = BLAST_GAP_EXTN_NUCL; + + BLAST_GetNucleotideGapExistenceExtendParams(reward, penalty, &gap_open_priv, &gap_extend_priv); + if (gap_open < 0) + gap_open = gap_open_priv; + if (gap_extend < 0) + gap_extend = gap_extend_priv; + } + options->score_options->gap_open = gap_open; options->score_options->gap_extend = gap_extend; @@ -216,5 +312,13 @@ Int2 SBlastOptionsSetDbGeneticCode(SBlastOptions* options, Int4 gc) } +Boolean SBlastOptionsGetMaskAtHash(const SBlastOptions* options) +{ + ASSERT(options && options->query_options && + options->query_options->filtering_options); + + return options->query_options->filtering_options->mask_at_hash; +} + /* @} */ diff --git a/algo/blast/api/blast_options_api.h b/algo/blast/api/blast_options_api.h index 8121518e..a60b6247 100644 --- a/algo/blast/api/blast_options_api.h +++ b/algo/blast/api/blast_options_api.h @@ -1,4 +1,4 @@ -/* $Id: blast_options_api.h,v 1.3 2005/04/06 23:27:53 dondosha Exp $ 
+/* $Id: blast_options_api.h,v 1.7 2005/10/31 14:14:29 madden Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -105,15 +105,50 @@ Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length, Int4 template_type); /** Reset matrix name and gap costs to new values. + * * @param options Options structure to update. [in] [out] * @param matrix_name New matrix name [in] - * @param gap_open New gap opening cost [in] - * @param gap_extend New gap extension cost [in] + * @param gap_open New gap existence cost. If zero default for matrix is used. [in] + * @param gap_extend New gap extension cost. If zero default for matrix is used. [in] */ Int2 SBlastOptionsSetMatrixAndGapCosts(SBlastOptions* options, const char* matrix_name, + Int4 gap_open, + Int4 gap_extend); + + +/** Reset rewared, penalty and gap costs to new values. + * Will suggest and use conservative values if gap_open and gap_extend are zero + * and suggest is TRUE. + * + * @param options Options structure to update. [in] [out] + * @param reward match score [in] + * @param penalty mismatch score [in] + * @param gap_open New gap existence cost. If zero default for reward/penalty is used. [in] + * @param gap_extend New gap extension cost. If zero default for reward/penalty is used. [in] + */ +Int2 SBlastOptionsSetRewardPenaltyAndGapCosts(SBlastOptions* options, + Int4 reward, Int4 penalty, Int4 gap_open, Int4 gap_extend); +/** Set threshold value. + * @param options options Options structure to update. [in] [out] + * @param threshold New value to set, if zero default value for matrix + * will be used. [in] + * @param zero unless error (e.g., threshold is < zero) + */ +Int2 SBlastOptionsSetThreshold(SBlastOptions* options, + Int4 threshold); + +/** Set window size for two hit extension. + * @param options options Options structure to update. 
[in] [out] + * @param window_size New value to set, if zero default value for matrix + * will be used. [in] + * @param zero unless error (e.g., window_size is < zero) + */ +Int2 SBlastOptionsSetWindowSize(SBlastOptions* options, + Int4 window_size); + /** Reset database (subject) genetic code option to a new value. * @param options Options structure to update. [in] [out] * @param gc New genetic code value. [in] @@ -128,6 +163,12 @@ Int2 SBlastOptionsSetDbGeneticCode(SBlastOptions* options, Int4 gc); */ Int2 SBlastOptionsSetFilterString(SBlastOptions* options, const char* str); +/** Returns the mask-at-hash option value. + * @param options The options structure [in] + * @return Boolean value of the masking at hash option. + */ +Boolean SBlastOptionsGetMaskAtHash(const SBlastOptions* options); + /* @} */ #ifdef __cplusplus diff --git a/algo/blast/api/blast_seq.c b/algo/blast/api/blast_seq.c index b3e63b43..a82051b3 100644 --- a/algo/blast/api/blast_seq.c +++ b/algo/blast/api/blast_seq.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: blast_seq.c,v 1.70 2005/07/27 12:38:18 madden Exp $"; +static char const rcsid[] = "$Id: blast_seq.c,v 1.73 2005/11/09 14:49:49 camacho Exp $"; /* * =========================================================================== * @@ -44,23 +44,23 @@ static char const rcsid[] = "$Id: blast_seq.c,v 1.70 2005/07/27 12:38:18 madden */ /** Converts a SeqLocPtr to a BlastSeqLoc, used for formatting. 
- * @param mask_slp SeqLocPtr to be converted [in] + * @param slp SeqLocPtr to be converted [in] * @param head_loc BlastSeqLoc returned from last call [in] * @return pointer to BlastSeqLoc */ static BlastSeqLoc* -s_BlastSeqLocFromSeqLoc(SeqLocPtr mask_slp, BlastSeqLoc* head_loc) +s_BlastSeqLocFromSeqLoc(SeqLocPtr slp, BlastSeqLoc* head_loc) { BlastSeqLoc* last_loc = head_loc; - if (mask_slp == NULL) + if (slp == NULL) return NULL; - if (mask_slp->choice == SEQLOC_PACKED_INT) - mask_slp = (SeqLocPtr) mask_slp->data.ptrvalue; + if (slp->choice == SEQLOC_PACKED_INT) + slp = (SeqLocPtr) slp->data.ptrvalue; - for ( ; mask_slp; mask_slp = mask_slp->next) { - SeqIntPtr si = (SeqIntPtr) mask_slp->data.ptrvalue; + for ( ; slp; slp = slp->next) { + SeqIntPtr si = (SeqIntPtr) slp->data.ptrvalue; if (!head_loc) { last_loc = head_loc = BlastSeqLocNew(&last_loc, si->from, si->to); } else { @@ -71,43 +71,70 @@ s_BlastSeqLocFromSeqLoc(SeqLocPtr mask_slp, BlastSeqLoc* head_loc) } BlastMaskLoc* -BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* query_locs) +BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs, + EBlastProgramType program_number) { - const Int4 kNumSeqs = ValNodeLen(query_locs); - BlastMaskLoc* blast_mask; - Int4 tmp_index=0; - SeqLocPtr current_query_loc; + const Int4 kNumSeqs = ValNodeLen(query_seqlocs); + BlastMaskLoc* retval = NULL; + Int4 query_index = 0; + const unsigned int kNumContexts = BLAST_GetNumberOfContexts(program_number); + SeqLocPtr current_query_loc = NULL; - if (!mask_locs) + if (!mask_seqlocs) return NULL; - blast_mask = BlastMaskLocNew(kNumSeqs); + retval = BlastMaskLocNew(kNumSeqs*kNumContexts); - for (current_query_loc = query_locs; current_query_loc; - current_query_loc = current_query_loc->next) { - SeqLocPtr mask_var; - for (mask_var = mask_locs; mask_var; mask_var = mask_var->next) + for (current_query_loc = query_seqlocs, query_index = 0; + current_query_loc; + current_query_loc = current_query_loc->next, query_index++) { + + 
const int kCtxIndex = kNumContexts * query_index; /* context index */ + SeqLocPtr mask_slp = NULL; + + for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next) { - SeqLocPtr current_mask = (SeqLocPtr) mask_var->data.ptrvalue; + SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue; /* If mask is empty, advance to the next link in the mask chain. If mask Seq-id does not match sequence Seq-id, stay with this mask for the next link in the sequence Seq-loc chain. */ if (current_mask && - SeqIdMatch(SeqLocId(current_mask), SeqLocId(current_query_loc))) { - blast_mask->seqloc_array[tmp_index] = - s_BlastSeqLocFromSeqLoc(current_mask, blast_mask->seqloc_array[tmp_index]); + SeqIdMatch(SeqLocId(current_mask), SeqLocId(current_query_loc))) + { + retval->seqloc_array[kCtxIndex] = + s_BlastSeqLocFromSeqLoc(current_mask, + retval->seqloc_array[kCtxIndex]); } } - if (blast_mask->seqloc_array[tmp_index]) + if (retval->seqloc_array[kCtxIndex]) { - BlastSeqLoc_RestrictToInterval(&blast_mask->seqloc_array[tmp_index], - SeqLocStart(current_query_loc), SeqLocStop(current_query_loc)); + const Boolean kIsNa = Blast_QueryIsNucleotide(program_number) && + !Blast_QueryIsTranslated(program_number) && + !Blast_ProgramIsPhiBlast(program_number); + BlastSeqLoc_RestrictToInterval(&retval->seqloc_array[kCtxIndex], + SeqLocStart(current_query_loc), + SeqLocStop(current_query_loc)); + if (kIsNa) { + /* N.B.: Unlike in the C++ APIs, this logic is only applied to + * non-translated nucleotide queries. 
See comment for + * BlastMaskLocDNAToProtein */ + Uint1 strand = SeqLocStrand(current_query_loc); + if (strand == Seq_strand_minus) { + retval->seqloc_array[kCtxIndex+1] = + retval->seqloc_array[kCtxIndex]; + retval->seqloc_array[kCtxIndex] = NULL; + } else if (strand == Seq_strand_plus) { + retval->seqloc_array[kCtxIndex+1] = NULL; + } else { + retval->seqloc_array[kCtxIndex+1] = + BlastSeqLocListDup(retval->seqloc_array[kCtxIndex]); + } + } } - tmp_index++; } - return blast_mask; + return retval; } SeqLoc* @@ -124,12 +151,10 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number, const BlastMaskLoc* mask_loc, SeqLoc* query_loc) { - SeqLocPtr mask_head = NULL, last_mask = NULL; + SeqLocPtr retval = NULL, retval_tail = NULL; Int4 index; - const Boolean k_translate = (program_number == eBlastTypeBlastx || - program_number == eBlastTypeTblastx || - program_number == eBlastTypeRpsTblastn); - const Uint1 k_num_frames = (k_translate ? NUM_FRAMES : 1); + const Boolean k_translate = Blast_QueryIsTranslated(program_number); + const Uint1 k_num_frames = BLAST_GetNumberOfContexts(program_number); SeqLoc* slp; if (mask_loc == NULL || mask_loc->seqloc_array == NULL) @@ -137,27 +162,25 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number, for (index=0, slp = query_loc; slp; ++index, slp = slp->next) { - Int4 frame_index = index*k_num_frames; + const int kCtxIndex = k_num_frames * index; /* context index */ Int4 tmp_index; Int4 slp_from = SeqLocStart(slp); SeqIdPtr seqid = SeqLocId(slp); - for (tmp_index=frame_index; tmp_index<(frame_index+k_num_frames); tmp_index++) + for (tmp_index=kCtxIndex; tmp_index<(kCtxIndex+k_num_frames); tmp_index++) { BlastSeqLoc* loc = NULL; - SeqLocPtr mask_slp_head = NULL, mask_slp_last = NULL; + SeqLocPtr mask_slp_head = NULL, mask_slp_tail = NULL; for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next) { - SSeqRange* di = loc->ssr; SeqIntPtr si = SeqIntNew(); - si->from = di->left + slp_from; - si->to = 
di->right + slp_from; + si->from = loc->ssr->left + slp_from; + si->to = loc->ssr->right + slp_from; si->id = SeqIdDup(seqid); - if (!mask_slp_last) - mask_slp_last = - ValNodeAddPointer(&mask_slp_head, SEQLOC_INT, si); - else - mask_slp_last = - ValNodeAddPointer(&mask_slp_last, SEQLOC_INT, si); + /* Append the pointer, but also keep track of the tail of the list + * so that appending to the list is a constant operation */ + mask_slp_tail = ValNodeAddPointer + ( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head), + SEQLOC_INT, si); } if (mask_slp_head) { @@ -172,15 +195,15 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number, else tmp_choice = 0; - if (!last_mask) { - last_mask = ValNodeAddPointer(&mask_head, tmp_choice, new_mask_slp); - } else { - last_mask = ValNodeAddPointer(&last_mask, tmp_choice, new_mask_slp); - } + /* Append the pointer, but also keep track of the tail of the list + * so that appending to the list is a constant operation */ + retval_tail = ValNodeAddPointer + ( (retval_tail ? &retval_tail : &retval), + tmp_choice, new_mask_slp); } } } - return mask_head; + return retval; } /** Set field values for one element of the context array of a @@ -465,7 +488,7 @@ Int2 BLAST_GeneticCodeFind(Int4 gc, Uint1** genetic_code) * @param query_info The query information structure, pre-initialized, * but filled here [in] * @param query_options Query setup options, containing the genetic code for - * translation [in] + * translation. N.B.: its strand_option field is ignored [in] * @param num_frames How many frames to get for this sequence? [in] * @param encoding In what encoding to retrieve the sequence? 
[in] * @param buffer_out Buffer to hold plus strand or protein [out] @@ -620,10 +643,10 @@ Int2 BLAST_SetUpQuery(EBlastProgramType program_number, return status; if (masking_locs) { - BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs, query_slp); - if (program_number == eBlastTypeBlastx || - program_number == eBlastTypeTblastx || - program_number == eBlastTypeRpsTblastn) + BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs, + query_slp, + program_number); + if (Blast_QueryIsTranslated(program_number)) BlastMaskLocDNAToProtein(lcase_mask, *query_info); (*query_blk)->lcase_mask = lcase_mask; (*query_blk)->lcase_mask_allocated = TRUE; diff --git a/algo/blast/api/blast_seq.h b/algo/blast/api/blast_seq.h index 0fdc83c4..4c287fda 100644 --- a/algo/blast/api/blast_seq.h +++ b/algo/blast/api/blast_seq.h @@ -1,4 +1,4 @@ -/* $Id: blast_seq.h,v 1.26 2005/07/27 12:34:46 madden Exp $ +/* $Id: blast_seq.h,v 1.27 2005/09/20 00:04:02 camacho Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -62,12 +62,14 @@ BlastMaskLocToSeqLoc(EBlastProgramType program_number, /** Convert a list of mask locations in a form of SeqLoc into a BlastMaskLoc * structure. In case of multiple queries, it is not required to create a mask * SeqLoc for every query. + * @param program_number identifies blastn, blastp, etc. [in] * @param mask_locs Masking locations [in] * @param seq_locs Sequence locations [in] * @return Allocated and populated BlastMaskLoc structure. */ BlastMaskLoc* -BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* seq_locs); +BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* seq_locs, + EBlastProgramType program_number); /** Frees a special type of SeqLoc list, used in BLAST for masking locations. 
* @param mask_loc Input list of mask SeqLocs [in] diff --git a/algo/blast/api/blast_tabular.c b/algo/blast/api/blast_tabular.c index 7cfe61c8..3f1901e6 100644 --- a/algo/blast/api/blast_tabular.c +++ b/algo/blast/api/blast_tabular.c @@ -1,4 +1,4 @@ -/* $Id: blast_tabular.c,v 1.29 2005/08/05 22:29:50 dondosha Exp $ +/* $Id: blast_tabular.c,v 1.30 2005/11/22 13:30:34 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -28,7 +28,7 @@ * On-the-fly tabular formatting of BLAST results */ -static char const rcsid[] = "$Id: blast_tabular.c,v 1.29 2005/08/05 22:29:50 dondosha Exp $"; +static char const rcsid[] = "$Id: blast_tabular.c,v 1.30 2005/11/22 13:30:34 madden Exp $"; #include <algo/blast/api/blast_tabular.h> #include <algo/blast/core/blast_util.h> @@ -308,8 +308,10 @@ void* Blast_TabularFormatThread(void* data) query_lengths = (Int4*) malloc(num_queries*sizeof(Int4)); for (index = 0, slp = tf_data->query_slp; slp; ++index, slp = slp->next) { - query_id_array[index] = SeqLocId(slp); - query_lengths[index] = SeqLocLen(slp); + BioseqPtr bsp = BioseqLockById(SeqLocId(slp)); + query_id_array[index] = SeqIdSetDup(bsp->id); + query_lengths[index] = BioseqGetLen(bsp); + BioseqUnlockById(SeqLocId(slp)); } one_seq_update_params = (BlastSeqSrcGetTotLen(seq_src) == 0); @@ -482,6 +484,11 @@ void* Blast_TabularFormatThread(void* data) BlastSequenceBlkFree(seq_arg.seq); + for (index = 0; index<num_queries; ++index) + { + SeqIdSetFree(query_id_array[index]); + query_id_array[index] = NULL; + } sfree(query_lengths); sfree(query_id_array); diff --git a/algo/blast/api/dust_filter.c b/algo/blast/api/dust_filter.c index 091ec11d..890fb6d0 100644 --- a/algo/blast/api/dust_filter.c +++ b/algo/blast/api/dust_filter.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: dust_filter.c,v 1.4 2005/08/17 16:24:44 dondosha Exp $"; +static char const rcsid[] = "$Id: dust_filter.c,v 1.5 2005/09/20 00:04:27 camacho Exp $"; /* * 
=========================================================================== @@ -57,6 +57,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info const Boolean kIsNucl = TRUE; Boolean no_forward_strand = (query_info->first_context > 0); /* filtering needed on reverse strand. */ SeqLoc* slp_var = query_seqloc; +Int4 qindex = 0; ASSERT(query_info && query_blk && filter_maskloc && query_seqloc); @@ -82,7 +83,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info if (!reverse || no_forward_strand) { BlastSeqLoc *filter_slp = NULL; /* Used to hold combined SeqLoc's */ - Int4 filter_index = BlastGetMaskLocIndexFromContext(kIsNucl, context); + Int4 filter_index = context; Int4 context_offset = query_info->contexts[context].query_offset; Uint1* buffer = &query_blk->sequence[context_offset]; SDustOptions* dust_options = filter_options->dustOptions; diff --git a/algo/blast/api/repeats_filter.c b/algo/blast/api/repeats_filter.c index 11c11466..36566dc2 100644 --- a/algo/blast/api/repeats_filter.c +++ b/algo/blast/api/repeats_filter.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: repeats_filter.c,v 1.7 2005/04/21 15:00:36 dondosha Exp $"; +static char const rcsid[] = "$Id: repeats_filter.c,v 1.12 2005/09/20 18:27:50 kans Exp $"; /* * =========================================================================== @@ -39,6 +39,7 @@ static char const rcsid[] = "$Id: repeats_filter.c,v 1.7 2005/04/21 15:00:36 don #include <algo/blast/api/repeats_filter.h> #include <algo/blast/api/blast_api.h> #include <algo/blast/core/blast_filter.h> +#include <algo/blast/core/blast_util.h> #include <algo/blast/api/blast_seq.h> #include <algo/blast/api/seqsrc_readdb.h> @@ -93,6 +94,8 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results, Int4 query_index; SeqLoc* slp; BlastMaskLoc* mask; + const EBlastProgramType kProgram = eBlastTypeBlastn; + const Uint4 kNumContexts = 
BLAST_GetNumberOfContexts(eBlastTypeBlastn); if (!query_seqloc || !mask_seqloc) return -1; @@ -104,14 +107,13 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results, } num_seqs = ValNodeLen(query_seqloc); - mask = BlastMaskLocNew(num_seqs); + mask = BlastMaskLocNew(num_seqs*kNumContexts); for (query_index = 0, slp = query_seqloc; slp; ++query_index, slp = slp->next) { Int4 query_length, query_start; Int4 hit_index; BlastSeqLoc* loc_list = NULL, *ordered_loc_list = NULL; - BlastSeqLoc* last_loc = NULL; BlastHitList* hit_list = results->hitlist_array[query_index]; if (!hit_list) { @@ -142,25 +144,19 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results, sequence. */ left += query_start; right += query_start; - /* If this is the first mask for this query, create a new - BlastSeqLoc, otherwise append to the end of the list. */ - if (!last_loc) - loc_list = last_loc = BlastSeqLocNew(NULL, left, right); - else - last_loc = BlastSeqLocNew(&last_loc, left, right); + BlastSeqLocNew(&loc_list, left, right); } } /* Make the intervals unique */ - CombineMaskLocations(loc_list, &ordered_loc_list, - REPEAT_MASK_LINK_VALUE); + ordered_loc_list = BlastSeqLocCombine(loc_list, REPEAT_MASK_LINK_VALUE); /* Free the list of locations that's no longer needed. */ loc_list = BlastSeqLocFree(loc_list); - mask->seqloc_array[query_index] = ordered_loc_list; + mask->seqloc_array[query_index*kNumContexts] = ordered_loc_list; } - *mask_seqloc = BlastMaskLocToSeqLoc(eBlastTypeBlastn, mask, query_seqloc); + *mask_seqloc = BlastMaskLocToSeqLoc(kProgram, mask, query_seqloc); mask = BlastMaskLocFree(mask); @@ -179,7 +175,6 @@ Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc, BlastSeqSrc* seq_src = NULL; SeqLoc* filter_loc = NULL; /* Dummy variable, since search will be performed without filtering. */ - Boolean mask_at_hash = FALSE; /* Dummy variable. 
*/ BlastHSPResults* results = NULL; SBlastFilterOptions* filtering_options = NULL; @@ -227,7 +222,7 @@ Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc, status = Blast_RunSearch(query_seqloc, seq_src, NULL, options, NULL, - &results, &filter_loc, &mask_at_hash, sum_returns); + &results, &filter_loc, sum_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ diff --git a/algo/blast/api/twoseq_api.c b/algo/blast/api/twoseq_api.c index 67fc4275..a414ec92 100644 --- a/algo/blast/api/twoseq_api.c +++ b/algo/blast/api/twoseq_api.c @@ -1,4 +1,4 @@ -/* $Id: twoseq_api.c,v 1.48 2005/06/06 15:40:17 papadopo Exp $ +/* $Id: twoseq_api.c,v 1.51 2005/10/20 20:58:58 madden Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -71,6 +71,8 @@ Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options) new_options->nucleotide_mismatch = -3; new_options->longest_intron = 0; new_options->init_seed_method = eDefaultSeedType; + new_options->gap_open = -1; + new_options->gap_extend = -1; *options = new_options; return 0; @@ -106,7 +108,6 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options; BlastDatabaseOptions* db_options = options->db_options; Boolean do_megablast = FALSE; - Boolean do_ag_blast = FALSE; Boolean do_discontig = FALSE; Int4 greedy_align = 0; Int2 word_size = basic_options->word_size; @@ -138,13 +139,6 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, greedy_align = 1; /* one-pass, no ungapped */ } - /* For a megablast search or a blastn search with - a non-default word size, turn on striding. 
Note that - striding is beneficial even if the wordsize is - smaller than the default */ - - if (word_size != 0 || do_megablast) - do_ag_blast = TRUE; /* If megablast was turned on but the input indicates a sensitive search is desired, or if word size is <=12, which is not used in contiguous @@ -157,8 +151,10 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, if (word_size == 0 || word_size > 12) word_size = 11; do_discontig = TRUE; - do_ag_blast = FALSE; } + + if (do_megablast && !do_discontig) + greedy_align = 1; } @@ -384,9 +380,12 @@ BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options, status = Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc, masking_locs, options, NULL, seqalign_out, filter_out, - mask_at_hash, extra_returns); + extra_returns); } + if (mask_at_hash) + *mask_at_hash = SBlastOptionsGetMaskAtHash(options); + options = SBlastOptionsFree(options); if (extra_returns_ptr) diff --git a/algo/blast/composition_adjustment/compo_heap.c b/algo/blast/composition_adjustment/compo_heap.c new file mode 100644 index 00000000..f3cb4d83 --- /dev/null +++ b/algo/blast/composition_adjustment/compo_heap.c @@ -0,0 +1,510 @@ +/* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. 
+* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** @file compo_heap.c + * @author E. Michael Gertz, Alejandro Schaffer + * + * Defines a "heap" data structure that is used to store computed alignments + * when composition adjustment of scoring matrices is used. + */ + +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: compo_heap.c,v 1.1 2005/12/01 13:48:09 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <assert.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/compo_heap.h> + + +/** Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but + * expensive consistency tests in the composition_adjustment + * module. + * + * This macro is usually used as part of a C-conditional + * if (COMPO_INTENSE_DEBUG) { + * perform expensive tests + * } + * The C compiler will then validate the code to perform the tests, but + * will almost always strip the code if COMPO_INTENSE_DEBUG is false. + */ +#ifndef COMPO_INTENSE_DEBUG +#define COMPO_INTENSE_DEBUG 0 +#endif + +/** The initial capacity of the heap will be set to the smaller of this + * and the heap threshold */ +#define HEAP_INITIAL_CAPACITY 100 +/** When the heap is about to exceed its capacity, it will be grown by + * the larger of a multiplicative factor of HEAP_RESIZE_FACTOR + * and an additive factor of HEAP_MIN_RESIZE. The heap never + * decreases in size */ +#define HEAP_RESIZE_FACTOR 1.5 +/** @sa HEAP_RESIZE_FACTOR */ +#define HEAP_MIN_RESIZE 100 + +/* Return -1/0/1 if a is less than/equal to/greater than b.
*/ +#define CMP(a,b) ((a)>(b) ? 1 : ((a)<(b) ? -1 : 0)) + + +/** + * The struct BlastCompo_HeapRecord data type is used below to define + * the internal structure of a BlastCompo_Heap (see below). A + * BlastCompo_HeapRecord represents all alignments of a query sequence + * to a particular matching sequence. + */ +struct BlastCompo_HeapRecord { + double bestEvalue; /**< best (smallest) evalue of all + alignments in the record */ + int bestScore; /**< best (largest) score; used to + break ties between records with + the same e-value */ + int subject_index; /**< index of the subject sequence in + the database */ + void * theseAlignments; /**< a collection of alignments */ +}; +typedef struct BlastCompo_HeapRecord BlastCompo_HeapRecord; + + +/** Compare two records in the heap. + * + * @return nonzero if place1 is worse than place2: it has the larger + * evalue, or on equal evalues the smaller score, or on equal + * evalue and score the larger subject_index; zero otherwise */ +static int +s_CompoHeapRecordCompare(BlastCompo_HeapRecord * place1, + BlastCompo_HeapRecord * place2) +{ + int result; + if (0 == (result = CMP(place1->bestEvalue, place2->bestEvalue)) && + 0 == (result = CMP(place2->bestScore, place1->bestScore))) { + result = CMP(place1->subject_index, place2->subject_index); + } + return result > 0; +} + + +/** Swap two records in the heap. */ +static void +s_CompoHeapRecordSwap(BlastCompo_HeapRecord * record1, + BlastCompo_HeapRecord * record2) +{ + /* bestEvalue, bestScore, theseAlignments and subject_index are temporary + * variables used to perform the swap.
*/ + double bestEvalue; + int bestScore, subject_index; + void * theseAlignments; + + bestEvalue = record1->bestEvalue; + record1->bestEvalue = record2->bestEvalue; + record2->bestEvalue = bestEvalue; + + bestScore = record1->bestScore; + record1->bestScore = record2->bestScore; + record2->bestScore = bestScore; + + subject_index = record1->subject_index; + record1->subject_index = record2->subject_index; + record2->subject_index = subject_index; + + theseAlignments = record1->theseAlignments; + record1->theseAlignments = record2->theseAlignments; + record2->theseAlignments = theseAlignments; +} + + +/** + * Verify that the subtree rooted at element i is ordered so as to be + * a valid heap. This routine checks every element in the + * subtree, and so is very time consuming. It is for debugging + * purposes only. + * + * @param heapArray array representing the heap (the root is element 1) + * @param i index of the root of the subtree to check + * @param n the size of the entire heap + * @return TRUE if no element of the subtree compares worse than its + * parent, zero otherwise + */ +static int +s_CompoHeapIsValid(BlastCompo_HeapRecord * heapArray, int i, int n) +{ + /* indices of nodes to the left and right of node i */ + int left = 2 * i, right = 2 * i + 1; + + /* Both children must be checked: each must not compare worse than + * node i and must itself be the root of a valid subtree. */ + if (left <= n && + (s_CompoHeapRecordCompare(&(heapArray[left]), + &(heapArray[i])) || + !s_CompoHeapIsValid(heapArray, left, n))) { + return 0; + } + if (right <= n && + (s_CompoHeapRecordCompare(&(heapArray[right]), + &(heapArray[i])) || + !s_CompoHeapIsValid(heapArray, right, n))) { + return 0; + } + return TRUE; +} + + +/** + * Relocate the top element of a subtree so that on exit the subtree + * is in valid heap order. On entry, all elements but the root of the + * subtree must be in valid heap order. + * + * @param heapArray array representing the heap stored as a binary tree + * @param top the index of the root element of a subtree + * @param n the size of the entire heap.
+ */ +static void +s_CompoHeapifyDown(BlastCompo_HeapRecord * heapArray, + int top, int n) +{ + int i, left, right, largest; /* placeholders for indices in swapping */ + + largest = top; + do { + i = largest; + /* The heap is 1-based: the children of node i are nodes 2i and 2i+1 */ + left = 2 * i; + right = 2 * i + 1; + if (left <= n && + s_CompoHeapRecordCompare(&heapArray[left], + &heapArray[i])) { + largest = left; + } else { + largest = i; + } + if (right <= n && + s_CompoHeapRecordCompare(&heapArray[right], + &heapArray[largest])) { + largest = right; + } + if (largest != i) { + s_CompoHeapRecordSwap(&heapArray[i], &heapArray[largest]); + } + } while (largest != i); + if (COMPO_INTENSE_DEBUG) { + assert(s_CompoHeapIsValid(heapArray, top, n)); + } +} + + +/** + * Relocate a leaf in the heap so that the entire heap is in valid + * heap order. On entry, all elements but the leaf must be in valid + * heap order. + * + * @param heapArray array representing the heap as a binary tree + * @param i element in heap array that may be out of order [in] + */ +static void +s_CompoHeapifyUp(BlastCompo_HeapRecord * heapArray, int i) +{ + int parent = i / 2; /* index to the node that is the + parent of node i */ + while (parent >= 1 && s_CompoHeapRecordCompare(&heapArray[i], + &heapArray[parent])) + { + s_CompoHeapRecordSwap(&heapArray[i], &heapArray[parent]); + + i = parent; + parent /= 2; + } + if (COMPO_INTENSE_DEBUG) { + /* i is now the final position of the relocated record, so this + * check covers only elements 1..i of the heap */ + assert(s_CompoHeapIsValid(heapArray, 1, i)); + } +} + + +/** Convert a BlastCompo_Heap from a representation as an unordered array to + * a representation as a heap-ordered array.
+ * + * @param self the BlastCompo_Heap to convert + */ +static void +s_ConvertToHeap(BlastCompo_Heap * self) +{ + if (NULL != self->array) { /* If we aren't already a heap */ + int i; /* heap node index */ + int n; /* number of elements in the heap */ + /* The storage is reused as is; only the pointer that owns it changes */ + self->heapArray = self->array; + self->array = NULL; + + n = self->n; + /* Nodes above n / 2 have no children, so only nodes n / 2 down to 1 + * need to be pushed down */ + for (i = n / 2; i >= 1; --i) { + s_CompoHeapifyDown(self->heapArray, i, n); + } + } + if (COMPO_INTENSE_DEBUG) { + assert(s_CompoHeapIsValid(self->heapArray, 1, self->n)); + } +} + + +/** Return true if self may insert a match that had the given eValue, + * score and subject_index. + * + * @param self a BlastCompo_Heap + * @param eValue the evalue to be tested. + * @param score the score to be tested + * @param subject_index the subject_index to be tested. + * @return nonzero if self may insert such a match, zero otherwise; the + * call may convert self from an unordered list into a heap + */ +int +BlastCompo_HeapWouldInsert(BlastCompo_Heap * self, + double eValue, + int score, + int subject_index) +{ + if (self->n < self->heapThreshold || + eValue <= self->ecutoff || + eValue < self->worstEvalue) { + return TRUE; + } else { + /* self is either currently a heap, or must be converted to + * one; use s_CompoHeapRecordCompare to compare against + * the worst element in the heap */ + BlastCompo_HeapRecord heapRecord; /* temporary record to + compare against */ + if (self->heapArray == NULL) s_ConvertToHeap(self); + + heapRecord.bestEvalue = eValue; + heapRecord.bestScore = score; + heapRecord.subject_index = subject_index; + heapRecord.theseAlignments = NULL; + + return s_CompoHeapRecordCompare(&self->heapArray[1], &heapRecord); + } +} + + +/** + * Insert a new heap record at the end of *array, possibly resizing + * the array to hold the new record.
+ * + * @param *array the array to receive the new record + * @param *length number of records already in *array + * @param *capacity allocated size of *array + * @param alignments a list of alignments + * @param eValue the best evalue among the alignments + * @param score the best score among the alignments + * @param subject_index the index of the subject sequence in the database + * @return 0 on success, -1 on failure (out-of-memory) + */ +static int +s_CompHeapRecordInsertAtEnd(BlastCompo_HeapRecord **array, + int * length, + int * capacity, + void * alignments, + double eValue, + int score, + int subject_index) +{ + BlastCompo_HeapRecord *heapRecord; /* destination for the new + alignments */ + if (*length >= *capacity) { + /* The destination array must be resized */ + int new_capacity; /* capacity of the resized heap */ + BlastCompo_HeapRecord * new_array; + + new_capacity = MAX(HEAP_MIN_RESIZE + *capacity, + (int) (HEAP_RESIZE_FACTOR * (*capacity))); + /* One extra slot is allocated because records are stored starting + * at index 1; on failure *array and *capacity are left untouched */ + new_array = realloc(*array, (new_capacity + 1) * + sizeof(BlastCompo_HeapRecord)); + if (new_array == NULL) { /* out of memory */ + return -1; + } + *array = new_array; + *capacity = new_capacity; + } + /* Pre-increment: the first record is stored at index 1, not 0 */ + heapRecord = &(*array)[++(*length)]; + heapRecord->bestEvalue = eValue; + heapRecord->bestScore = score; + heapRecord->theseAlignments = alignments; + heapRecord->subject_index = subject_index; + + return 0; +} + + +/** + * Try to insert a collection of alignments into a heap.
+ * + * @param self the heap + * @param alignments a collection of alignments, in an unspecified + * format + * @param eValue the best evalue among the alignments + * @param score the best score among the alignments + * @param subject_index the index of the subject sequence in the database + * @param discardedAlignments a collection of alignments that must be + * deleted (passed back to the calling routine + * as this routine does not know how to delete + * them); set to NULL when nothing is discarded + * @return 0 on success, -1 for out of memory */ +int +BlastCompo_HeapInsert(BlastCompo_Heap * self, + void * alignments, + double eValue, + int score, + int subject_index, + void ** discardedAlignments) +{ + *discardedAlignments = NULL; + if (self->array && self->n >= self->heapThreshold) { + s_ConvertToHeap(self); + } + if (self->array != NULL) { + /* "self" is currently a list. Add the new alignments to the end */ + int status = + s_CompHeapRecordInsertAtEnd(&self->array, &self->n, + &self->capacity, alignments, + eValue, score, + subject_index); + if (status != 0) { /* out of memory */ + return -1; + } + if (self->worstEvalue < eValue) { + self->worstEvalue = eValue; + } + } else { /* "self" is currently a heap */ + if (self->n < self->heapThreshold || + (eValue <= self->ecutoff && + self->worstEvalue <= self->ecutoff)) { + /* The new alignments must be inserted into the heap, and all old + * alignments retained */ + int status = + s_CompHeapRecordInsertAtEnd(&self->heapArray, + &self->n, + &self->capacity, + alignments, eValue, + score, subject_index); + if (status != 0) { /* out of memory */ + return -1; + } + s_CompoHeapifyUp(self->heapArray, self->n); + } else { + /* Some set of alignments must be discarded; discardedAlignments + * will hold a pointer to these alignments.
*/ + BlastCompo_HeapRecord heapRecord; /* Candidate record + for insertion */ + heapRecord.bestEvalue = eValue; + heapRecord.bestScore = score; + heapRecord.theseAlignments = alignments; + heapRecord.subject_index = subject_index; + + if (s_CompoHeapRecordCompare(&self->heapArray[1], + &heapRecord)) { + /* The new record should be inserted, and the largest + * element currently in the heap may be discarded */ + *discardedAlignments = self->heapArray[1].theseAlignments; + memcpy(&self->heapArray[1], &heapRecord, + sizeof(BlastCompo_HeapRecord)); + } else { + /* The candidate is no better than the worst record kept, so + * the candidate itself is handed back for deletion */ + *discardedAlignments = heapRecord.theseAlignments; + } + s_CompoHeapifyDown(self->heapArray, 1, self->n); + } + /* end else some set of alignments must be discarded */ + self->worstEvalue = self->heapArray[1].bestEvalue; + if (COMPO_INTENSE_DEBUG) { + assert(s_CompoHeapIsValid(self->heapArray, 1, self->n)); + } + } + /* end else "self" is currently a heap. */ + return 0; /* success */ +} + + +/** + * Return true if only matches with evalue <= self->ecutoff may be + * inserted. + * + * @param self a BlastCompo_Heap + */ +int +BlastCompo_HeapFilledToCutoff(const BlastCompo_Heap * self) +{ + return self->n >= self->heapThreshold && + self->worstEvalue <= self->ecutoff; +} + + +/** Initialize a new BlastCompo_Heap; parameters to this function correspond + * directly to fields in the BlastCompo_Heap + * + * @param self the BlastCompo_Heap to initialize + * @param heapThreshold value stored in BlastCompo_Heap::heapThreshold + * @param ecutoff value stored in BlastCompo_Heap::ecutoff + * @return 0 on success, -1 for out of memory */ +int +BlastCompo_HeapInitialize(BlastCompo_Heap * self, int heapThreshold, + double ecutoff) +{ + self->n = 0; + self->heapThreshold = heapThreshold; + self->ecutoff = ecutoff; + self->heapArray = NULL; + self->capacity = MIN(HEAP_INITIAL_CAPACITY, heapThreshold); + self->worstEvalue = 0; + /* Begin life as a list */ + self->array = calloc(self->capacity + 1, sizeof(BlastCompo_HeapRecord)); + + return self->array != NULL ? 0 : -1; +} + + +/** + * Release the storage associated with the fields of a BlastCompo_Heap. Don't + * delete the BlastCompo_Heap structure itself.
+ * + * @param self BlastCompo_Heap whose storage will be released + */ +void +BlastCompo_HeapRelease(BlastCompo_Heap * self) +{ + /* Only one of heapArray and array is expected to be non-NULL at a time */ + if (self->heapArray) free(self->heapArray); + if (self->array) free(self->array); + + self->n = self->capacity = self->heapThreshold = 0; + self->heapArray = NULL; self->array = NULL; +} + + +/** + * Remove and return the element in the BlastCompo_Heap with largest + * (worst) evalue; ties are broken according to the order specified + * by the s_CompoHeapRecordCompare routine. + * + * @param self a BlastCompo_Heap + * @return the alignments stored in the removed record, or NULL if the + * heap is empty + */ +void * +BlastCompo_HeapPop(BlastCompo_Heap * self) +{ + void * results = NULL; /* the list of SeqAligns to be returned */ + + s_ConvertToHeap(self); + if (self->n > 0) { /* The heap is not empty */ + BlastCompo_HeapRecord *first, *last; /* The first and last + elements of the array + that represents the + heap. */ + first = &self->heapArray[1]; + last = &self->heapArray[self->n]; + + results = first->theseAlignments; + if (--self->n > 0) { + /* The heap is still not empty */ + /* Move the last record to the root, then restore heap order */ + memcpy(first, last, sizeof(BlastCompo_HeapRecord)); + s_CompoHeapifyDown(self->heapArray, 1, self->n); + } + } + if (COMPO_INTENSE_DEBUG) { + assert(s_CompoHeapIsValid(self->heapArray, 1, self->n)); + } + return results; +} diff --git a/algo/blast/composition_adjustment/compo_heap.h b/algo/blast/composition_adjustment/compo_heap.h new file mode 100644 index 00000000..731edda7 --- /dev/null +++ b/algo/blast/composition_adjustment/compo_heap.h @@ -0,0 +1,124 @@ +/* $Id: compo_heap.h,v 1.1 2005/12/01 13:52:20 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted.
This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ + +/** @file compo_heap.h + * @author Alejandro Schaffer, E. Michael Gertz + * + * Declares a "heap" data structure that is used to store computed alignments + * when composition adjustment of scoring matrices is used. + */ + +#ifndef __COMPO_HEAP__ +#define __COMPO_HEAP__ + +#include <algo/blast/core/blast_export.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct BlastCompo_HeapRecord; + +/** + * A BlastCompo_Heap represents a collection of alignments between one + * query sequence and several matching subject sequences. + * + * Each matching sequence is allocated one record in a + * BlastCompo_Heap. The eValue of a query-subject pair is the best + * (smallest positive) evalue of all alignments between the two + * sequences. + * + * The comparison function for matches is BlastCompo_HeapRecordCompare. A + * match will be inserted in the BlastCompo_Heap if: + * - there are fewer than BlastCompo_Heap::heapThreshold elements in + * the BlastCompo_Heap; + * - the eValue of the match is <= BlastCompo_Heap::ecutoff; or + * - the match is less than (as determined by BlastCompo_HeapRecordCompare) the + * largest (worst) match already in the BlastCompo_Heap.
+ * + * If there are >= BlastCompo_Heap::heapThreshold matches already in + * the BlastCompo_Heap when a new match is to be inserted, then the + * largest match (as determined by BlastCompo_HeapRecordCompare) is + * removed, unless the eValue of the largest match <= + * BlastCompo_Heap::ecutoff. Matches with eValue <= + * BlastCompo_Heap::ecutoff are never removed by the insertion + * routine. As a consequence, the BlastCompo_Heap can hold an + * arbitrarily large number of matches, although it is atypical for + * the number of matches to be greater than + * BlastCompo_Heap::heapThreshold. + * + * Once all matches have been collected, the BlastCompo_HeapPop + * routine may be invoked to return all alignments in order. + * + * While the number of elements in a heap < BlastCompo_Heap::heapThreshold, + * the BlastCompo_Heap is implemented as an unordered array, rather + * than a heap-ordered array. The BlastCompo_Heap is converted to a + * heap-ordered array as soon as it becomes necessary to order the + * matches by evalue. The routines that operate on a BlastCompo_Heap + * should behave properly whichever state the BlastCompo_Heap is in. 
+ */ +typedef struct BlastCompo_Heap { + int n; /**< The current number of elements */ + int capacity; /**< The maximum number of elements + that may be inserted before the + BlastCompo_Heap must be resized, this + number must be >= heapThreshold */ + int heapThreshold; /**< see above */ + double ecutoff; /**< matches with evalue below ecutoff may + always be inserted in the BlastCompo_Heap */ + double worstEvalue; /**< the worst (biggest) evalue currently in + the heap */ + + struct BlastCompo_HeapRecord *array; /**< the records while the + BlastCompo_Heap is an unordered + list; NULL once it has been + converted to a heap */ + struct BlastCompo_HeapRecord *heapArray; /**< the records while the + BlastCompo_Heap is heap-ordered; + NULL while it is still a list */ +} BlastCompo_Heap; + + +NCBI_XBLAST_EXPORT +int BlastCompo_HeapWouldInsert(BlastCompo_Heap * self, double eValue, + int score, int subject_index); +NCBI_XBLAST_EXPORT +int BlastCompo_HeapInsert(BlastCompo_Heap * self, void * alignments, + double eValue, int score, int + subject_index, void ** discardedAligns); + +NCBI_XBLAST_EXPORT +int BlastCompo_HeapFilledToCutoff(const BlastCompo_Heap * self); + +NCBI_XBLAST_EXPORT +int BlastCompo_HeapInitialize(BlastCompo_Heap * self, int heapThreshold, + double ecutoff); + +NCBI_XBLAST_EXPORT +void BlastCompo_HeapRelease(BlastCompo_Heap * self); + +NCBI_XBLAST_EXPORT +void * BlastCompo_HeapPop(BlastCompo_Heap * self); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/blast/composition_adjustment/compo_mode_condition.c b/algo/blast/composition_adjustment/compo_mode_condition.c index 2489f657..95344dc1 100644 --- a/algo/blast/composition_adjustment/compo_mode_condition.c +++ b/algo/blast/composition_adjustment/compo_mode_condition.c @@ -1,5 +1,3 @@ -static char const rcsid[] = "$Id: Mode_condition.c,v 1.1 2005/05/16 16:11:41 papadopo Exp $"; - /* =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -24,119 +22,110 @@ static char const rcsid[] = "$Id: Mode_condition.c,v 1.1 2005/05/16 16:11:41 pap * * ===========================================================================*/
-/***************************************************************************** - -File name: Mode_condition.c - -Authors: Alejandro Schaffer, Yi-Kuo Yu - -Contents: Functions to test whether conditional score matrix - adjustment should be applied for a pair of matching sequences. - -******************************************************************************/ -/* - * $Log: Mode_condition.c,v $ - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision +/** + * @file compo_mode_condition.c * + * Authors: Alejandro Schaffer, Yi-Kuo Yu + * + * Functions to test whether conditional score matrix adjustment + * should be applied for a pair of matching sequences. */ -#include <ncbi.h> -#include <NRdefs.h> -#include <Mode_condition.h> +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: compo_mode_condition.c,v 1.5 2005/12/01 13:49:43 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ -double BLOSUM62_bg[Alphsize] = - { 0.0742356686, 0.0515874541, 0.0446395713, 0.0536092024, 0.0246865086, - 0.0342500470, 0.0543174458, 0.0741431988, 0.0262119099, 0.0679331197, - 0.0989057232, 0.0581774322, 0.0249972837, 0.0473970070, 0.0385382904, - 0.0572279733, 0.0508996546, 0.0130298868, 0.0322925130, 0.0729201182 - }; - /* BLOSUM 62 is the correct bg */ +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/composition_adjustment.h> +#include <algo/blast/composition_adjustment/compo_mode_condition.h> +#include <algo/blast/composition_adjustment/matrix_frequency_data.h> +/* 180 degrees in half a circle */ #define HALF_CIRCLE_DEGREES 180 +/* some digits of PI */ #define PI 3.1415926543 +/* thresholds used to determine which composition mode to use */ #define QUERY_MATCH_DISTANCE_THRESHOLD 0.16 #define LENGTH_RATIO_THRESHOLD 3.0 -#define ANGLE_DEGREE_THRESHOLD 70 - -/* declaration of Htype function for future use - * - * typedef int (*Condition) (int , int , int *, int *, char *); - * - * variable orders: Queryseq_length, 
Matchseq_length, - * query_amino_count, match_amino_account, matrix_name - */ - -Int4 TestToApplyREAdjustmentUnconditional(Int4, - Int4, - Nlm_FloatHi *, - Nlm_FloatHi *, - char *); +#define ANGLE_DEGREE_THRESHOLD 70.0 -Int4 TestToApplyREAdjustmentConditional(Int4, - Int4, - Nlm_FloatHi *, - Nlm_FloatHi *, - char *); +/* type of function used to choose a mode for composition-based + * statistics. The variables are Queryseq_length, Matchseq_length, + * query_amino_count, match_amino_account and matrix_name.*/ +typedef ECompoAdjustModes +(*Condition) (int, int, const double *, const double *, + const char *); -/* If this function is used relative-entropy score adjustment is +/* A function used to choose a mode for composition-based statistics. + * If this function is used relative-entropy score adjustment is * always applied, with a fixed value as the target relative entropy*/ -Int4 -TestToApplyREAdjustmentUnconditional(Int4 Len_query, - Int4 Len_match, - Nlm_FloatHi * P_query, - Nlm_FloatHi * P_match, - char *matrix_name) +static ECompoAdjustModes +TestToApplyREAdjustmentUnconditional(int Len_query, + int Len_match, + const double * P_query, + const double * P_match, + const char *matrix_name) { - return RE_USER_SPECIFIED; + /* Suppress unused variable warnings */ + (void) Len_query; + (void) Len_match; + (void) P_query; + (void) P_match; + (void) matrix_name; + + return eUserSpecifiedRelEntropy; } -/* Decide whether a relative-entropy score adjustment should be used +/** + * A function used to choose a mode for composition-based statistics. 
+ * Decide whether a relative-entropy score adjustment should be used * based on lengths and letter counts of the two matched sequences; * matrix_name is the underlying score matrix; for now only BLOSUM62 * is supported */ -Int4 -TestToApplyREAdjustmentConditional(Int4 Len_query, - Int4 Len_match, - Nlm_FloatHi * P_query, - Nlm_FloatHi * P_match, - char *matrix_name) +static ECompoAdjustModes +TestToApplyREAdjustmentConditional(int Len_query, + int Len_match, + const double * P_query, + const double * P_match, + const char *matrix_name) { - Int4 mode_value; /* which relative entropy mode to return */ - Int4 i; /* loop indices */ - Nlm_FloatHi p_query[Alphsize], p_match[Alphsize]; /*letter probabilities - *for query and match*/ - Nlm_FloatHi *p_matrix; /* letter probabilities used in constructing - * matrix name*/ - Nlm_FloatHi D_m_mat, D_q_mat, D_m_q; /* distances between - * match and original - * between query and - * original between - * match and query*/ - Nlm_FloatHi corr_factor = 0.0; /* correlation between how - p_query and p_match deviate - from p_matrix */ - Nlm_FloatHi len_q, len_m; /* lengths of query and matching - sequence in floating point */ - Nlm_FloatHi len_large, len_small; /* store the larger and smaller of - * len_q and len_m */ - Nlm_FloatHi angle; /* angle between query and match - probabilities */ - - p_matrix = Get_bg_freq(matrix_name); - - for(i = 0; i < Alphsize; i++) { + ECompoAdjustModes mode_value; /* which relative entropy mode to + return */ + int i; /* loop indices */ + double p_query[COMPO_NUM_TRUE_AA]; + double p_match[COMPO_NUM_TRUE_AA]; /*letter probabilities + for query and match*/ + const double *p_matrix; /* letter probabilities used in + constructing matrix name*/ + double D_m_mat, D_q_mat, D_m_q; /* distances between match and + original between query and + original between match and + query*/ + double corr_factor = 0.0; /* correlation between how p_query + and p_match deviate from p_matrix + */ + double len_q, len_m; /* 
lengths of query and matching + sequence in floating point */ + double len_large, len_small; /* store the larger and smaller of + len_q and len_m */ + double angle; /* angle between query and match + probabilities */ + + p_matrix = Blast_GetMatrixBackgroundFreq(matrix_name); + + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { p_query[i] = P_query[i]; p_match[i] = P_match[i]; corr_factor += (p_query[i] - p_matrix[i]) * (p_match[i] - p_matrix[i]); } - D_m_mat = Get_RE(p_match, p_matrix); - D_q_mat = Get_RE(p_query, p_matrix); - D_m_q = Get_RE(p_match, p_query); /* distance between match and query */ + D_m_mat = Blast_GetRelativeEntropy(p_match, p_matrix); + D_q_mat = Blast_GetRelativeEntropy(p_query, p_matrix); + D_m_q = Blast_GetRelativeEntropy(p_match, p_query); angle = acos((D_m_mat * D_m_mat + D_q_mat * D_q_mat - @@ -146,61 +135,54 @@ TestToApplyREAdjustmentConditional(Int4 Len_query, len_q = 1.0 * Len_query; len_m = 1.0 * Len_match; - if(len_q > len_m) { + if (len_q > len_m) { len_large = len_q; len_small = len_m; } else { len_large = len_m; len_small = len_q; } - - if((D_m_q > QUERY_MATCH_DISTANCE_THRESHOLD) && - (len_large / len_small > LENGTH_RATIO_THRESHOLD) && - (angle > ANGLE_DEGREE_THRESHOLD)) { - mode_value = KEEP_OLD_MATRIX; + if ((D_m_q > QUERY_MATCH_DISTANCE_THRESHOLD) && + (len_large / len_small > LENGTH_RATIO_THRESHOLD) && + (angle > ANGLE_DEGREE_THRESHOLD)) { + mode_value = eCompoKeepOldMatrix; } else { - mode_value = RE_USER_SPECIFIED; + mode_value = eUserSpecifiedRelEntropy; } - return mode_value; } -/* Retrieve the background letter probabilities implicitly used in - * constructing the score matrix matrix_name*/ -Nlm_FloatHi * -Get_bg_freq(char *matrix_name) -{ - if(0 == strcmp(matrix_name, "BLOSUM62")) { - return BLOSUM62_bg; - } else { /* default */ - printf("matrix not supported, exit now! 
\n"); - exit(1); - } -} +/** + * An array of functions that can be used to decide which optimization + * formulation should be used for score adjustment */ +static Condition Cond_func[] = { + TestToApplyREAdjustmentConditional, + TestToApplyREAdjustmentUnconditional, + NULL +}; -/* initialization of array of functions that can be used to decide - * which optimization formulation should be used for score - * adjustment */ -Condition Cond_func[] ={ TestToApplyREAdjustmentConditional, - TestToApplyREAdjustmentUnconditional, - NULL }; - -/* Choose how the relative entropy should be constrained based on - * properties of the two sequences to be aligned. length1 an length2 - * are the lengths of the two sequences; probArray1 and probArray2 are - * arrays of probabilities of letters in each sequence, using the - * 20-letter alphabet; matrixName is the name of the underlying 20x20 - * score matrix; testFunctionIndex allows different rules to be tested - * for the relative entropy decision. */ -Int4 -chooseMode(Int4 length1, - Int4 length2, - Nlm_FloatHi * probArray1, - Nlm_FloatHi * probArray2, - char *matrixName, - Int4 testFunctionIndex) +/** + * Choose how the relative entropy should be constrained based on + * properties of the two sequences to be aligned. + * + * @param length1 length of the first sequence + * @param length2 length of the second sequence + * @param probArray1 arrays of probabilities for the first sequence, in + * a 20 letter amino-acid alphabet + * @param probArray2 arrays of probabilities for the other sequence + * @param matrixName name of the scoring matrix + * @param testFunctionIndex allows different rules to be tested + * for the relative entropy decision. 
+ */ +ECompoAdjustModes +Blast_ChooseCompoAdjustMode(int length1, + int length2, + const double * probArray1, + const double * probArray2, + const char *matrixName, + int testFunctionIndex) { return Cond_func[testFunctionIndex] (length1, length2, diff --git a/algo/blast/composition_adjustment/compo_mode_condition.h b/algo/blast/composition_adjustment/compo_mode_condition.h index ff8943ee..9ad535a9 100644 --- a/algo/blast/composition_adjustment/compo_mode_condition.h +++ b/algo/blast/composition_adjustment/compo_mode_condition.h @@ -1,79 +1,56 @@ -/* =========================================================================== -* -* PUBLIC DOMAIN NOTICE -* National Center for Biotechnology Information -* -* This software/database is a "United States Government Work" under the -* terms of the United States Copyright Act. It was written as part of -* the author's official duties as a United States Government employee and -* thus cannot be copyrighted. This software/database is freely available -* to the public for use. The National Library of Medicine and the U.S. -* Government have not placed any restriction on its use or reproduction. -* -* Although all reasonable efforts have been taken to ensure the accuracy -* and reliability of the software and data, the NLM and the U.S. -* Government do not and cannot warrant the performance or results that -* may be obtained by using this software or data. The NLM and the U.S. -* Government disclaim all warranties, express or implied, including -* warranties of performance, merchantability or fitness for any particular -* purpose. -* -* Please cite the author in any work or product based on this material. 
-* -* ===========================================================================*/ - -/***************************************************************************** - -File name: Mode_condition.h - -Authors: Alejandro Schaffer, Yi-Kuo Yu - -Contents: Definitions used only in Mode_condition.c - -******************************************************************************/ -/* - * $Log: Mode_condition.h,v $ - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision +/* $Id: compo_mode_condition.h,v 1.5 2005/12/01 13:54:04 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file compo_mode_condition.h + * @author Alejandro Schaffer, Yi-Kuo Yu + * + * Declarations of functions used to choose the mode for + * composition-based statistics. 
*/ -#ifndef MODE_CONDITION -#define MODE_CONDITION -#define Mode_1_per 0.3 -#define Mode_unchange_per 0.6 -#define RE_mode_1_limit 0.18 +#ifndef __COMPO_MODE_CONDITION__ +#define __COMPO_MODE_CONDITION__ -double *Get_bg_freq(char *matrix_name); +#include <algo/blast/core/blast_export.h> -/* declaration of function type for future use - * - * variable orders: Queryseq_length, Matchseq_length, query_amino_count, - * match_amino_account, matrix_name - * - * return values for both Test_0 and Test_1 - * -1: no adjustment; 0: mode 0 (unconstrained); - * 1: mode 1 (with RE in new context) - */ -typedef Int4 (*Condition) (Int4 , Int4 , - Nlm_FloatHi *, Nlm_FloatHi *, char *); +#ifdef __cplusplus +extern "C" { +#endif -Int4 -TestToApplyREAdjustmentUnconditional(Int4 Len_query, - Int4 Len_match, - Nlm_FloatHi * P_query, - Nlm_FloatHi * P_match, - char *matrix_name); -Int4 -TestToApplyREAdjustmentConditional(Int4 Len_query, - Int4 Len_match, - Nlm_FloatHi * P_query, - Nlm_FloatHi * P_match, - char *matrix_name); +#include <algo/blast/composition_adjustment/composition_constants.h> -Int4 -chooseMode(Int4 length1, Int4 length2, - Nlm_FloatHi * probArray1, Nlm_FloatHi * probArray2, - char *matrixName, Int4 testFunctionIndex); +NCBI_XBLAST_EXPORT +ECompoAdjustModes +Blast_ChooseCompoAdjustMode(int length1, int length2, + const double * probArray1, + const double * probArray2, + const char * matrixName, + int testFunctionIndex); + +#ifdef __cplusplus +} +#endif #endif diff --git a/algo/blast/composition_adjustment/composition_adjustment.c b/algo/blast/composition_adjustment/composition_adjustment.c new file mode 100644 index 00000000..2778dfa5 --- /dev/null +++ b/algo/blast/composition_adjustment/composition_adjustment.c @@ -0,0 +1,1376 @@ +/* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of 
the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** @file composition_adjustment.c + * + * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz + * + * Highest level functions to solve the optimization problem for + * compositional score matrix adjustment. 
+ */ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: composition_adjustment.c,v 1.6 2005/12/01 13:51:03 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <limits.h> +#include <assert.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/composition_constants.h> +#include <algo/blast/composition_adjustment/composition_adjustment.h> +#include <algo/blast/composition_adjustment/matrix_frequency_data.h> +#include <algo/blast/composition_adjustment/nlm_linear_algebra.h> +#include <algo/blast/composition_adjustment/optimize_target_freq.h> + +/**positions of true characters in protein alphabet*/ +static int trueCharPositions[COMPO_NUM_TRUE_AA] = +{1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22}; + +/** + * conversion from 26 letter NCBIstdaa alphabet to 20 letter order + * for true amino acids: ARNDCQEGHILKMFPSTWYV. This order is + * alphabetical in the standard three-letter abbreviation of each + * amino acid */ +static int alphaConvert[COMPO_PROTEIN_ALPHABET] = + {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, + 16, 19, 17, (-1), 18, (-1), (-1), (-1)}; + + +/** + * Desired margin between an end of region used for computing a + * composition, and the nearest StopChar; the desired margin may + * not be attained. */ +static const int kCompositionMargin = 20; + +#define SCORE_BOUND 0.0000000001 /* average scores below + -SCORE_BOUND are considered + effectively nonnegative, and + Newton's method will + will terminate */ +#define LAMBDA_STEP_FRACTION 0.5 /* default step fraction in + Newton's method */ +#define INITIAL_LAMBDA 1.0 /* initial value for Newton's + method */ +#define LAMBDA_ITERATION_LIMIT 300 /* iteration limit for Newton's + method. 
*/ +#define LAMBDA_ERROR_TOLERANCE 0.0000001 /* bound on error for estimating + lambda */ + +/* bound on error for Newton's method */ +static const double kCompoAdjustErrTolerance = 0.00000001; +/* iteration limit for Newton's method */ +static const int kCompoAdjustIterationLimit = 2000; +/** relative entropy of BLOSUM62 */ +static const double kFixedReBlosum62 = 0.44; + +/** + * Find the weighted average of a set of observed probabilities with a + * set of "background" probabilities. All array parameters have + * length COMPO_NUM_TRUE_AA. + * + * @param probs_with_pseudo an array of weighted averages [out] + * @param normalized_probs observed frequencies, normalized to sum + * to 1.0 [out] + * @param observed_freq observed frequencies, not necessarily + * normalized to sum to 1.0. [in] + * @param background_probs the probability of characters in a + * standard sequence. + * @param number_of_observations the number of characters used to + * form the observed_freq array + * @param pseudocounts the number of "standard" characters + * to be added to form the weighted + * average. 
+ */ +static void +Blast_ApplyPseudocounts(double * probs_with_pseudo, + double * normalized_probs, + const double * observed_freq, + int number_of_observations, + const double * background_probs, + int pseudocounts) +{ + int i; /* loop index */ + double weight; /* weight assigned to pseudocounts */ + double sum; /* sum of the observed frequencies */ + double dpseudocounts; /* pseudocounts as a double */ + + dpseudocounts = pseudocounts; + + /* Normalize probabilities */ + sum = 0.0; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + sum += observed_freq[i]; + } + if (sum > 0) { + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + normalized_probs[i] = observed_freq[i]/sum; + } + } + weight = dpseudocounts / (number_of_observations + dpseudocounts); + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + probs_with_pseudo[i] = + (1.0 - weight) * normalized_probs[i] + + weight * background_probs[i]; + } +} + + +/** + * Create a score matrix from a set of target frequencies. The scores + * are scaled so that the Karlin-Altschul statistical parameter Lambda + * equals (within numerical precision) 1.0. 
+ * + * @param score the new score matrix [out] + * @param alphasize the number of rows and columns of score + * @param freq a matrix of target frequencies [in] + * @param row_sum sum of each row of freq [in] + * @param col_sum sum of each column of freq[in] + */ +static void +Blast_ScoreMatrixFromFreq(double ** score, int alphsize, double ** freq, + const double row_sum[], const double col_sum[]) +{ + int i, j; /* array indices */ + double sum; /* sum of values in freq; used to normalize freq */ + + sum = 0.0; + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { + sum += freq[i][j]; + } + } + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { + score[i][j] = log(freq[i][j] / sum / row_sum[i] / col_sum[j]); + } + } +} + + +/** + * Compute the symmetric form of the relative entropy of two + * probability vectors + * + * In this software relative entropy is expressed in "nats", + * meaning that logarithms are base e. In some other scientific + * and engineering domains where entropy is used, the logarithms + * are taken base 2 and the entropy is expressed in bits. + * + * @param A an array of length COMPO_NUM_TRUE_AA of + * probabilities. + * @param B a second array of length COMPO_NUM_TRUE_AA of + * probabilities. + */ +double +Blast_GetRelativeEntropy(const double A[], const double B[]) +{ + int i; /* loop index over letters */ + double temp; /* intermediate term */ + double value = 0.0; /* square of relative entropy */ + + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + temp = (A[i] + B[i]) / 2; + if (temp > 0) { + if (A[i] > 0) { + value += A[i] * log(A[i] / temp) / 2; + } + if (B[i] > 0) { + value += B[i] * log(B[i] / temp) / 2; + } + } + } + if (value < 0) { /* must be numerical rounding error */ + value = 0; + } + return sqrt(value); +} + + +/** + * Convert letter probabilities from a 26-letter NCBIstdaa alphabet to + * a 20 letter ARND... amino acid alphabet. 
(@see alphaConvert) + * + * @param inputLetterProbs the 26-letter probabilities [in] + * @param outputLetterProbs the 20-letter probabilities [out] + */ +static void +s_GatherLetterProbs(double * outputLetterProbs, + const double * inputLetterProbs) +{ + int c; /*index over characters*/ + + for (c = 0; c < COMPO_PROTEIN_ALPHABET; c++) { + if ((-1) != alphaConvert[c]) { + outputLetterProbs[alphaConvert[c]] = inputLetterProbs[c]; + } + } +} + + +/** + * Scatter and scale a matrix of scores for a 20 letter ARND... amino + * acid alphabet into a matrix for a 26 letter NCBIstdaa alphabet + * (@see alphaConvert), leaving scores for any character not present + * in the smaller alphabet untouched. + * + * @param dMatrix frequency ratios for the 26 letter alphabet [out] + * @param dMatrixTrueAA frequency ratios for the 20 letter alphabet [in] + * @param scale multiply the elements in dMatrixTrueAA by + * scale when applying the scatter. + */ +static void +s_ScatterScores(double ** dMatrix, + double scale, + double ** dMatrixTrueAA) +{ + int p, c; /*indices over positions and characters*/ + + for (p = 0; p < COMPO_PROTEIN_ALPHABET; p++) { + for (c = 0; c < COMPO_PROTEIN_ALPHABET; c++) { + if (((-1) != alphaConvert[p]) && ((-1) != alphaConvert[c])) { + dMatrix[p][c] = + scale * dMatrixTrueAA[alphaConvert[p]][alphaConvert[c]]; + } + } + } +} + + +/** + * Average the scores for two characters to get scores for an + * ambiguity character than represents either of the two original + * characters. + * + * @param dMatrix score matrix -- on entry contains the score data + * for characters A and B, and on exit also contains + * the score data for ambigAB. 
+ * @param A a character in the alphabet + * @param B another character in the alphabet + * @param ambigAB the combined ambiguity character + */ +static void +Blast_AverageScores(double ** dMatrix, int A, int B, int ambigAB) +{ + int i; /* iteration index */ + double sum; /* sum of scores */ + + for (i = 0; i < COMPO_PROTEIN_ALPHABET; i++) { + if (-1 != alphaConvert[i]) { + sum = dMatrix[i][A] + dMatrix[i][B]; + dMatrix[i][ambigAB] = sum/2.0; + sum = dMatrix[A][i] + dMatrix[B][i]; + dMatrix[ambigAB][i] = sum/2.0; + } + } + /* Because ambiguity characters are rare, we assume a match of + * ambiguity characters represents a match of the true residues, and + * so only include matches when computing the average score. */ + sum = dMatrix[A][A] + dMatrix[B][B]; + dMatrix[ambigAB][ambigAB] = sum/2.0; +} + + +/** + * Set scores for substitutions that involve the nonstandard amino + * acids in the NCBIstdaa alphabet: the ambiguity characters 'B' and + * 'Z'; the "don't care" character 'X'; the atypical amino acid 'U' + * (Selenocysteine); the stop codon '*'; and the gap or end of + * sequence character '-'. + * + * @param dMatrix a matrix that on entry contains scores for all the + * true amino acids, and on exit also contains the + * scores for the nonstandard amino acids. + * @param startMatrix rounded amino acid substitution scores in + * standard context [in] + */ +static void +Blast_SetNonstandardAaScores(double **dMatrix, int **startMatrix) +{ + int i; /* loop index */ + /* An array containing those special characters whose score will be + set using startFreqRatios */ + int specialChars[4] = + { eGapChar, eXchar, eSelenocysteine, eStopChar }; + + /* Set the scores for ambiguity characters B and Z */ + Blast_AverageScores(dMatrix, eDchar, eNchar, eBchar); + Blast_AverageScores(dMatrix, eEchar, eQchar, eZchar); + + /* (B,Z) mismatches are so rare we simply set their score to zero. 
*/ + dMatrix[eBchar][eZchar] = dMatrix[eZchar][eBchar] = 0.0; + + /* Set the other characters using the startMatrix */ + for (i = 0; i < 4; i++) { + int A, B; /* Two characters in the alphabet */ + A = specialChars[i]; + for (B = 0; B < COMPO_PROTEIN_ALPHABET; B++) { + dMatrix[A][B] = startMatrix[A][B]; + dMatrix[B][A] = startMatrix[B][A]; + } + } +} + + +/** Return the nearest integer to x. */ +static long Nint(double x) +{ + x += (x >= 0. ? 0.5 : -0.5); + return (long)x; +} + + +/** + * Round a matrix of floating point scores. + * + * @param matrix the matrix of integer valued scores [out] + * @param floatScoreMatrix the matrix of floating point valued + * scores [in] + * @param numPositions the number of rows of the matrices. + */ +static void +s_RoundScoreMatrix(int **matrix, double **floatScoreMatrix, + int numPositions) +{ + int p, c; /*indices over positions and characters*/ + + for (p = 0; p < numPositions; p++) { + for (c = 0; c < COMPO_PROTEIN_ALPHABET; c++) { + if (floatScoreMatrix[p][c] < INT_MIN) { + matrix[p][c] = INT_MIN; + } else { + matrix[p][c] = Nint(floatScoreMatrix[p][c]); + } + } + } +} + + +/** + * Find the range of scores contained in an scoring matrix. 
+ * @param obs_min smallest value in the matrix + * @param obs_max largest value in the matrix + * @param matrix a matrix with COMPO_NUM_TRUE_AA columns + * @param rows number of rows in the matrix + */ +static void s_GetScoreRange(int * obs_min, int * obs_max, + int ** matrix, int rows) +{ + int aa; /* index of an amino-acid in the 20 + letter alphabet */ + int irow, jcol; /* matrix row and column indices */ + int minScore, maxScore; /* largest and smallest observed scores */ + + minScore = maxScore = 0; + for (irow = 0; irow < rows; irow++) { + for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) { + jcol = trueCharPositions[aa]; + if (matrix[irow][jcol] < minScore && + matrix[irow][jcol] > COMPO_SCORE_MIN) + minScore = matrix[irow][jcol]; + if (matrix[irow][jcol] > maxScore) + maxScore = matrix[irow][jcol]; + } + } + *obs_min = minScore; + *obs_max = maxScore; +} + + +/** + * Compute the score probabilities for a given amino acid substitution matrix + * in the context of given query and subject amino acid frequencies. + * + * @param *obs_min the smallest score in the score matrix [out] + * @param *obs_max the largest score in the score matrix [out] + * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1), + * of score probabilities, where (*scoreProb)[0] is + * the probability for score *obs_min. 
+ * @param matrix a amino-acid substitution matrix (not + * position-specific) + * @param subjectProbArray is an array containing the probability of + * occurrence of each residue in the subject + * @param queryProbArray is an array containing the probability of + * occurrence of each residue in the query + * @param scoreProb is an array of probabilities for each score + * that is to be used as a field in return_sfp + * @return 0 on success, -1 on out-of-memory + */ +static int +s_GetMatrixScoreProbs(double **scoreProb, int * obs_min, int * obs_max, + int **matrix, const double *subjectProbArray, + const double *queryProbArray) +{ + int aa; /* index of an amino-acid in the 20 letter + alphabet */ + int irow, jcol; /* matrix row and column indices */ + double * sprob; /* a pointer to the element of the score + probabilities array that represents the + probability of the score 0*/ + int minScore; /* smallest score in matrix; the same value as + (*obs_min). */ + int range; /* the range of scores in the matrix */ + + s_GetScoreRange(obs_min, obs_max, matrix, COMPO_PROTEIN_ALPHABET); + minScore = *obs_min; + range = *obs_max - *obs_min + 1; + *scoreProb = calloc(range, sizeof(double)); + if (*scoreProb == NULL) { + return -1; + } + sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/ + for (irow = 0; irow < COMPO_PROTEIN_ALPHABET; irow++) { + for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) { + jcol = trueCharPositions[aa]; + if (matrix[irow][jcol] >= minScore) { + sprob[matrix[irow][jcol]] += + (queryProbArray[irow] * subjectProbArray[jcol]); + } + } + } + return 0; +} + + +/** + * Compute the score probabilities for a given amino acid position-specific + * substitution matrix in the context of a given set of subject amino + * acid frequencies. 
+ * + * @param *obs_min the smallest score in the score matrix [out] + * @param *obs_max the largest score in the score matrix [out] + * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1), + * of score probabilities, where (*scoreProb)[0] is + * the probability for score *obs_min. + * @param matrix a position-specific amino-acid substitution matrix. + * @param rows the number of rows in matrix. + * @param subjectProbArray is an array containing the probability of + * occurrence of each residue in the subject + * @param queryProbArray is an array containing the probability of + * occurrence of each residue in the query + * @param scoreProb is an array of probabilities for each score + * that is to be used as a field in return_sfp + * @return 0 on success, -1 on out-of-memory + */ +static int +s_GetPssmScoreProbs(double ** scoreProb, int * obs_min, int * obs_max, + int **matrix, int rows, + const double *subjectProbArray) +{ + int aa; /* index of an amino-acid in the 20 letter + alphabet */ + int irow, jcol; /* matrix row and column indices */ + double onePosFrac; /* matrix length as a double*/ + double * sprob; /* pointer to the element of the score + * probabilities array the represents the + * probability of zero */ + int minScore; /* smallest score in matrix; the same value as + (*obs_min). 
*/ + int range; /* the range of scores in the matrix */ + + s_GetScoreRange(obs_min, obs_max, matrix, rows); + minScore = *obs_min; + range = *obs_max - *obs_min + 1; + *scoreProb = calloc(range, sizeof(double)); + if (*scoreProb == NULL) { + return -1; + } + sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/ + onePosFrac = 1.0/ ((double) rows); + for (irow = 0; irow < rows; irow++) { + for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) { + jcol = trueCharPositions[aa]; + if (matrix[irow][jcol] >= minScore) { + sprob[matrix[irow][jcol]] += + onePosFrac * subjectProbArray[jcol]; + } + } + } + return 0; +} + + +/** + * Compute an integer-valued amino-acid score matrix from a set of + * score frequencies. + * + * @param matrix the preallocated matrix + * @param matrixName the score frequencies + * @param Lambda the desired scale of the matrix + */ +void +Blast_Int4MatrixFromFreq(Int4 **matrix, int alphsize, + double ** freq, double Lambda) +{ + int i,j; /*loop indices*/ + + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { + if (0.0 == freq[i][j]) { + matrix[i][j] = COMPO_SCORE_MIN; + } else { + double temp = log(freq[i][j])/Lambda; + matrix[i][j] = Nint(temp); + } + } + } +} + + +/** + * Fill in one row of a score matrix; used by the s_ScaleMatrix + * routine to fill in all rows. (@sa s_ScaleMatrix) + * + * @param matrixRow a row of the matrix to be filled in [out]. + * @param startMatrixRow a row of rounded amino acid substitution scores in + * standard context [in] + * @param freqRatiosRow a row of frequency ratios of starting matrix [in] + * @param Lambda a Karlin-Altschul parameter. 
[in] + * @param LambdaRatio ratio of correct Lambda to it's original value [in] + */ +static void +s_ScaleMatrixRow(int *matrixRow, const int *startMatrixRow, + const double *freqRatiosRow, + double Lambda, double LambdaRatio) +{ + int c; /* column index */ + double temp; /* intermediate term in computation*/ + + for (c = 0; c < COMPO_PROTEIN_ALPHABET; c++) { + switch (c) { + case eGapChar: case eXchar: case eSelenocysteine: case eStopChar: + /* Don't scale these nonstandard residues */ + matrixRow[c] = startMatrixRow[c]; + break; + + default: + if (0.0 == freqRatiosRow[c]) { + matrixRow[c] = COMPO_SCORE_MIN; + } else { + temp = log(freqRatiosRow[c]); + temp = temp/Lambda; + temp = temp * LambdaRatio; + matrixRow[c] = Nint(temp); + } /* end else 0.0 != freqRatiosRow[c] */ + } /* end switch(c) */ + } /* end for c */ +} + + +/** Free memory associated with a Blast_MatrixInfo object */ +void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss) +{ + if (*ss != NULL) { + free((*ss)->matrixName); + Nlm_Int4MatrixFree(&(*ss)->startMatrix); + Nlm_DenseMatrixFree(&(*ss)->startFreqRatios); + free(*ss); + *ss = NULL; + } +} + + +/** Create a Blast_MatrixInfo object + * + * @param rows the number of rows in the matrix, should be + * COMPO_PROTEIN_ALPHABET unless the matrix is position + * based, in which case it is the query length + * @param positionBased is this matrix position-based? 
+ */ +Blast_MatrixInfo * +Blast_MatrixInfoNew(int rows, int positionBased) +{ + int i; /* loop index */ + Blast_MatrixInfo * ss = malloc(sizeof(Blast_MatrixInfo)); + if (ss != NULL) { + ss->rows = rows; + ss->positionBased = positionBased; + + ss->matrixName = NULL; + ss->startMatrix = NULL; + ss->startFreqRatios = NULL; + + ss->startMatrix = Nlm_Int4MatrixNew(rows + 1, COMPO_PROTEIN_ALPHABET); + if (ss->startMatrix == NULL) + goto error_return; + ss->startFreqRatios = Nlm_DenseMatrixNew(rows + 1, COMPO_PROTEIN_ALPHABET); + if (ss->startFreqRatios == NULL) + goto error_return; + for (i = 0; i < COMPO_PROTEIN_ALPHABET; i++) { + ss->startMatrix[rows][i] = COMPO_SCORE_MIN; + ss->startFreqRatios[rows][i] = (double) COMPO_SCORE_MIN; + } + + } + goto normal_return; +error_return: + Blast_MatrixInfoFree(&ss); +normal_return: + return ss; +} + + +/** + * Fill in the entries of a score matrix with compositionally adjusted + * values. (@sa Blast_CompositionBasedStats) + * + * @param matrix preallocated matrix to be filled in [out] + * @param ss data used to compute matrix scores + * @param LambdaRatio ratio of correct Lambda to its value in + * standard context. + */ +static void +s_ScaleMatrix(int **matrix, const Blast_MatrixInfo * ss, + double LambdaRatio) +{ + int p; /* index over matrix rows */ + + if (ss->positionBased) { + /* scale the matrix rows unconditionally */ + for (p = 0; p < ss->rows; p++) { + s_ScaleMatrixRow(matrix[p], ss->startMatrix[p], + ss->startFreqRatios[p], + ss->ungappedLambda, LambdaRatio); + } + } else { + /* Scale only the rows for true amino acids and ambiguous residues + * B and Z. */ + for (p = 0; p < COMPO_PROTEIN_ALPHABET; p++) { + switch (p) { + case eGapChar: case eXchar: case eSelenocysteine: case eStopChar: + /* Do not scale the scores of nonstandard amino acids. 
*/ + memcpy(matrix[p], ss->startMatrix[p], + COMPO_PROTEIN_ALPHABET * sizeof(int)); + break; + default: + s_ScaleMatrixRow(matrix[p], ss->startMatrix[p], + ss->startFreqRatios[p], + ss->ungappedLambda, LambdaRatio); + } + } + } +} + + +/** LambdaRatioLowerBound is used when the expected score is too large + * causing impalaKarlinLambdaNR to give a Lambda estimate that + * is too small, or to fail entirely returning -1*/ +#define LambdaRatioLowerBound 0.5 + + +/** + * Use composition-based statistics to adjust the scoring matrix, as + * described in + * + * Schaffer, A.A., Aravaind, L., Madden, T.L., Shavirin, S., + * Spouge, J.L., Wolf, Y.I., Koonin, E.V., and Altschul, S.F. + * (2001), "Improving the accuracy of PSI-BLAST protein database + * searches with composition-based statistics and other + * refinements", Nucleic Acids Res. 29:2994-3005. + * + * @param matrix a scoring matrix to be adjusted [out] + * @param *LambdaRatio the ratio of the corrected lambda to the + * original lambda [out] + * @param ss data used to compute matrix scores + * + * @param queryProb amino acid probabilities in the query + * @param resProb amino acid probabilities in the subject + * @param calc_lambda a function that can calculate the + * statistical parameter Lambda from a set of + * score frequencies. 
+ * @return 0 on success, -1 on out of memory + */ +int +Blast_CompositionBasedStats(int ** matrix, double * LambdaRatio, + const Blast_MatrixInfo * ss, + const double queryProb[], const double resProb[], + double (*calc_lambda)(double*,int,int,double)) +{ + double correctUngappedLambda; /* new value of ungapped lambda */ + int obs_min, obs_max; + double *scoreArray; + int out_of_memory; + + if (ss->positionBased) { + out_of_memory = + s_GetPssmScoreProbs(&scoreArray, &obs_min, &obs_max, + ss->startMatrix, ss->rows, resProb); + } else { + out_of_memory = + s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max, + ss->startMatrix, resProb, queryProb); + } + if (out_of_memory) + return -1; + correctUngappedLambda = + calc_lambda(scoreArray, obs_min, obs_max, ss->ungappedLambda); + + /* calc_lambda will return -1 in the case where the + * expected score is >=0; however, because of the MAX statement 3 + * lines below, LambdaRatio should always be > 0; the succeeding + * test is retained as a vestige, in case one wishes to remove the + * MAX statement and allow LambdaRatio to take on the error value + * -1 */ + *LambdaRatio = correctUngappedLambda / ss->ungappedLambda; + *LambdaRatio = MIN(1, *LambdaRatio); + *LambdaRatio = MAX(*LambdaRatio, LambdaRatioLowerBound); + + if (*LambdaRatio > 0) { + s_ScaleMatrix(matrix, ss, *LambdaRatio); + } + free(scoreArray); + + return 0; +} + + +/** + * Compute the amino acid composition of a sequence. 
+ * + * @param composition the computed composition + * @param sequence a sequence of amino acids + * @param length length of the sequence + */ +void +Blast_ReadAaComposition(Blast_AminoAcidComposition * composition, + const Uint1 * sequence, int length) +{ + int frequency[COMPO_PROTEIN_ALPHABET]; /*frequency of each letter*/ + int i; /*index*/ + int localLength; /*reduce for X characters*/ + double * resProb = composition->prob; + + localLength = length; + for (i = 0; i < COMPO_PROTEIN_ALPHABET; i++) + frequency[i] = 0; + for (i = 0; i < length; i++) { + if (eXchar != sequence[i]) + frequency[sequence[i]]++; + else + localLength--; + } + for (i = 0; i < COMPO_PROTEIN_ALPHABET; i++) { + if (frequency[i] == 0) + resProb[i] = 0.0; + else { + double freq = frequency[i]; + resProb[i] = freq / (double) localLength; + } + } + composition->numTrueAminoAcids = localLength; +} + + +/** + * Get the range of a sequence to be included when computing a + * composition. This function is used for translated sequences, where + * the range to use when computing a composition is not the whole + * sequence, but is rather a range about an existing alignment. + * + * @param *pleft, *pright left and right endpoint of the range + * @param subject_data data from a translated sequence + * @param length length of subject_data + * @param start, finish start and finish (one past the end) of a + * existing alignment + */ +void +Blast_GetCompositionRange(int * pleft, int * pright, + const Uint1 * subject_data, int length, + int start, int finish) +{ + int i; /* iteration index */ + int left, right; + + left = start; + /* Search leftward for a StopChar */ + for (i = left; i > 0; i--) { + if (subject_data[i - 1] == eStopChar) { + /* We have found a StopChar. Unless the StopChar is + * too close to the start of the subject region of the + * HSP, */ + if (i + kCompositionMargin < left) { + /* reset the left endpoint. 
*/ + left = i + kCompositionMargin; + } + break; + } + } + if (i == 0) { + /* No stop codon was found to the left. */ + left = 0; + } + right = finish; + /* Search rightward for a StopChar */ + for (i = right; i < length; i++) { + if (subject_data[i] == eStopChar) { + /* We have found a StopChar. Unless the StopChar is + * too close to the end of the subject region of the + * HSP, */ + if (i - kCompositionMargin > right) { + /* reset the right endpoint */ + right = i - kCompositionMargin; + } + break; + } + } + if (i == length) { + /* No stop codon was found to the right. */ + right = length; + } + *pleft = left; *pright = right; +} + + +/** Free memory associated with a record of type + * Blast_CompositionWorkspace. */ +void +Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord) +{ + Blast_CompositionWorkspace * NRrecord = *pNRrecord; + + if (NRrecord != NULL) { + free(NRrecord->first_standard_freq); + free(NRrecord->second_standard_freq); + free(NRrecord->first_seq_freq); + free(NRrecord->second_seq_freq); + free(NRrecord->first_seq_freq_wpseudo); + free(NRrecord->second_seq_freq_wpseudo); + + Nlm_DenseMatrixFree(&NRrecord->score_old); + Nlm_DenseMatrixFree(&NRrecord->score_final); + Nlm_DenseMatrixFree(&NRrecord->mat_final); + Nlm_DenseMatrixFree(&NRrecord->mat_b); + + free(NRrecord); + } + pNRrecord = NULL; +} + + +/** Create a new Blast_CompositionWorkspace object, allocating memory + * for all its component arrays. 
*/ +Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew() +{ + Blast_CompositionWorkspace * NRrecord; /* record to allocate + and return */ + int i; /* loop index */ + + NRrecord = (Blast_CompositionWorkspace *) + malloc(sizeof(Blast_CompositionWorkspace)); + if (NRrecord == NULL) goto error_return; + + NRrecord->first_standard_freq = NULL; + NRrecord->second_standard_freq = NULL; + NRrecord->first_seq_freq = NULL; + NRrecord->second_seq_freq = NULL; + NRrecord->first_seq_freq_wpseudo = NULL; + NRrecord->second_seq_freq_wpseudo = NULL; + NRrecord->score_old = NULL; + NRrecord->score_final = NULL; + NRrecord->mat_final = NULL; + NRrecord->mat_b = NULL; + + NRrecord->first_standard_freq = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->first_standard_freq == NULL) goto error_return; + + NRrecord->second_standard_freq = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->second_standard_freq == NULL) goto error_return; + + NRrecord->first_seq_freq = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->first_seq_freq == NULL) goto error_return; + + NRrecord->second_seq_freq = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->second_seq_freq == NULL) goto error_return; + + NRrecord->first_seq_freq_wpseudo = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->first_seq_freq_wpseudo == NULL) goto error_return; + + NRrecord->second_seq_freq_wpseudo = + (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double)); + if (NRrecord->second_seq_freq_wpseudo == NULL) goto error_return; + + NRrecord->score_old = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, + COMPO_NUM_TRUE_AA); + if (NRrecord->score_old == NULL) goto error_return; + + NRrecord->score_final = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, + COMPO_NUM_TRUE_AA); + if (NRrecord->score_final == NULL) goto error_return; + + NRrecord->mat_final = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, + COMPO_NUM_TRUE_AA); + if 
(NRrecord->mat_final == NULL) goto error_return; + + NRrecord->mat_b = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA, + COMPO_NUM_TRUE_AA); + if (NRrecord->mat_b == NULL) goto error_return; + + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + NRrecord->first_standard_freq[i] = + NRrecord->second_standard_freq[i] = 0.0; + NRrecord->first_seq_freq[i] = NRrecord->second_seq_freq[i] = 0.0; + NRrecord->first_seq_freq_wpseudo[i] = + NRrecord->second_seq_freq_wpseudo[i] = 0.0; + } + + goto normal_return; +error_return: + Blast_CompositionWorkspaceFree(&NRrecord); +normal_return: + return NRrecord; +} + + +/** Initialize the fields of a Blast_CompositionWorkspace for a specific + * underlying scoring matrix. */ +int +Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord, + const char *matrixName) +{ + double re_o_implicit = 0.0; /* implicit relative entropy of + starting matrix */ + int i, j; /* loop indices */ + + if (0 == Blast_GetJointProbsForMatrix(NRrecord->mat_b, + NRrecord->first_standard_freq, + NRrecord->second_standard_freq, + matrixName)) { + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + re_o_implicit += + NRrecord->mat_b[i][j] * log(NRrecord->mat_b[i][j] / + NRrecord-> + first_standard_freq[i] / + NRrecord-> + second_standard_freq[j]); + NRrecord->score_old[i][j] = + log(NRrecord->mat_b[i][j] / + NRrecord->first_standard_freq[i] / + NRrecord->second_standard_freq[j]); + } + } + NRrecord->RE_o_implicit = re_o_implicit; + return 0; + } else { + fprintf(stderr, + "Matrix %s not currently supported for RE based adjustment\n", + matrixName); + return -1; + } +} + + +/*compute Lambda and if flag set according return re_o_newcontext, + otherwise return 0.0, also test for the possibility of average + score >= 0*/ +static double +Blast_CalcLambdaForComposition(Blast_CompositionWorkspace * NRrecord, + int compute_re, + double * lambdaToReturn) +{ + int iteration_count; /* counter for number of iterations of + Newton's method */ 
+ int i, j; /* loop indices */ + double sum; /* used to compute the sum for estimating + lambda */ + double lambda_error; /* error when estimating lambda */ + double lambda; /* scale parameter of the Extreme Value + Distribution of scores */ + double ave_score; /* average score in new context */ + double slope; /* used to compute the derivative when + estimating lambda */ + double re_to_return; /* relative entropy if using old joint + probabilities*/ + + lambda_error = 1.0; + *lambdaToReturn = 1.0; + re_to_return = 0.0; + + if (eRelEntropyOldMatrixNewContext == NRrecord->flag) { + ave_score = 0.0; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + ave_score += + NRrecord->score_old[i][j] * NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } + } + } + if ((eRelEntropyOldMatrixNewContext == NRrecord->flag) && + (ave_score >= (-SCORE_BOUND))) { + /* fall back to no constraint mode when average score becomes + global alignment-like */ + NRrecord->flag = eUnconstrainedRelEntropy; + + printf("scoring matrix has nonnegative average score %12.8f," + " reset to mode 0 \n", ave_score); + } + /* Need to find the relative entropy here. 
*/ + if (compute_re) { + slope = 0.0; + lambda = INITIAL_LAMBDA; + while(slope <= LAMBDA_ERROR_TOLERANCE) { + /* making sure iteration starting point belongs to nontrivial + fixed point */ + lambda = 2.0 * lambda; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + if (eRelEntropyOldMatrixNewContext == NRrecord->flag) { + slope += + NRrecord->score_old[i][j] * + exp(NRrecord->score_old[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } else { + slope += + NRrecord->score_final[i][j] * + exp(NRrecord->score_final[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } + } + } + } + iteration_count = 0; + while ((fabs(lambda_error) > LAMBDA_ERROR_TOLERANCE) && + (iteration_count < LAMBDA_ITERATION_LIMIT)) { + sum = 0.0; + slope = 0.0; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + if (eRelEntropyOldMatrixNewContext == NRrecord->flag) { + sum += + exp(NRrecord->score_old[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + slope += + NRrecord->score_old[i][j] * + exp(NRrecord->score_old[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } else { + if(eUnconstrainedRelEntropy == NRrecord->flag) { + sum += + exp(NRrecord->score_final[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + slope += + NRrecord->score_final[i][j] * + exp(NRrecord->score_final[i][j] * lambda) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } + } + } + } + lambda_error = (1.0 - sum) / slope; + lambda = lambda + LAMBDA_STEP_FRACTION * lambda_error; + iteration_count++; + } + *lambdaToReturn = lambda; + printf("Lambda iteration count %d\n", iteration_count ); + printf("the lambda value = %f \t sum of jp = %12.10f \n", lambda, + sum); + re_to_return = 0.0; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) 
{ + if (eRelEntropyOldMatrixNewContext == NRrecord->flag) { + double scaledScore = lambda * NRrecord->score_old[i][j]; + re_to_return += scaledScore * exp(scaledScore) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } else { + if (eUnconstrainedRelEntropy == NRrecord->flag) { + double scaledScore = + lambda * NRrecord->score_final[i][j]; + + re_to_return += scaledScore * exp(scaledScore) * + NRrecord->first_seq_freq[i] * + NRrecord->second_seq_freq[j]; + } + } + } + } + } + return re_to_return; +} + + +/** + * Use compositional score matrix adjustment, as described in + * + * Altschul, Stephen F., John C. Wootton, E. Michael Gertz, Richa + * Agarwala, Aleksandr Morgulis, Alejandro A. Schaffer, and Yi-Kuo + * Yu (2005) "Protein database searches using compositionally + * adjusted substitution matrices", FEBS J. 272:5101-5109. + * + * to optimize a score matrix to a given set of letter frequencies. + * + * @param length1 adjusted length (not counting X) of the first + * sequence + * @param length2 adjusted length of the second sequence + * @param probArray1 letter probabilities for the first sequence, + * in the 20 letter amino-acid alphabet + * @param probArray2 letter probabilities for the second sequence + * @param pseudocounts number of pseudocounts to add the the + * probabilities for each sequence, before optimizing + * the scores. + * @param specifiedRE a relative entropy that might (subject to + * fields in NRrecord) be used to as a constraint + * of the optimization problem + * @param NRrecord a Blast_CompositionWorkspace that contains + * fields used for the composition adjustment and + * that will hold the output. 
+ * @param lambdaComputed the new computed value of lambda + * + * @return 0 on success, 1 on failure to converge, -1 for out-of-memory + */ +int +Blast_CompositionMatrixAdj(int length1, + int length2, + const double * probArray1, + const double * probArray2, + int pseudocounts, + double specifiedRE, + Blast_CompositionWorkspace * NRrecord, + double * lambdaComputed) +{ + int i; /* loop indices */ + double re_o_newcontext = 0.0; /* relative entropy implied by + input single sequence + probabilities */ + static int total_iterations = 0; /* total iterations among all + calls to + compute_new_score_matrix */ + int new_iterations = 0; /* number of iterations in the most + recent call to + compute_new_score_matrix */ + static int max_iterations = 0; /* maximum number of iterations + observed in a call to + compute_new_score_matrix */ + int status; /* status code for operations that may + fail */ + /*Is the relative entropy constrained? Behaves as boolean for now*/ + int constrain_rel_entropy = + eUnconstrainedRelEntropy != NRrecord->flag; + + Blast_ApplyPseudocounts(NRrecord->first_seq_freq_wpseudo, + NRrecord->first_seq_freq, probArray1, length1, + NRrecord->first_standard_freq, pseudocounts); + /* plug in frequencies for second sequence, will be the matching + sequence in BLAST */ + Blast_ApplyPseudocounts(NRrecord->second_seq_freq_wpseudo, + NRrecord->second_seq_freq, probArray2, length2, + NRrecord->second_standard_freq, pseudocounts); + *lambdaComputed = 1.0; + re_o_newcontext = + Blast_CalcLambdaForComposition( + NRrecord, (NRrecord->flag == eRelEntropyOldMatrixNewContext), + lambdaComputed); + switch (NRrecord->flag) { + case eUnconstrainedRelEntropy: + /* Initialize to a arbitrary value; it won't be used */ + NRrecord->RE_final = 0.0; + break; + case eRelEntropyOldMatrixNewContext: + NRrecord->RE_final = re_o_newcontext; + break; + case eRelEntropyOldMatrixOldContext: + NRrecord->RE_final = NRrecord->RE_o_implicit; + break; + case eUserSpecifiedRelEntropy: + 
NRrecord->RE_final = specifiedRE; + break; + default: /* I assert that we can't get here */ + fprintf(stderr, "Unknown flag for setting relative entropy" + "in composition matrix adjustment"); + exit(1); + } + status = + Blast_OptimizeTargetFrequencies(&NRrecord->mat_final[0][0], + COMPO_NUM_TRUE_AA, + &new_iterations, + &NRrecord->mat_b[0][0], + NRrecord->first_seq_freq_wpseudo, + NRrecord->second_seq_freq_wpseudo, + constrain_rel_entropy, + NRrecord->RE_final, + kCompoAdjustErrTolerance, + kCompoAdjustIterationLimit); + total_iterations += new_iterations; + if (new_iterations > max_iterations) + max_iterations = new_iterations; + + if (status == 0) { + Blast_ScoreMatrixFromFreq(NRrecord->score_final, + COMPO_NUM_TRUE_AA, + NRrecord->mat_final, + NRrecord->first_seq_freq_wpseudo, + NRrecord->second_seq_freq_wpseudo); + if (NRrecord->flag == eUnconstrainedRelEntropy) { + /* Compute the unconstrained relative entropy */ + (void) Blast_CalcLambdaForComposition(NRrecord, 1, lambdaComputed); + } + /* success if and only if the computed lambda is positive */ + status = (*lambdaComputed > 0) ? 0 : 1; + } else if (status == -1) { + /* out of memory */ + status = -1; + } else { + /* Iteration did not converge */ + fprintf(stderr, "bad probabilities from sequence 1, length %d\n", + length1); + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) + fprintf(stderr, "%15.12f\n", probArray1[i]); + fprintf(stderr, "bad probabilities from sequence 2, length %d\n", + length2); + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) + fprintf(stderr, "%15.12f\n", probArray2[i]); + fflush(stderr); + status = 1; + } + return status; +} + + +/** + * Compute a compositionally adjusted scoring matrix. 
+ * + * @param matrix the adjusted matrix + * @param query_composition composition of the query sequence + * @param queryLength length of the query sequence + * @param subject_composition composition of the subject (database) + * sequence + * @param queryLength length of the subject sequence + * @param matrixInfo information about the underlying, + * non-adjusted, scoring matrix. + * @param RE_rule the rule to use for computing the scoring + * matrix + * @param RE_pseudocounts the number of pseudocounts to use in some + * rules of composition adjustment + * @param NRrecord workspace used to perform compositional + * adjustment + * @param *whichMode which mode of compositional adjustment was + * actually used + * @calc_lambda a function that can calculate the statistical + * parameter Lambda from a set of score + * frequencies. + * @return 0 for success, 1 for failure to converge, + * -1 for out of memory + */ +int +Blast_AdjustScores(int ** matrix, + const Blast_AminoAcidComposition * query_composition, + int queryLength, + const Blast_AminoAcidComposition * subject_composition, + int subjectLength, + const Blast_MatrixInfo * matrixInfo, + int RE_rule, + int RE_pseudocounts, + Blast_CompositionWorkspace *NRrecord, + ECompoAdjustModes *whichMode, + double calc_lambda(double *,int,int,double)) +{ + double LambdaRatio; /* the ratio of the corrected + lambda to the original lambda */ + + if (matrixInfo->positionBased || RE_rule == 0) { + /* Use old-style composition-based statistics unconditionally. */ + *whichMode = eCompoKeepOldMatrix; + return Blast_CompositionBasedStats(matrix, &LambdaRatio, + matrixInfo, + query_composition->prob, + subject_composition->prob, + calc_lambda); + } else { + /* else call Yi-Kuo's code to choose mode for matrix adjustment. */ + + /* The next two arrays are letter probabilities of query and + * match in 20 letter ARND... alphabet. 
*/ + double permutedQueryProbs[COMPO_NUM_TRUE_AA]; + double permutedMatchProbs[COMPO_NUM_TRUE_AA]; + + s_GatherLetterProbs(permutedQueryProbs, query_composition->prob); + s_GatherLetterProbs(permutedMatchProbs, subject_composition->prob); + + *whichMode = + Blast_ChooseCompoAdjustMode(queryLength, subjectLength, + permutedQueryProbs, + permutedMatchProbs, + matrixInfo->matrixName, + RE_rule-1); + /* compute and plug in new matrix here */ + if (eCompoKeepOldMatrix == *whichMode) { + /* Yi-Kuo's code chose to use composition-based stats */ + return Blast_CompositionBasedStats(matrix, &LambdaRatio, + matrixInfo, + query_composition->prob, + subject_composition->prob, + calc_lambda); + } else { + /* else use compositionally adjusted scoring matrices */ + double correctUngappedLambda; /* new value of ungapped lambda */ + double ** REscoreMatrix = NULL; + int status = 0; + REscoreMatrix = Nlm_DenseMatrixNew(COMPO_PROTEIN_ALPHABET, + COMPO_PROTEIN_ALPHABET); + if (REscoreMatrix != NULL) { + NRrecord->flag = *whichMode; + status = + Blast_CompositionMatrixAdj(query_composition-> + numTrueAminoAcids, + subject_composition-> + numTrueAminoAcids, + permutedQueryProbs, + permutedMatchProbs, + RE_pseudocounts, + kFixedReBlosum62, + NRrecord, + &correctUngappedLambda); + if (status == 0) { + LambdaRatio = + correctUngappedLambda / matrixInfo->ungappedLambda; + if (LambdaRatio <= 0) { + status = 1; + } + } + if (status == 0) { + s_ScatterScores(REscoreMatrix, LambdaRatio, + NRrecord->score_final); + /*scale matrix in floating point*/ + Blast_SetNonstandardAaScores(REscoreMatrix, + matrixInfo->startMatrix); + s_RoundScoreMatrix(matrix, REscoreMatrix, + COMPO_PROTEIN_ALPHABET); + } + Nlm_DenseMatrixFree(&REscoreMatrix); + } + return status; + } /* end else use compositionally adjusted scoring matrices */ + } /* end else call Yi-Kuo's code to choose mode for matrix adjustment. 
*/ +} diff --git a/algo/blast/composition_adjustment/composition_adjustment.h b/algo/blast/composition_adjustment/composition_adjustment.h new file mode 100644 index 00000000..1d38d0f3 --- /dev/null +++ b/algo/blast/composition_adjustment/composition_adjustment.h @@ -0,0 +1,168 @@ +/* $Id: composition_adjustment.h,v 1.6 2005/12/01 13:54:04 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file composition_adjustment.h + * @author E. 
Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu + * + * Definitions used in compositional score matrix adjustment + */ + +#ifndef __COMPOSITION_ADJUSTMENT__ +#define __COMPOSITION_ADJUSTMENT__ + +#include <algo/blast/core/blast_export.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/compo_mode_condition.h> + +/* Number of standard amino acids */ +#define COMPO_NUM_TRUE_AA 20 + +#ifdef __cplusplus +extern "C" { +#endif + +/* Some characters in the 26 letter NCBIstdaa alphabet, including + ambiguity characters, selenocysteine and the stop character. */ +enum { eGapChar = 0, eBchar = 2, eDchar = 4, eEchar = 5, eNchar = 13, + eQchar = 15, eXchar = 21, eZchar = 23, eSelenocysteine = 24, + eStopChar = 25}; + +/** + * Represents the composition of an amino-acid sequence */ +struct Blast_AminoAcidComposition { + double prob[26]; /**< probabilities of each amino acid, including + nonstandard amino acids */ + int numTrueAminoAcids; /**< number of true amino acids in the sequence, + omitting X characters */ +}; +typedef struct Blast_AminoAcidComposition Blast_AminoAcidComposition; + +NCBI_XBLAST_EXPORT +void +Blast_ReadAaComposition(Blast_AminoAcidComposition * composition, + const Uint1 * sequence, int length); + +struct Blast_MatrixInfo { + char * matrixName; /**< name of the matrix */ + Int4 **startMatrix; /**< Rescaled values of the original matrix */ + double **startFreqRatios; /**< frequency ratios used to calculate matrix + scores */ + int rows; /**< the number of rows in the scoring + matrix. 
*/ + int positionBased; /**< is the matrix position-based */ + double ungappedLambda; /**< ungapped Lambda value for this matrix + in standard context */ +}; +typedef struct Blast_MatrixInfo Blast_MatrixInfo; + +NCBI_XBLAST_EXPORT +Blast_MatrixInfo * Blast_MatrixInfoNew(int rows, int positionBased); + +NCBI_XBLAST_EXPORT +void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss); + +/** Work arrays used to perform composition-based matrix adjustment */ +typedef struct Blast_CompositionWorkspace { + int flag; /**< determines which of the optimization + problems are solved */ + double ** mat_b; /**< joint probabilities for the matrix in + standard context */ + double ** score_old; /**< score of the matrix in standard context + with scale Lambda == 1 */ + double ** mat_final; /**< optimized target frequencies */ + double ** score_final; /**< optimized score matrix */ + + double RE_final; /**< the relative entropy used, either + re_o_implicit or re_o_newcontext */ + double RE_o_implicit; /**< used for eRelEntropyOldMatrixOldContext + mode */ + + double * first_seq_freq; /**< freq vector of first seq */ + double * second_seq_freq; /**< freq. vector for the second. */ + double * first_standard_freq; /**< background freq vector of first + seq using matrix */ + double * second_standard_freq; /**< background freq vector for + the second. */ + double * first_seq_freq_wpseudo; /**< freq vector of first seq + w/pseudocounts */ + double * second_seq_freq_wpseudo; /**< freq. 
vector for the + second seq w/pseudocounts */ +} Blast_CompositionWorkspace; + +NCBI_XBLAST_EXPORT +Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew(); + +NCBI_XBLAST_EXPORT +int Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord, + const char *matrixName); + +NCBI_XBLAST_EXPORT +void Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** NRrecord); + +NCBI_XBLAST_EXPORT +void Blast_GetCompositionRange(int * pleft, int * pright, + const Uint1 * subject_data, int length, + int start, int finish); +NCBI_XBLAST_EXPORT +int +Blast_CompositionBasedStats(Int4 ** matrix, double * LambdaRatio, + const Blast_MatrixInfo * ss, + const double queryProb[], const double resProb[], + double (*calc_lambda)(double*,int,int,double)); + +NCBI_XBLAST_EXPORT +int Blast_CompositionMatrixAdj(int length1, int length2, + const double *probArray1, + const double *probArray2, + int pseudocounts, double specifiedRE, + Blast_CompositionWorkspace * NRrecord, + double * lambdaComputed); + +NCBI_XBLAST_EXPORT +int +Blast_AdjustScores(Int4 ** matrix, + const Blast_AminoAcidComposition * query_composition, + int queryLength, + const Blast_AminoAcidComposition * subject_composition, + int subjectLength, + const Blast_MatrixInfo * matrixInfo, + int RE_rule, + int RE_pseudocounts, + Blast_CompositionWorkspace *NRrecord, + ECompoAdjustModes *whichMode, + double calc_lambda(double *,int,int,double)); + +NCBI_XBLAST_EXPORT +void Blast_Int4MatrixFromFreq(Int4 **matrix, int alphsize, + double ** freq, double Lambda); + +NCBI_XBLAST_EXPORT +double Blast_GetRelativeEntropy(const double A[], const double B[]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/blast/composition_adjustment/composition_constants.h b/algo/blast/composition_adjustment/composition_constants.h new file mode 100644 index 00000000..48e8152b --- /dev/null +++ b/algo/blast/composition_adjustment/composition_constants.h @@ -0,0 +1,60 @@ +/* $Id: composition_constants.h,v 1.1 2005/12/01 
13:52:20 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file composition_constants.h + * @author E. 
Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu
 *
 * Constants used in compositional score matrix adjustment
 */


#ifndef __COMPOSITION_CONSTANTS__
#define __COMPOSITION_CONSTANTS__

#include <algo/blast/core/ncbi_std.h>

/** Number of standard amino acids */
#define COMPO_NUM_TRUE_AA 20

/** Number of amino acids, including nonstandard ones */
#define COMPO_PROTEIN_ALPHABET 26

/** Minimum score in a matrix */
#define COMPO_SCORE_MIN INT2_MIN

/* A collection of constants that specify all permissible
 * modes of composition adjustment.  The last four select how
 * Blast_CompositionMatrixAdj sets the relative entropy it hands to
 * the optimizer. */
enum ECompoAdjustModes {
    eNoCompositionAdjustment       = (-1),
    /* keep the matrix; Blast_AdjustScores then uses
     * Blast_CompositionBasedStats */
    eCompoKeepOldMatrix            = 0,
    /* optimize without constraining the relative entropy */
    eUnconstrainedRelEntropy       = 1,
    /* relative entropy of the old scores under the new sequence
     * compositions */
    eRelEntropyOldMatrixNewContext = 2,
    /* relative entropy implicit in the old matrix (RE_o_implicit) */
    eRelEntropyOldMatrixOldContext = 3,
    /* relative entropy supplied by the caller (specifiedRE) */
    eUserSpecifiedRelEntropy       = 4,
    /* number of modes from eCompoKeepOldMatrix upward */
    eNumCompoAdjustModes
};
typedef enum ECompoAdjustModes ECompoAdjustModes;

#endif
diff --git a/algo/blast/composition_adjustment/matrix_frequency_data.c b/algo/blast/composition_adjustment/matrix_frequency_data.c new file mode 100644 index 00000000..176d7f31 --- /dev/null +++ b/algo/blast/composition_adjustment/matrix_frequency_data.c @@ -0,0 +1,230 @@
/* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** @file joint_probs.c + * + * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz + * + * Joint probabilities for specific matrices. + */ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: matrix_frequency_data.c,v 1.1 2005/12/01 13:48:09 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <stdlib.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/composition_constants.h> +#include <algo/blast/composition_adjustment/matrix_frequency_data.h> + +/* bound on error for sum of probabilities*/ +static const double kProbSumTolerance = 0.000000001; + +/* Joint probabilities for BLOSUM62 */ +static double +BLOSUM62_JOINT_PROBS[COMPO_NUM_TRUE_AA][COMPO_NUM_TRUE_AA] += { + {0.021516461557, 0.002341028532, 0.001941062549, 0.002160193055, + 0.001595828537, 0.001934059173, 0.002990874959, 0.005831307116, + 0.001108651421, 0.003181451207, 0.004450432543, 0.003350994862, + 0.001330482798, 0.001634084433, 0.002159278003, 0.006261897426, + 0.003735752688, 0.000404784037, 0.001298558985, 0.005124343367}, + {0.002341028532, 0.017737158563, 0.001969132731, 0.001581985934, + 0.000393496788, 0.002483620870, 0.002678135197, 0.001721914295, + 0.001230766890, 0.001239704106, 0.002418976127, 0.006214150782, + 0.000796884039, 0.000932356719, 0.000959872904, 0.002260870847, + 0.001779897849, 0.000265310579, 0.000918577576, 0.001588408095}, + {0.001941062549, 0.001969132731, 0.014105369019, 0.003711182199, + 0.000436559586, 0.001528401416, 0.002205231268, 
0.002856026580, + 0.001423459827, 0.000986015608, 0.001369776043, 0.002436729322, + 0.000521972796, 0.000746722150, 0.000858953243, 0.003131380307, + 0.002237168191, 0.000161021675, 0.000695990541, 0.001203509685}, + {0.002160193055, 0.001581985934, 0.003711182199, 0.021213070328, + 0.000397349231, 0.001642988683, 0.004909362115, 0.002510933422, + 0.000948355160, 0.001226071189, 0.001524412852, 0.002443951825, + 0.000458902921, 0.000759393269, 0.001235481304, 0.002791458183, + 0.001886707235, 0.000161498946, 0.000595157039, 0.001320931409}, + {0.001595828537, 0.000393496788, 0.000436559586, 0.000397349231, + 0.011902428201, 0.000309689150, 0.000380965445, 0.000768969543, + 0.000229437747, 0.001092222651, 0.001570843250, 0.000500631539, + 0.000373569136, 0.000512643056, 0.000360439075, 0.001038049531, + 0.000932287369, 0.000144869300, 0.000344932387, 0.001370634611}, + {0.001934059173, 0.002483620870, 0.001528401416, 0.001642988683, + 0.000309689150, 0.007348611171, 0.003545322222, 0.001374101100, + 0.001045402587, 0.000891574240, 0.001623152279, 0.003116305001, + 0.000735592074, 0.000544610751, 0.000849940593, 0.001893917959, + 0.001381521088, 0.000228499204, 0.000674510708, 0.001174481769}, + {0.002990874959, 0.002678135197, 0.002205231268, 0.004909362115, + 0.000380965445, 0.003545322222, 0.016058942448, 0.001941788215, + 0.001359354087, 0.001208575016, 0.002010620195, 0.004137352463, + 0.000671608129, 0.000848058651, 0.001418534945, 0.002949177015, + 0.002049363253, 0.000264084965, 0.000864998825, 0.001706373779}, + {0.005831307116, 0.001721914295, 0.002856026580, 0.002510933422, + 0.000768969543, 0.001374101100, 0.001941788215, 0.037833882792, + 0.000956438296, 0.001381594180, 0.002100349645, 0.002551728599, + 0.000726329019, 0.001201930393, 0.001363538639, 0.003819521365, + 0.002185818204, 0.000406753457, 0.000831463001, 0.001832653843}, + {0.001108651421, 0.001230766890, 0.001423459827, 0.000948355160, + 0.000229437747, 0.001045402587, 0.001359354087, 
0.000956438296, + 0.009268821027, 0.000575006579, 0.000990341860, 0.001186603601, + 0.000377383962, 0.000807129053, 0.000477177871, 0.001100800912, + 0.000744015818, 0.000151511190, 0.001515361861, 0.000650302833}, + {0.003181451207, 0.001239704106, 0.000986015608, 0.001226071189, + 0.001092222651, 0.000891574240, 0.001208575016, 0.001381594180, + 0.000575006579, 0.018297094930, 0.011372374833, 0.001566332194, + 0.002471405322, 0.003035353009, 0.001002322534, 0.001716150165, + 0.002683992649, 0.000360556333, 0.001366091300, 0.011965802769}, + {0.004450432543, 0.002418976127, 0.001369776043, 0.001524412852, + 0.001570843250, 0.001623152279, 0.002010620195, 0.002100349645, + 0.000990341860, 0.011372374833, 0.037325284430, 0.002482344486, + 0.004923694031, 0.005449900864, 0.001421696216, 0.002434190706, + 0.003337092433, 0.000733421681, 0.002210504676, 0.009545821406}, + {0.003350994862, 0.006214150782, 0.002436729322, 0.002443951825, + 0.000500631539, 0.003116305001, 0.004137352463, 0.002551728599, + 0.001186603601, 0.001566332194, 0.002482344486, 0.016147683460, + 0.000901118905, 0.000950170174, 0.001578353818, 0.003104386139, + 0.002360691115, 0.000272260749, 0.000996404634, 0.001952015271}, + {0.001330482798, 0.000796884039, 0.000521972796, 0.000458902921, + 0.000373569136, 0.000735592074, 0.000671608129, 0.000726329019, + 0.000377383962, 0.002471405322, 0.004923694031, 0.000901118905, + 0.003994917914, 0.001184353682, 0.000404888644, 0.000847632455, + 0.001004584462, 0.000197602804, 0.000563431813, 0.002301832938}, + {0.001634084433, 0.000932356719, 0.000746722150, 0.000759393269, + 0.000512643056, 0.000544610751, 0.000848058651, 0.001201930393, + 0.000807129053, 0.003035353009, 0.005449900864, 0.000950170174, + 0.001184353682, 0.018273718971, 0.000525642239, 0.001195904180, + 0.001167245623, 0.000851298193, 0.004226922511, 0.002601386501}, + {0.002159278003, 0.000959872904, 0.000858953243, 0.001235481304, + 0.000360439075, 0.000849940593, 0.001418534945, 
0.001363538639, + 0.000477177871, 0.001002322534, 0.001421696216, 0.001578353818, + 0.000404888644, 0.000525642239, 0.019101516083, 0.001670397698, + 0.001352022511, 0.000141505490, 0.000450817134, 0.001257818591}, + {0.006261897426, 0.002260870847, 0.003131380307, 0.002791458183, + 0.001038049531, 0.001893917959, 0.002949177015, 0.003819521365, + 0.001100800912, 0.001716150165, 0.002434190706, 0.003104386139, + 0.000847632455, 0.001195904180, 0.001670397698, 0.012524165008, + 0.004695393160, 0.000286147117, 0.001025667373, 0.002373134246}, + {0.003735752688, 0.001779897849, 0.002237168191, 0.001886707235, + 0.000932287369, 0.001381521088, 0.002049363253, 0.002185818204, + 0.000744015818, 0.002683992649, 0.003337092433, 0.002360691115, + 0.001004584462, 0.001167245623, 0.001352022511, 0.004695393160, + 0.012524453183, 0.000287144142, 0.000940528155, 0.003660378402}, + {0.000404784037, 0.000265310579, 0.000161021675, 0.000161498946, + 0.000144869300, 0.000228499204, 0.000264084965, 0.000406753457, + 0.000151511190, 0.000360556333, 0.000733421681, 0.000272260749, + 0.000197602804, 0.000851298193, 0.000141505490, 0.000286147117, + 0.000287144142, 0.006479671265, 0.000886553355, 0.000357440337}, + {0.001298558985, 0.000918577576, 0.000695990541, 0.000595157039, + 0.000344932387, 0.000674510708, 0.000864998825, 0.000831463001, + 0.001515361861, 0.001366091300, 0.002210504676, 0.000996404634, + 0.000563431813, 0.004226922511, 0.000450817134, 0.001025667373, + 0.000940528155, 0.000886553355, 0.010185916203, 0.001555728244}, + {0.005124343367, 0.001588408095, 0.001203509685, 0.001320931409, + 0.001370634611, 0.001174481769, 0.001706373779, 0.001832653843, + 0.000650302833, 0.011965802769, 0.009545821406, 0.001952015271, + 0.002301832938, 0.002601386501, 0.001257818591, 0.002373134246, + 0.003660378402, 0.000357440337, 0.001555728244, 0.019815247974} +}; + + + +/* Background frequencies for BLOSUM62 */ +static double BLOSUM62_bg[COMPO_NUM_TRUE_AA] = + { 0.0742356686, 
0.0515874541, 0.0446395713, 0.0536092024, 0.0246865086, + 0.0342500470, 0.0543174458, 0.0741431988, 0.0262119099, 0.0679331197, + 0.0989057232, 0.0581774322, 0.0249972837, 0.0473970070, 0.0385382904, + 0.0572279733, 0.0508996546, 0.0130298868, 0.0322925130, 0.0729201182 + }; + + +int Blast_FrequencyDataIsAvailable(const char *matrix_name) +{ + return NULL != Blast_GetMatrixBackgroundFreq(matrix_name); +} + + +/** Retrieve the background letter probabilities implicitly used in + * constructing the score matrix matrix_name. */ +const double * +Blast_GetMatrixBackgroundFreq(const char *matrix_name) +{ + if (0 == strcmp(matrix_name, "BLOSUM62")) { + return BLOSUM62_bg; + } else { /* default */ + fprintf(stderr, "matrix not supported, exit now! \n"); + return NULL; + } +} + + +/** + * Get joint probabilities for the named matrix. + * + * @param probs the joint probabilities [out] + * @param row_sums sum of the values in each row of probs [out] + * @param col_sums sum of the values in each column of probs [out] + * @param matrixName the name of the matrix sought [in] + * @returns 0 if successful; -1 if the named matrix is not known. 
+ */ +int +Blast_GetJointProbsForMatrix(double ** probs, double row_sums[], + double col_sums[], const char *matrixName) +{ + double sum; /* sum of all joint probabilities -- should + be close to one */ + int i, j; /* loop indices */ + /* The joint probabilities of the selected matrix */ + double (*joint_probs)[COMPO_NUM_TRUE_AA]; + + /* Choose the matrix */ + if (0 == strcmp("BLOSUM62", matrixName)) { + joint_probs = BLOSUM62_JOINT_PROBS; + } else { + fprintf(stderr, "matrix %s is not supported " + "for RE based adjustment\n", matrixName); + return -1; + } + sum = 0.0; + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + sum += joint_probs[i][j]; + } + } + assert(fabs(sum - 1.0) < kProbSumTolerance); + /* Normalize and record the data */ + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + col_sums[j] = 0.0; + } + for (i = 0; i < COMPO_NUM_TRUE_AA; i++) { + row_sums[i] = 0.0; + for (j = 0; j < COMPO_NUM_TRUE_AA; j++) { + double probij = joint_probs[i][j]; + + probs[i][j] = probij/sum; + row_sums[i] += probij/sum; + col_sums[j] += probij/sum; + } + } + return 0; +} diff --git a/algo/blast/composition_adjustment/matrix_frequency_data.h b/algo/blast/composition_adjustment/matrix_frequency_data.h new file mode 100644 index 00000000..cd275c78 --- /dev/null +++ b/algo/blast/composition_adjustment/matrix_frequency_data.h @@ -0,0 +1,54 @@ +/* $Id: matrix_frequency_data.h,v 1.1 2005/12/01 13:52:20 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. 
+ * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file joint_probs.h + * @author Alejandro Schaffer, E. Michael Gertz + * + * Definitions used to get joint probabilities for a scoring matrix + */ +#ifndef __BLAST_JOINT_PROBS__ +#define __BLAST_JOINT_PROBS__ + +#include <algo/blast/core/blast_export.h> + +#ifdef __cplusplus +extern "C" { +#endif + +NCBI_XBLAST_EXPORT +int Blast_GetJointProbsForMatrix(double ** probs, double row_sums[], + double col_sums[], const char *matrixName); + +NCBI_XBLAST_EXPORT +const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name); + +NCBI_XBLAST_EXPORT +int Blast_FrequencyDataIsAvailable(const char *matrix_name); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/blast/composition_adjustment/nlm_linear_algebra.c b/algo/blast/composition_adjustment/nlm_linear_algebra.c index 13d89db5..ff98794d 100644 --- a/algo/blast/composition_adjustment/nlm_linear_algebra.c +++ b/algo/blast/composition_adjustment/nlm_linear_algebra.c @@ -1,5 +1,3 @@ -static char const rcsid[] = "$Id: nlm_numerics.c,v 1.1 2005/05/16 16:11:41 papadopo Exp $"; - /* =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -24,25 +22,21 @@ static char const rcsid[] = "$Id: nlm_numerics.c,v 1.1 2005/05/16 16:11:41 papad * * 
===========================================================================*/ -/***************************************************************************** - -File name: nlm_numerics.c - -Author: E. Michael Gertz - -Contents: Basic matrix and vector operations for use in conjunction - with higher-level procedures in re_newton.c - -******************************************************************************/ -/* - * $Log: nlm_numerics.c,v $ - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision +/** @file nlm_linear_algebra.c + * + * @author E. Michael Gertz * + * Basic matrix and vector operations */ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: nlm_linear_algebra.c,v 1.5 2005/12/01 13:49:43 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ -#include <ncbi.h> -#include <nlm_numerics.h> +#include <math.h> +#include <stdlib.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/nlm_linear_algebra.h> /** * Create and return a new, dense matrix. Elements of the matrix A @@ -51,22 +45,26 @@ Contents: Basic matrix and vector operations for use in conjunction * @param nrows the number of rows for the new matrix. * @param ncols the number of columns for the new matrix. 
*/ -Nlm_FloatHiPtr PNTR -Nlm_DenseMatrixNew(Int4 nrows, - Int4 ncols) +double ** +Nlm_DenseMatrixNew(int nrows, + int ncols) { - Int4 i; /* iteration index */ - Nlm_FloatHiPtr PNTR mat; /* the new matrix */ - - mat = (Nlm_FloatHiPtr PNTR) Nlm_Calloc(nrows, sizeof(Nlm_FloatHiPtr)); - - mat[0] = - (Nlm_FloatHiPtr) Nlm_MemNew((size_t) nrows * - (size_t) ncols * sizeof(Nlm_FloatHi)); - for(i = 1; i < nrows; i++) { - mat[i] = &mat[0][i * ncols]; + int i; /* iteration index */ + double ** mat; /* the new matrix */ + + mat = (double **) calloc(nrows, sizeof(double *)); + if (mat != NULL) { + mat[0] = (double *) malloc((size_t) nrows * + (size_t) ncols * sizeof(double)); + if (mat[0] != NULL) { + for (i = 1; i < nrows; i++) { + mat[i] = &mat[0][i * ncols]; + } + } else { + free(mat); + mat = NULL; + } } - return mat; } @@ -77,23 +75,27 @@ Nlm_DenseMatrixNew(Int4 nrows, * * @param n the dimension of the matrix. */ -Nlm_FloatHiPtr PNTR -Nlm_LtriangMatrixNew(Int4 n) +double ** +Nlm_LtriangMatrixNew(int n) { - Int4 i; /* iteration index */ - Nlm_FloatHiPtr PNTR L; /* the new, lower triangular matrix */ + int i; /* iteration index */ + double ** L; /* the new, lower triangular matrix */ size_t nelts; /* the number of elements in the matrix */ - nelts = ((size_t) n * (n + 1))/2; - L = (Nlm_FloatHiPtr PNTR) Nlm_Calloc(n, sizeof(Nlm_FloatHi *)); - L[0] = (Nlm_FloatHiPtr) Nlm_MemNew(nelts * sizeof(Nlm_FloatHi) ); - - for( i = 1; i < n; i++ ) { - L[i] = L[i - 1] + i; + L = (double**) calloc(n, sizeof(double *)); + if (L != NULL) { + L[0] = (double*) malloc(nelts * sizeof(double)); + if (L[0] != NULL) { + for (i = 1; i < n; i++) { + L[i] = L[i - 1] + i; + } + } else { + free(L); + L = NULL; + } } - return L; } @@ -105,17 +107,64 @@ Nlm_LtriangMatrixNew(Int4 n) * @param mat the matrix to be freed * @return always NULL */ -Nlm_FloatHiPtr PNTR -Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat) +void +Nlm_DenseMatrixFree(double *** mat) { - mat[0] = (Nlm_FloatHiPtr) Nlm_MemFree(mat[0]); - mat 
= (Nlm_FloatHiPtr PNTR) Nlm_MemFree(mat); + if(*mat != NULL) { + free((*mat)[0]); + free(*mat); + } + *mat = NULL; +} - return NULL; + +/** + * Create and return a new Int4 matrix. Elements of the matrix A + * may be accessed as A[i][j] + * + * @param nrows the number of rows for the new matrix. + * @param ncols the number of columns for the new matrix. + */ +Int4 ** Nlm_Int4MatrixNew(int nrows, int ncols) +{ + int i; /* iteration index */ + Int4 ** mat; /* the new matrix */ + + mat = (Int4 **) calloc(nrows, sizeof(Int4 *)); + if (mat != NULL) { + mat[0] = (Int4 *) malloc((size_t) nrows * + (size_t) ncols * sizeof(Int4)); + if (mat[0] != NULL) { + for (i = 1; i < nrows; i++) { + mat[i] = &mat[0][i * ncols]; + } + } else { + free(mat); + mat = NULL; + } + } + return mat; } /** + * Free a matrix created by Nlm_DenseMatrixNew or + * Nlm_LtriangMatrixNew. + * + * @param mat the matrix to be freed + * @return always NULL + */ +void +Nlm_Int4MatrixFree(Int4 *** mat) +{ + if(*mat != NULL) { + free((*mat)[0]); + free(*mat); + } + *mat = NULL; +} + +/** * Accessing only the lower triangular elements of the symmetric, * positive definite matrix A, compute a lower triangular matrix L * such that A = L L^T (Cholesky factorization.) 
Overwrite the lower @@ -129,22 +178,22 @@ Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat) * @param n the size of A */ void -Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n) +Nlm_FactorLtriangPosDef(double ** A, int n) { - Int4 i, j, k; /* iteration indices */ - Nlm_FloatHi temp; /* temporary variable for intermediate + int i, j, k; /* iteration indices */ + double temp; /* temporary variable for intermediate values in a computation */ - for( i = 0; i < n; i++ ) { - for( j = 0; j < i; j++ ) { + for (i = 0; i < n; i++) { + for (j = 0; j < i; j++) { temp = A[i][j]; - for( k = 0; k < j; k++ ) { + for (k = 0; k < j; k++) { temp -= A[i][k] * A[j][k]; } A[i][j] = temp/A[j][j]; } temp = A[i][i]; - for(k = 0; k < i; k++ ) { + for (k = 0; k < i; k++) { temp -= A[i][k] * A[i][k]; } A[i][i] = sqrt(temp); @@ -162,29 +211,27 @@ Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n) * @param n the size of x * @param L a non-singular lower triangular matrix */ -void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n, - Nlm_FloatHiPtr PNTR L ) +void Nlm_SolveLtriangPosDef(double * x, int n, + double ** L ) { - Int4 i, j; /* iteration indices */ - Nlm_FloatHi temp; /* temporary variable for intermediate + int i, j; /* iteration indices */ + double temp; /* temporary variable for intermediate values in a computation */ /* At point x = b in the equation L L\T y = b */ /* Forward solve; L z = b */ - for( i = 0; i < n; i++ ) { + for (i = 0; i < n; i++) { temp = x[i]; - for( j = 0; j < i; j++ ) { + for (j = 0; j < i; j++) { temp -= L[i][j] * x[j]; } x[i] = temp/L[i][i]; } - /* Now x = z */ - - /* Back solve; L\T y = z */ - for( j = n - 1; j >= 0; j-- ) { + /* Now x = z. 
Back solve the system L\T y = z */ + for (j = n - 1; j >= 0; j--) { x[j] /= L[j][j]; - for( i = 0; i < j; i++ ) { + for (i = 0; i < j; i++) { x[i] -= L[j][i] * x[j]; } } @@ -201,17 +248,17 @@ void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n, * @param v a vector * @param n the length of v */ -Nlm_FloatHi -Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n) +double +Nlm_EuclideanNorm(const double * v, int n) { - Nlm_FloatHi sum = 1.0; /* sum of squares of elements in v */ - Nlm_FloatHi scale = 0.0; /* a scale factor for the elements in v */ - Int4 i; /* iteration index */ - - for( i = 0; i < n; i++ ) { - if( v[i] != 0.0 ) { - Nlm_FloatHi absvi = ABS(v[i]); - if( scale < absvi ) { + double sum = 1.0; /* sum of squares of elements in v */ + double scale = 0.0; /* a scale factor for the elements in v */ + int i; /* iteration index */ + + for (i = 0; i < n; i++) { + if (v[i] != 0.0) { + double absvi = fabs(v[i]); + if (scale < absvi) { sum = 1.0 + sum * (scale/absvi) * (scale/absvi); scale = absvi; } else { @@ -219,7 +266,6 @@ Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n) } } } - return scale * sqrt(sum); } @@ -231,12 +277,13 @@ Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n) * @param x another vector * @param n the length of x and y */ -void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha, - const Nlm_FloatHi PNTR x ) +void Nlm_AddVectors(double * y, int n, double alpha, const double * x) { - Int4 i; /* iteration index */ + int i; /* iteration index */ - for( i = 0; i < n; i++ ) y[i] += alpha * x[i]; + for (i = 0; i < n; i++) { + y[i] += alpha * x[i]; + } } @@ -249,24 +296,19 @@ void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha, * @param n the size of x and step_x * @param max a nonnegative scalar */ -Nlm_FloatHi -Nlm_StepBound(const Nlm_FloatHi PNTR x, Int4 n, - const Nlm_FloatHi PNTR step_x, Nlm_FloatHi max ) +double +Nlm_StepBound(const double * x, int n, const double * step_x, double max) { - Int4 i; /* iteration index */ - 
Nlm_FloatHi alpha; /* current largest permitted step */ - - alpha = max; + int i; /* iteration index */ + double alpha = max; /* current largest permitted step */ - for( i = 0; i < n; i++ ) { - Nlm_FloatHi alpha_i; /* a step to the boundary for the - current i */ + for (i = 0; i < n; i++) { + double alpha_i; /* a step to the boundary for the current i */ alpha_i = -x[i] / step_x[i]; - if( alpha_i >= 0 && alpha_i < alpha ) { + if (alpha_i >= 0 && alpha_i < alpha) { alpha = alpha_i; } } - return alpha; } diff --git a/algo/blast/composition_adjustment/nlm_linear_algebra.h b/algo/blast/composition_adjustment/nlm_linear_algebra.h index 637b913d..e706bfb1 100644 --- a/algo/blast/composition_adjustment/nlm_linear_algebra.h +++ b/algo/blast/composition_adjustment/nlm_linear_algebra.h @@ -1,61 +1,80 @@ -/* =========================================================================== -* -* PUBLIC DOMAIN NOTICE -* National Center for Biotechnology Information -* -* This software/database is a "United States Government Work" under the -* terms of the United States Copyright Act. It was written as part of -* the author's official duties as a United States Government employee and -* thus cannot be copyrighted. This software/database is freely available -* to the public for use. The National Library of Medicine and the U.S. -* Government have not placed any restriction on its use or reproduction. -* -* Although all reasonable efforts have been taken to ensure the accuracy -* and reliability of the software and data, the NLM and the U.S. -* Government do not and cannot warrant the performance or results that -* may be obtained by using this software or data. The NLM and the U.S. -* Government disclaim all warranties, express or implied, including -* warranties of performance, merchantability or fitness for any particular -* purpose. -* -* Please cite the author in any work or product based on this material. 
-* -* ===========================================================================*/ - -/***************************************************************************** - -File name: nlm_numerics.h - -Author: E. Michael Gertz - -Contents: Definitions used in nlm_numerics.c - -******************************************************************************/ -/* - * $Log: nlm_numerics.h,v $ - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision +/* $Id: nlm_linear_algebra.h,v 1.6 2005/12/01 13:54:04 gertz Exp $ + * =========================================================================== * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ + +/** + * @file nlm_linear_algebra.h + * + * @author E. 
Michael Gertz + * + * Declarations for several linear algebra routines */ -#ifndef NLMNUMERICS -#define NLMNUMERICS -#include <ncbistd.h> +#ifndef __NLM_LINEAR_ALGEBRA__ +#define __NLM_LINEAR_ALGEBRA__ -Nlm_FloatHiPtr PNTR Nlm_DenseMatrixNew(Int4 nrows, Int4 ncols); -Nlm_FloatHiPtr PNTR Nlm_LtriangMatrixNew(Int4 n); -Nlm_FloatHiPtr PNTR Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat); +#include <algo/blast/core/blast_export.h> +#include <algo/blast/core/ncbi_std.h> + +#ifdef __cplusplus +extern "C" { +#endif -void Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n); -void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n, - Nlm_FloatHiPtr PNTR L ); +NCBI_XBLAST_EXPORT +double ** Nlm_DenseMatrixNew(int nrows, int ncols); -Nlm_FloatHi Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n); +NCBI_XBLAST_EXPORT +double ** Nlm_LtriangMatrixNew(int n); -void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha, - const Nlm_FloatHi PNTR x); +NCBI_XBLAST_EXPORT +void Nlm_DenseMatrixFree(double *** mat); -Nlm_FloatHi Nlm_StepBound(const Nlm_FloatHi PNTR x, Int4 n, - const Nlm_FloatHi PNTR step_x, Nlm_FloatHi max); +NCBI_XBLAST_EXPORT +Int4 ** Nlm_Int4MatrixNew(int nrows, int ncols); + +NCBI_XBLAST_EXPORT +void Nlm_Int4MatrixFree(Int4 *** mat); + +NCBI_XBLAST_EXPORT +void Nlm_FactorLtriangPosDef(double ** A, int n); + +NCBI_XBLAST_EXPORT +void Nlm_SolveLtriangPosDef(double x[], int n, double ** L); + +NCBI_XBLAST_EXPORT +double Nlm_EuclideanNorm(const double v[], int n); + +NCBI_XBLAST_EXPORT +void Nlm_AddVectors(double y[], int n, double alpha, + const double x[]); + +NCBI_XBLAST_EXPORT +double Nlm_StepBound(const double x[], int n, + const double step_x[], double max); + +#ifdef __cplusplus +} +#endif #endif diff --git a/algo/blast/composition_adjustment/optimize_target_freq.c b/algo/blast/composition_adjustment/optimize_target_freq.c index a8a331cd..120d9801 100644 --- a/algo/blast/composition_adjustment/optimize_target_freq.c +++ 
b/algo/blast/composition_adjustment/optimize_target_freq.c @@ -1,5 +1,3 @@ -static char const rcsid[] = "$Id: re_newton.c,v 1.3 2005/07/25 12:48:39 camacho Exp $"; - /* =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -24,39 +22,11 @@ static char const rcsid[] = "$Id: re_newton.c,v 1.3 2005/07/25 12:48:39 camacho * * ===========================================================================*/ -/***************************************************************************** - -File name: re_newton.c - -Authors: E. Michael Gertz, Alejandro Schaffer - -Contents: Mid-level functions that directly solve the optimization - problem for compositional score matrix adjustment. - Used in conjunction with Newton_procedures.c and nlm_numerics - -******************************************************************************/ -/* - * $Log: re_newton.c,v $ - * Revision 1.3 2005/07/25 12:48:39 camacho - * Updated reference for compositional adjustment - * - * Revision 1.2 2005/07/14 20:19:58 coulouri - * - In OptimizeTargetFrequencies, change the convergence tests to robustly - * handle NaN (floating point not a number) - * - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision - * - */ -#include <ncbi.h> -#include <nlm_numerics.h> -#include <re_newton.h> - /** - * @file re_newton.c + * @file optimize_target_freq.c * * Author E. Michael Gertz - * + * * Routines for finding an optimal set of target frequencies for the * purpose of generating a compositionally adjusted score matrix. The * function for performing this optimization is named @@ -111,8 +81,20 @@ Contents: Mid-level functions that directly solve the optimization * Stephen F. Altschul, John C. Wootton, E. Michael Gertz, Richa * Agarwala, Aleksandr Morgulis, Alejandro Schaffer and Yi-Kuo Yu * (2005) Protein Database Searches Using Compositionally Adjusted - * Substitution Matrices. FEBS Journal, in press. + * Substitution Matrices. FEBS Journal, 272,5101-9. 
*/ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: optimize_target_freq.c,v 1.6 2005/12/01 13:49:43 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <string.h> +#include <assert.h> +#include <math.h> +#include <stdlib.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/nlm_linear_algebra.h> +#include <algo/blast/composition_adjustment/optimize_target_freq.h> /** * Compute the symmetric product A D A^T, where A is the matrix of @@ -133,29 +115,27 @@ Contents: Mid-level functions that directly solve the optimization * length alphsize * alphsize */ static void -ScaledSymmetricProductA(Nlm_FloatHiPtr PNTR W, - Nlm_FloatHiPtr diagonal, - Int4 alphsize) +ScaledSymmetricProductA(double ** W, const double diagonal[], int alphsize) { - Int4 rowW, colW; /* iteration indices over the rows and columns of W */ - Int4 i, j; /* iteration indices over characters in the alphabet */ - Int4 m; /* The number of rows in A; also the size of W */ - + int rowW, colW; /* iteration indices over the rows and columns of W */ + int i, j; /* iteration indices over characters in the alphabet */ + int m; /* The number of rows in A; also the size of W */ + m = 2 * alphsize - 1; - for(rowW = 0; rowW < m; rowW++) { - for(colW = 0; colW <= rowW; colW++) { + for (rowW = 0; rowW < m; rowW++) { + for (colW = 0; colW <= rowW; colW++) { W[rowW][colW] = 0.0; } } - for(i = 0; i < alphsize; i++) { - for(j = 0; j < alphsize; j++) { - Nlm_FloatHi dd; /* an individual diagonal element */ + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { + double dd; /* an individual diagonal element */ dd = diagonal[i * alphsize + j]; W[j][j] += dd; - if(i > 0) { + if (i > 0) { W[i + alphsize - 1][j] += dd; W[i + alphsize - 1][i + alphsize - 1] += dd; } @@ -177,31 +157,28 @@ ScaledSymmetricProductA(Nlm_FloatHiPtr PNTR W, * @param x a vector of size alphsize * alphsize */ static void -MultiplyByA(Nlm_FloatHi beta, - Nlm_FloatHiPtr y, - 
Int4 alphsize, - Nlm_FloatHi alpha, - const Nlm_FloatHi PNTR x) +MultiplyByA(double beta, double y[], int alphsize, + double alpha, const double x[]) { - Int4 i, j; /* iteration indices over characters in the alphabet */ - if(beta == 0.0) { + int i, j; /* iteration indices over characters in the alphabet */ + if (beta == 0.0) { /* Initialize y to zero, without reading any elements from y */ - for(i = 0; i < 2 * alphsize - 1; i++) { + for (i = 0; i < 2 * alphsize - 1; i++) { y[i] = 0.0; } - } else if(beta != 1.0) { + } else if (beta != 1.0) { /* rescale y */ - for(i = 0; i < 2 * alphsize - 1; i++) { + for (i = 0; i < 2 * alphsize - 1; i++) { y[i] *= beta; } } - for(i = 0; i < alphsize; i++) { - for(j = 0; j < alphsize; j++) { + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { y[j] += alpha * x[i * alphsize + j]; } } - for(i = 1; i < alphsize; i++) { - for(j = 0; j < alphsize; j++) { + for (i = 1; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { y[i + alphsize - 1] += alpha * x[i * alphsize + j]; } } @@ -221,33 +198,30 @@ MultiplyByA(Nlm_FloatHi beta, * @param x a vector of size 2 * alphsize - 1 */ static void -MultiplyByAtranspose(Nlm_FloatHi beta, - Nlm_FloatHiPtr y, - Int4 alphsize, - Nlm_FloatHi alpha, - const Nlm_FloatHi PNTR x) +MultiplyByAtranspose(double beta, double y[], int alphsize, + double alpha, const double x[]) { - Int4 i, j; /* iteration indices over characters in the alphabet */ - Int4 k; /* index of a row of A transpose (a column of A); also + int i, j; /* iteration indices over characters in the alphabet */ + int k; /* index of a row of A transpose (a column of A); also an index into y */ - if(beta == 0.0) { + if (beta == 0.0) { /* Initialize y to zero, without reading any elements from y */ - for(k = 0; k < alphsize * alphsize; k++) { + for (k = 0; k < alphsize * alphsize; k++) { y[k] = 0.0; } - } else if(beta != 1.0) { + } else if (beta != 1.0) { /* rescale y */ - for(k = 0; k < alphsize * alphsize; k++) { + for (k = 0; 
k < alphsize * alphsize; k++) { y[k] *= beta; } } - for(i = 0; i < alphsize; i++) { - for(j = 0; j < alphsize; j++) { + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { k = i * alphsize + j; y[k] += alpha * x[j]; - if(i > 0) { + if (i > 0) { y[k] += alpha * x[i + alphsize - 1]; } } @@ -266,18 +240,15 @@ MultiplyByAtranspose(Nlm_FloatHi beta, * @param col_sums column sums of the substitution probabilities */ static void -ResidualsLinearConstraints(Nlm_FloatHiPtr rA, - Int4 alphsize, - const Nlm_FloatHi PNTR x, - const Nlm_FloatHi PNTR row_sums, - const Nlm_FloatHi PNTR col_sums) +ResidualsLinearConstraints(double rA[], int alphsize, const double x[], + const double row_sums[], const double col_sums[]) { - Int4 i; /* iteration index */ + int i; /* iteration index */ - for(i = 0; i < alphsize; i++) { + for (i = 0; i < alphsize; i++) { rA[i] = col_sums[i]; } - for(i = 1; i < alphsize; i++) { + for (i = 1; i < alphsize; i++) { rA[i + alphsize - 1] = row_sums[i]; } MultiplyByA(1.0, rA, alphsize, -1.0, x); @@ -298,24 +269,21 @@ ResidualsLinearConstraints(Nlm_FloatHiPtr rA, * problem. 
*/ static void -DualResiduals(Nlm_FloatHiPtr resids_x, - Int4 alphsize, - Nlm_FloatHiPtr PNTR grads, - const Nlm_FloatHi PNTR z, - Int4 constrain_rel_entropy) +DualResiduals(double resids_x[], int alphsize, double ** grads, + const double z[], int constrain_rel_entropy) { - Int4 i; /* iteration index */ - Int4 n = alphsize * alphsize; /* size of resids_x */ + int i; /* iteration index */ + int n = alphsize * alphsize; /* size of resids_x */ - if(constrain_rel_entropy) { - Nlm_FloatHi eta; /* dual variable for the relative + if (constrain_rel_entropy) { + double eta; /* dual variable for the relative entropy constraint */ eta = z[2 * alphsize - 1]; - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { resids_x[i] = -grads[0][i] + eta * grads[1][i]; } } else { - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { resids_x[i] = -grads[0][i]; } } @@ -348,28 +316,28 @@ DualResiduals(Nlm_FloatHiPtr resids_x, * */ static void -CalculateResiduals(Nlm_FloatHiPtr rnorm, - Nlm_FloatHiPtr resids_x, - Int4 alphsize, - Nlm_FloatHiPtr resids_z, - const Nlm_FloatHi PNTR values, - Nlm_FloatHiPtr PNTR grads, - const Nlm_FloatHi PNTR row_sums, - const Nlm_FloatHi PNTR col_sums, - const Nlm_FloatHi PNTR x, - const Nlm_FloatHi PNTR z, - Int4 constrain_rel_entropy, - Nlm_FloatHi relative_entropy) +CalculateResiduals(double * rnorm, + double resids_x[], + int alphsize, + double resids_z[], + const double values[], + double ** grads, + const double row_sums[], + const double col_sums[], + const double x[], + const double z[], + int constrain_rel_entropy, + double relative_entropy) { /* Euclidean norms of the primal and dual residuals */ - Nlm_FloatHi norm_resids_z, norm_resids_x; + double norm_resids_z, norm_resids_x; DualResiduals(resids_x, alphsize, grads, z, constrain_rel_entropy); norm_resids_x = Nlm_EuclideanNorm(resids_x, alphsize * alphsize); ResidualsLinearConstraints(resids_z, alphsize, x, row_sums, col_sums); - if(constrain_rel_entropy) { + if (constrain_rel_entropy) { 
resids_z[2 * alphsize - 1] = relative_entropy - values[1]; norm_resids_z = Nlm_EuclideanNorm(resids_z, 2 * alphsize); @@ -400,72 +368,89 @@ CalculateResiduals(Nlm_FloatHiPtr rnorm, * backsolve using this factorization are stored. */ struct ReNewtonSystem { - Int4 alphsize; /*< the size of the alphabet */ - Int4 constrain_rel_entropy; /*< if true, use the relative entropy + int alphsize; /*< the size of the alphabet */ + int constrain_rel_entropy; /*< if true, use the relative entropy constraint for this optimization problem */ - Nlm_FloatHiPtr PNTR W; /*< A lower-triangular matrix + double ** W; /*< A lower-triangular matrix representing a factorization of the (2,2) block, -J D^{-1} J^T, of the condensed linear system */ - Nlm_FloatHiPtr Dinv; /*< The diagonal elements of the + double * Dinv; /*< The diagonal elements of the inverse of the necessarily diagonal (1,1) block of the linear system */ - Nlm_FloatHiPtr grad_re; /*< the gradient of the + double * grad_re; /*< the gradient of the relative-entropy constraint, if this constraint is used. */ }; typedef struct ReNewtonSystem ReNewtonSystem; -typedef ReNewtonSystem PNTR ReNewtonSystemPtr; /** - * Create a new uninitialized ReNewtonSystem; the fields are - * initialized by the FactorReNewtonSystem procedure. - * ReNewtonSystemNew and FactorReNewtonSystem are called from only the - * newt procedure. + * Free the memory associated with a ReNewtonSystem. * - * @param alphsize the size of the alphabet for this optimization - * problem. + * @param newton_system on entry *newton_system points to the + * system to be freed. On exit, *newton_system + * is set to NULL. 
*/ -static ReNewtonSystemPtr -ReNewtonSystemNew(Int4 alphsize) +static void +ReNewtonSystemFree(ReNewtonSystem ** newton_system) { - ReNewtonSystemPtr newton_system; /* the new ReNewtonSystem */ - - newton_system = (ReNewtonSystem *) Nlm_MemNew(sizeof(ReNewtonSystem)); + if (*newton_system != NULL) { + Nlm_DenseMatrixFree(&(*newton_system)->W); - newton_system->alphsize = alphsize; - newton_system->constrain_rel_entropy = 1; - newton_system->W = Nlm_LtriangMatrixNew(2 * alphsize); + free((*newton_system)->Dinv); + (*newton_system)->Dinv = NULL; - newton_system->Dinv = - (Nlm_FloatHiPtr) Nlm_MemNew(alphsize * alphsize * sizeof(Nlm_FloatHi)); - newton_system->grad_re = - (Nlm_FloatHiPtr) Nlm_MemNew(alphsize * alphsize * sizeof(Nlm_FloatHi)); + free((*newton_system)->grad_re); + (*newton_system)->grad_re = NULL; - return newton_system; + free(*newton_system); + *newton_system = NULL; + } } /** - * Free the memory associated with a ReNewtonSystem. + * Create a new uninitialized ReNewtonSystem; the fields are + * initialized by the FactorReNewtonSystem procedure. + * ReNewtonSystemNew and FactorReNewtonSystem are called from only the + * newt procedure. * - * @param newton_system on entry *newton_system points to the - * system to be freed. On exit, *newton_system - * is set to NULL. + * @param alphsize the size of the alphabet for this optimization + * problem. 
*/ -static void -ReNewtonSystemFree(ReNewtonSystemPtr PNTR newton_system) +static ReNewtonSystem * ReNewtonSystemNew(int alphsize) { - (*newton_system)->W = Nlm_DenseMatrixFree((*newton_system)->W); - (*newton_system)->Dinv = - (Nlm_FloatHiPtr) Nlm_MemFree((*newton_system)->Dinv); - (*newton_system)->grad_re = - (Nlm_FloatHiPtr) Nlm_MemFree((*newton_system)->grad_re); + ReNewtonSystem * newton_system; /* the new ReNewtonSystem */ + + newton_system = (ReNewtonSystem *) malloc(sizeof(ReNewtonSystem)); + if (newton_system != NULL) { + newton_system->alphsize = alphsize; + newton_system->constrain_rel_entropy = 1; + newton_system->W = NULL; + newton_system->Dinv = NULL; + newton_system->grad_re = NULL; + + newton_system->W = Nlm_LtriangMatrixNew(2 * alphsize); + if (newton_system->W == NULL) + goto error_return; + newton_system->Dinv = + (double *) malloc(alphsize * alphsize * sizeof(double)); + if (newton_system->Dinv == NULL) + goto error_return; + newton_system->grad_re = + (double *) malloc(alphsize * alphsize * sizeof(double)); + if (newton_system->grad_re == NULL) + goto error_return; + } + goto normal_return; +error_return: + ReNewtonSystemFree(&newton_system); +normal_return: - *newton_system = (ReNewtonSystemPtr) Nlm_MemFree(*newton_system); + return newton_system; } @@ -484,22 +469,23 @@ ReNewtonSystemFree(ReNewtonSystemPtr PNTR newton_system) * problem. 
*/ static void -FactorReNewtonSystem(ReNewtonSystemPtr newton_system, - const Nlm_FloatHi PNTR x, - const Nlm_FloatHi PNTR z, - Nlm_FloatHiPtr PNTR grads, - Int4 constrain_rel_entropy) +FactorReNewtonSystem(ReNewtonSystem * newton_system, + const double x[], + const double z[], + double ** grads, + int constrain_rel_entropy, + double * workspace) { - Int4 i; /* iteration index */ - Int4 n; /* the length of x */ - Int4 m; /* the length of z */ + int i; /* iteration index */ + int n; /* the length of x */ + int m; /* the length of z */ /* Pointers to fields in newton_systems; the names of the local * variables match the names of the fields. */ - Nlm_FloatHiPtr PNTR W = newton_system->W; - Int4 alphsize = newton_system->alphsize; - Nlm_FloatHiPtr Dinv = newton_system->Dinv; - Nlm_FloatHiPtr grad_re = newton_system->grad_re; + double ** W = newton_system->W; + int alphsize = newton_system->alphsize; + double * Dinv = newton_system->Dinv; + double * grad_re = newton_system->grad_re; n = alphsize * alphsize; m = constrain_rel_entropy ? 2 * alphsize : 2 * alphsize - 1; @@ -507,51 +493,45 @@ FactorReNewtonSystem(ReNewtonSystemPtr newton_system, newton_system->constrain_rel_entropy = constrain_rel_entropy; /* The original system has the form - * + * * (D J^T) * (J 0 ). * - * We block reduce the system to + * We block reduce the system to * * (D J^T ) * (0 -J D^{-1} J^T). * * First we find the inverse of the diagonal matrix D. 
*/ - - if(constrain_rel_entropy) { - Nlm_FloatHi eta; /* dual variable for the relative + + if (constrain_rel_entropy) { + double eta; /* dual variable for the relative entropy constraint */ eta = z[m - 1]; - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { Dinv[i] = x[i] / (1 - eta); } } else { - Nlm_MemCpy(Dinv, x, n * sizeof(Nlm_FloatHi)); + memcpy(Dinv, x, n * sizeof(double)); } /* Then we compute J D^{-1} J^T; First fill in the part that corresponds * to the linear constraints */ ScaledSymmetricProductA(W, Dinv, alphsize); - if(constrain_rel_entropy) { - Nlm_FloatHiPtr work; /* a vector for intermediate computations */ - + if (constrain_rel_entropy) { /* Save the gradient of the relative entropy constraint. */ - Nlm_MemCpy(grad_re, grads[1], n * sizeof(Nlm_FloatHi)); + memcpy(grad_re, grads[1], n * sizeof(double)); /* Fill in the part of J D^{-1} J^T that corresponds to the relative * entropy constraint. */ - work = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi)); - W[m - 1][m - 1] = 0.0; - for(i = 0; i < n; i++) { - work[i] = Dinv[i] * grad_re[i]; + for (i = 0; i < n; i++) { + workspace[i] = Dinv[i] * grad_re[i]; - W[m - 1][m - 1] += grad_re[i] * work[i]; + W[m - 1][m - 1] += grad_re[i] * workspace[i]; } - MultiplyByA(0.0, &W[m - 1][0], alphsize, 1.0, work); - - work = (Nlm_FloatHiPtr) Nlm_MemFree(work); + MultiplyByA(0.0, &W[m - 1][0], alphsize, 1.0, workspace); } /* Factor J D^{-1} J^T and save the result in W. */ Nlm_FactorLtriangPosDef(W, m); @@ -569,42 +549,38 @@ FactorReNewtonSystem(ReNewtonSystemPtr newton_system, * @param newton_system the factored matrix for the Newton system. 
*/ static void -SolveReNewtonSystem(Nlm_FloatHiPtr x, - Nlm_FloatHiPtr z, - const ReNewtonSystem PNTR newton_system) +SolveReNewtonSystem(double x[], double z[], + const ReNewtonSystem * newton_system, double workspace[]) { - Int4 i; /* iteration index */ - Int4 n; /* the size of x */ - Int4 mA; /* the number of linear constraints */ - Int4 m; /* the size of z */ - Nlm_FloatHiPtr work; /* vector for intermediate calculations */ + int i; /* iteration index */ + int n; /* the size of x */ + int mA; /* the number of linear constraints */ + int m; /* the size of z */ /* Local variables that represent fields of newton_system */ - Nlm_FloatHiPtr PNTR W = newton_system->W; - Nlm_FloatHiPtr Dinv = newton_system->Dinv; - Nlm_FloatHiPtr grad_re = newton_system->grad_re; - Int4 alphsize = newton_system->alphsize; - Int4 constrain_rel_entropy = newton_system->constrain_rel_entropy; + double ** W = newton_system->W; + double * Dinv = newton_system->Dinv; + double * grad_re = newton_system->grad_re; + int alphsize = newton_system->alphsize; + int constrain_rel_entropy = newton_system->constrain_rel_entropy; n = alphsize * alphsize; mA = 2 * alphsize - 1; m = constrain_rel_entropy ? 
mA + 1 : mA; - work = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi)); - /* Apply the same block reduction to the right-hand side as was * applied to the matrix: * * rzhat = rz - J D^{-1} rx */ - for(i = 0; i < n; i++) { - work[i] = x[i] * Dinv[i]; + for (i = 0; i < n; i++) { + workspace[i] = x[i] * Dinv[i]; } - MultiplyByA(1.0, z, alphsize, -1.0, work); + MultiplyByA(1.0, z, alphsize, -1.0, workspace); - if(constrain_rel_entropy) { - for(i = 0; i < n; i++) { - z[m - 1] -= grad_re[i] * work[i]; + if (constrain_rel_entropy) { + for (i = 0; i < n; i++) { + z[m - 1] -= grad_re[i] * workspace[i]; } } @@ -615,17 +591,16 @@ SolveReNewtonSystem(Nlm_FloatHiPtr x, * * x = D^{-1) (rx + J\T z) */ - if(constrain_rel_entropy) { + if (constrain_rel_entropy) { for(i = 0; i < n; i++) { x[i] += grad_re[i] * z[m - 1]; } } MultiplyByAtranspose(1.0, x, alphsize, 1.0, z); - for(i = 0; i < n; i++) { + for (i = 0; i < n; i++) { x[i] *= Dinv[i]; } - work = (Nlm_FloatHiPtr) Nlm_MemFree(work); } @@ -648,25 +623,22 @@ SolveReNewtonSystem(Nlm_FloatHiPtr x, * is used in this optimization problem */ static void -EvaluateReFunctions(Nlm_FloatHiPtr values, - Nlm_FloatHiPtr PNTR grads, - Int4 alphsize, - const Nlm_FloatHi PNTR x, - const Nlm_FloatHi PNTR q, - const Nlm_FloatHi PNTR scores, - Int4 constrain_rel_entropy) +EvaluateReFunctions(double values[], double ** grads, int alphsize, + const double x[], const double q[], + const double scores[], + int constrain_rel_entropy) { - Int4 k; /* iteration index over elements of x, q and scores */ - Nlm_FloatHi temp; /* holds intermediate values in a computation */ + int k; /* iteration index over elements of x, q and scores */ + double temp; /* holds intermediate values in a computation */ values[0] = 0.0; values[1] = 0.0; - for(k = 0; k < alphsize * alphsize; k++) { + for (k = 0; k < alphsize * alphsize; k++) { temp = log(x[k] / q[k]); values[0] += x[k] * temp; grads[0][k] = temp + 1; - if(constrain_rel_entropy) { + if (constrain_rel_entropy) { 
temp += scores[k]; values[1] += x[k] * temp; @@ -691,17 +663,17 @@ EvaluateReFunctions(Nlm_FloatHiPtr values, * @param col_freqs background frequencies of the other sequence */ static void -ComputeScoresFromProbs(Nlm_FloatHiPtr scores, - Int4 alphsize, - const Nlm_FloatHi PNTR target_freqs, - const Nlm_FloatHi PNTR row_freqs, - const Nlm_FloatHi PNTR col_freqs) +ComputeScoresFromProbs(double scores[], + int alphsize, + const double target_freqs[], + const double row_freqs[], + const double col_freqs[]) { - Int4 i, j; /* iteration indices over characters in the alphabet */ - Int4 k; /* index into scores and target_freqs */ + int i, j; /* iteration indices over characters in the alphabet */ + int k; /* index into scores and target_freqs */ - for(i = 0; i < alphsize; i++) { - for(j = 0; j < alphsize; j++) { + for (i = 0; i < alphsize; i++) { + for (j = 0; j < alphsize; j++) { k = i * alphsize + j; scores[k] = log(target_freqs[k] / (row_freqs[i] * col_freqs[j])); @@ -736,71 +708,81 @@ ComputeScoresFromProbs(Nlm_FloatHiPtr scores, * this argument is ignored. * @param maxits the maximum number of iterations permitted for the * optimization algorithm; a good value is 2000. - * @param tol the solution tolerance; the residuals of the optimization + * @param tol the solution tolerance; the residuals of the optimization * program must have Euclidean norm <= tol for the * algorithm to terminate. * * @returns if an optimal set of target frequencies is - * found, then the number of iterations used by the - * optimization algorithm; otherwise maxits + 1. + * found, then 0, if the iteration failed to + * converge, then 1, if there was some error, then -1. 
*/ -Int4 -OptimizeTargetFrequencies(Nlm_FloatHiPtr x, - Int4 alphsize, - const Nlm_FloatHi PNTR q, - const Nlm_FloatHi PNTR row_sums, - const Nlm_FloatHi PNTR col_sums, - Int4 constrain_rel_entropy, - Nlm_FloatHi relative_entropy, - Nlm_FloatHi tol, - Int4 maxits) +int +Blast_OptimizeTargetFrequencies(double x[], + int alphsize, + int *iterations, + const double q[], + const double row_sums[], + const double col_sums[], + int constrain_rel_entropy, + double relative_entropy, + double tol, + int maxits) { - Int4 its; /* number of iterations that have been performed */ - Int4 n; /* number of target frequencies; the size of x */ - Int4 mA; /* number of linear constraints */ - Int4 m; /* total number of constraints */ + int its; /* number of iterations that have been performed */ + int n; /* number of target frequencies; the size of x */ + int mA; /* number of linear constraints */ + int m; /* total number of constraints */ - Nlm_FloatHi values[2]; /* values of the nonlinear functions + double values[2]; /* values of the nonlinear functions at this iterate */ - Nlm_FloatHiPtr PNTR grads; /* gradients of the nonlinear + double ** grads = NULL; /* gradients of the nonlinear functions at this iterate */ - ReNewtonSystemPtr newton_system; /* factored matrix of the - linear system to be solved - at this iteration */ - Nlm_FloatHiPtr z; /* dual variables (Lagrange multipliers) */ - Nlm_FloatHiPtr resids_x; /* dual residuals (gradient of Lagrangian) */ - Nlm_FloatHiPtr resids_z; /* primal (constraint) residuals */ - Nlm_FloatHi rnorm; /* norm of the residuals for the + ReNewtonSystem * + newton_system = NULL; /* factored matrix of the linear + system to be solved at this + iteration */ + double * z = NULL; /* dual variables (Lagrange multipliers) */ + double * resids_x = NULL; /* dual residuals (gradient of Lagrangian) */ + double * resids_z = NULL; /* primal (constraint) residuals */ + double rnorm; /* norm of the residuals for the current iterate */ - Nlm_FloatHiPtr 
old_scores; /* a scoring matrix, with lambda = 1, + double * old_scores = NULL; /* a scoring matrix, with lambda = 1, generated from q, row_sums and col_sums */ - Int4 converged; /* true if Newton's method converged + double * workspace = NULL; /* A vector for intermediate computations */ + int converged; /* true if Newton's method converged to a *minimizer* (strong second-order point) */ + int status; /* the return status */ n = alphsize * alphsize; mA = 2 * alphsize - 1; m = constrain_rel_entropy ? mA + 1 : mA; newton_system = ReNewtonSystemNew(alphsize); - - resids_x = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi)); - resids_z = (Nlm_FloatHiPtr) Nlm_MemNew((mA + 1) * sizeof(Nlm_FloatHi)); + if (newton_system == NULL) goto error_return; + resids_x = (double *) malloc(n * sizeof(double)); + if (resids_x == NULL) goto error_return; + resids_z = (double *) malloc((mA + 1) * sizeof(double)); + if (resids_z == NULL) goto error_return; /* z must be initialized to zero */ - z = (Nlm_FloatHiPtr) Nlm_Calloc( mA + 1, sizeof(Nlm_FloatHi)); + z = (double *) calloc( mA + 1, sizeof(double)); + if (z == NULL) goto error_return; + old_scores = (double *) malloc(n * sizeof(double)); + if (old_scores == NULL) goto error_return; + workspace = (double *) malloc(n * sizeof(double)); + if (workspace == NULL) goto error_return; + grads = Nlm_DenseMatrixNew(2, n); + if (grads == NULL) goto error_return; - old_scores = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi)); ComputeScoresFromProbs(old_scores, alphsize, q, row_sums, col_sums); - grads = Nlm_DenseMatrixNew(2, n); - /* Use q as the initial value for x */ - Nlm_MemCpy(x, q, n * sizeof(Nlm_FloatHi)); + memcpy(x, q, n * sizeof(double)); its = 0; /* Initialize the iteration count. Note that we may - converge in zero iterations if the initial x is + converge in zero iterations if the initial x is optimal. 
*/ - while(its <= maxits) { + while (its <= maxits) { /* Compute the residuals */ EvaluateReFunctions(values, grads, alphsize, x, q, old_scores, constrain_rel_entropy); @@ -810,21 +792,22 @@ OptimizeTargetFrequencies(Nlm_FloatHiPtr x, /* and check convergence; the test correctly handles the case in which rnorm is NaN (not a number). */ - if(!(rnorm > tol)) { + if ( !(rnorm > tol) ) { /* We converged at the current iterate */ break; } else { /* we did not converge, so increment the iteration counter and start a new iteration */ - if(++its <= maxits) { + if (++its <= maxits) { /* We have not exceeded the maximum number of iterations; take a Newton step. */ - Nlm_FloatHi alpha; /* a positive number used to scale the + double alpha; /* a positive number used to scale the Newton step. */ FactorReNewtonSystem(newton_system, x, z, grads, - constrain_rel_entropy); - SolveReNewtonSystem(resids_x, resids_z, newton_system); + constrain_rel_entropy, workspace); + SolveReNewtonSystem(resids_x, resids_z, newton_system, + workspace); /* Calculate a value of alpha that ensure that x is positive */ @@ -836,23 +819,30 @@ OptimizeTargetFrequencies(Nlm_FloatHiPtr x, } } } - converged = 0; - if( its <= maxits && rnorm <= tol ) { + if (its <= maxits && rnorm <= tol) { /* Newton's iteration converged */ - if( !constrain_rel_entropy || z[m - 1] < 1 ) { + if ( !constrain_rel_entropy || z[m - 1] < 1 ) { /* and the final iterate is a minimizer */ converged = 1; } } - - grads = Nlm_DenseMatrixFree(grads); - old_scores = (Nlm_FloatHiPtr) Nlm_MemFree(old_scores); - z = (Nlm_FloatHiPtr) Nlm_MemFree(z); - resids_z = (Nlm_FloatHiPtr) Nlm_MemFree(resids_z); - resids_x = (Nlm_FloatHiPtr) Nlm_MemFree(resids_x); - + status = converged ? 
0 : 1; + *iterations = its; + goto normal_return; + +error_return: + status = -1; + *iterations = 0; +normal_return: + + Nlm_DenseMatrixFree(&grads); + free(workspace); + free(old_scores); + free(z); + free(resids_z); + free(resids_x); ReNewtonSystemFree(&newton_system); - return converged ? its : maxits + 1; + return status; } diff --git a/algo/blast/composition_adjustment/optimize_target_freq.h b/algo/blast/composition_adjustment/optimize_target_freq.h index f9684418..5b56b000 100644 --- a/algo/blast/composition_adjustment/optimize_target_freq.h +++ b/algo/blast/composition_adjustment/optimize_target_freq.h @@ -1,58 +1,59 @@ -/* =========================================================================== -* -* PUBLIC DOMAIN NOTICE -* National Center for Biotechnology Information -* -* This software/database is a "United States Government Work" under the -* terms of the United States Copyright Act. It was written as part of -* the author's official duties as a United States Government employee and -* thus cannot be copyrighted. This software/database is freely available -* to the public for use. The National Library of Medicine and the U.S. -* Government have not placed any restriction on its use or reproduction. -* -* Although all reasonable efforts have been taken to ensure the accuracy -* and reliability of the software and data, the NLM and the U.S. -* Government do not and cannot warrant the performance or results that -* may be obtained by using this software or data. The NLM and the U.S. -* Government disclaim all warranties, express or implied, including -* warranties of performance, merchantability or fitness for any particular -* purpose. -* -* Please cite the author in any work or product based on this material. 
-* -* ===========================================================================*/ +/* $Id: optimize_target_freq.h,v 1.6 2005/12/01 13:54:04 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ -/***************************************************************************** +/** + * @file optimize_target_freq.h + * @author E. Michael Gertz + * + * Exports for optimized_target_freq.c + */ -File name: re_newton.h +#ifndef __OPTIMIZE_TARGET_FREQ__ +#define __OPTIMIZE_TARGET_FREQ__ -Author: E. Michael Gertz +#include <algo/blast/core/blast_export.h> -Contents: Exports for re_newton.c - Mid-level functions that directly solve the optimization - problem for compositional score matrix adjustment. 
- Used in conjunction with Newton_procedures.c and nlm_numerics +#ifdef __cplusplus +extern "C" { +#endif -******************************************************************************/ -/* - * $Log: re_newton.h,v $ - * Revision 1.1 2005/05/16 16:11:41 papadopo - * Initial revision - * - */ +NCBI_XBLAST_EXPORT +int +Blast_OptimizeTargetFrequencies(double x[], + int alphsize, + int * iterations, + const double q[], + const double row_sums[], + const double col_sums[], + int constrain_rel_entropy, + double relative_entropy, + double tol, + int maxits); -#ifndef RE_NEWTON -#define RE_NEWTON +#ifdef __cplusplus +} +#endif -Int4 -OptimizeTargetFrequencies(Nlm_FloatHiPtr x, - Int4 alphsize, - const Nlm_FloatHi PNTR q, - const Nlm_FloatHi PNTR row_sums, - const Nlm_FloatHi PNTR col_sums, - Int4 constrain_rel_entropy, - Nlm_FloatHi relative_entropy, - Nlm_FloatHi tol, - Int4 maxits); - #endif diff --git a/algo/blast/composition_adjustment/redo_alignment.c b/algo/blast/composition_adjustment/redo_alignment.c new file mode 100644 index 00000000..dd71e677 --- /dev/null +++ b/algo/blast/composition_adjustment/redo_alignment.c @@ -0,0 +1,1367 @@ +/* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. 
+* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** @file kappa_common.c + * + * @author Alejandro Schaffer, E. Michael Gertz + * + * Routines for redoing a set of alignments, using either + * composition matrix adjustment or the Smith-Waterman algorithm (or + * both.) + */ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: redo_alignment.c,v 1.2 2005/12/01 15:41:42 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <stdlib.h> +#include <assert.h> +#include <math.h> +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/redo_alignment.h> +#include <algo/blast/composition_adjustment/nlm_linear_algebra.h> +#include <algo/blast/composition_adjustment/composition_adjustment.h> +#include <algo/blast/composition_adjustment/composition_constants.h> +#include <algo/blast/composition_adjustment/smith_waterman.h> +#include <algo/blast/composition_adjustment/compo_heap.h> + +/* The natural log of 2, defined in newer systems as M_LN2 in math.h, but + missing in older systems. */ +#define LOCAL_LN2 0.69314718055994530941723212145818 + +/** Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but + * expensive consistency tests in the composition_adjustment + * module. + * + * This macro is usually used as part of a C-conditional + * if (COMPO_INTENSE_DEBUG) { + * perform expensive tests + * } + * The C compiler will then validate the code to perform the tests, but + * will almost always strip the code if COMPO_INTENSE_DEBUG is false. 
+ */ +#ifndef COMPO_INTENSE_DEBUG +#define COMPO_INTENSE_DEBUG 0 +#endif + +/** by what factor might initially reported E-value exceed true Evalue */ +#define EVALUE_STRETCH 5 + +/** -1/0/1 if a is less than/greater than/equal to b */ +#ifndef CMP +#define CMP(a,b) ((a)>(b) ? 1 : ((a)<(b) ? -1 : 0)) +#endif + +/** For translated subject sequences, the number of amino acids to + include before and after the existing aligned segment when + generating a composition-based scoring system. */ +static const int kWindowBorder = 200; + +/** pseudocounts for relative-entropy-based score matrix adjustment */ +static const int kReMatrixAdjustmentPseudocounts = 20; + +/** + * s_WindowInfo - a struct whose instances represent a range + * of data in a sequence. */ +typedef struct s_WindowInfo +{ + BlastCompo_SequenceRange query_range; /**< range of the query + included in this window */ + BlastCompo_SequenceRange subject_range; /**< range of the subject + included in this window */ + BlastCompo_Alignment * align; /**< list of existing alignments + contained in this window */ + int hspcnt; /**< number of alignment in + this window */ +} s_WindowInfo; + + +/** + * Create a new BlastCompo_Alignment; parameters to this function + * correspond directly to fields of BlastCompo_Alignment */ +BlastCompo_Alignment * +BlastCompo_AlignmentNew(int score, + ECompoAdjustModes comp_adjustment_mode, + int queryStart, int queryEnd, int queryIndex, + int matchStart, int matchEnd, int frame, + void * context) +{ + BlastCompo_Alignment * align = malloc(sizeof(BlastCompo_Alignment)); + if (align != NULL) { + align->score = score; + align->comp_adjustment_mode = comp_adjustment_mode; + align->queryIndex = queryIndex; + align->queryStart = queryStart; + align->queryEnd = queryEnd; + align->matchStart = matchStart; + align->matchEnd = matchEnd; + align->frame = frame; + align->context = context; + align->next = NULL; + } + return align; +} + + +/** + * Recursively free all alignments in the singly 
linked list whose + * head is *palign. Set *palign to NULL. + * + * @param palign pointer to the head of a singly linked list + * of alignments. + */ +void +BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign, + void (*free_context)(void*)) +{ + BlastCompo_Alignment * align; /* represents the current + alignment in loops */ + align = *palign; *palign = NULL; + while (align != NULL) { + /* Save the value of align->next, because align is to be deleted. */ + BlastCompo_Alignment * align_next = align->next; + + align_next = align->next; + if (free_context != NULL && align->context != NULL) { + free_context(align->context); + } + free(align); + align = align_next; + } +} + + +/** + * Reverse a list of BlastCompo_Alignments. */ +static void +s_AlignmentsRev(BlastCompo_Alignment ** plist) +{ + BlastCompo_Alignment *list; /* the forward list */ + BlastCompo_Alignment *new_list; /* the reversed list */ + list = *plist; new_list = NULL; + while (list != NULL) { + BlastCompo_Alignment * list_next = list->next; + list->next = new_list; + new_list = list; + list = list_next; + } + *plist = new_list; +} + + +/** + * Compare two BlastCompo_Alignments. 
*/ +static int +s_AlignmentCmp(const BlastCompo_Alignment * a, + const BlastCompo_Alignment * b) +{ + int result; + if (0 == (result = CMP(b->score, a->score)) && + 0 == (result = CMP(a->matchStart, b->matchStart)) && + 0 == (result = CMP(b->matchEnd, a->matchEnd)) && + 0 == (result = CMP(a->queryStart, b->queryStart))) { + /* if all other tests cannot distinguish the alignments, then + * the final test is the result */ + result = CMP(b->queryEnd, a->queryEnd); + } + return result; +} + +/** Temporary function to determine whether alignments are sorted */ +static int +s_AlignmentsAreSorted(BlastCompo_Alignment * alignments) +{ + BlastCompo_Alignment * align; + for (align = alignments; align != NULL; align = align->next) { + if (align->next && align->next->score > align->score) { + return 0; + } + } + return 1; +} + + +static int +s_DistinctAlignmentsLength(BlastCompo_Alignment * list) +{ + int length = 0; + for ( ; list != NULL; list = list->next) { + length++; + } + return length; +} + + +static void +s_DistinctAlignmentsSort(BlastCompo_Alignment ** plist, int hspcnt) +{ + /* mergesort */ + + if (COMPO_INTENSE_DEBUG) { + assert(s_DistinctAlignmentsLength(*plist) == hspcnt); + } + if(hspcnt > 1) { + BlastCompo_Alignment * list = *plist; + BlastCompo_Alignment *leftlist, *rightlist, **tail; + int i, leftcnt, rightcnt; + + /* Split the list in half */ + leftcnt = hspcnt/2; + rightcnt = hspcnt - leftcnt; + + leftlist = list; + /* Find the point to split the list; this loop splits lists + correctly only when list != NULL and leftcnt > 0, which is + necessarily the case because hspcnt > 1 */ + assert(list != NULL && leftcnt > 0); + for (i = 0; i < leftcnt - 1 && list->next != NULL; i++) { + list = list->next; + } + rightlist = list->next; + list->next = NULL; + + if (COMPO_INTENSE_DEBUG) { + assert(s_DistinctAlignmentsLength(rightlist) == rightcnt); + assert(s_DistinctAlignmentsLength(leftlist) == leftcnt); + } + /* Sort the two lists */ + if (leftcnt > 1) + 
s_DistinctAlignmentsSort(&leftlist, leftcnt); + if (rightcnt > 1) + s_DistinctAlignmentsSort(&rightlist, rightcnt); + + /* And then merge them */ + list = NULL; + tail = &list; + while (leftlist != NULL || rightlist != NULL) { + if (leftlist == NULL) { + *tail = rightlist; + rightlist = NULL; + } else if (rightlist == NULL) { + *tail = leftlist; + leftlist = NULL; + } else { + BlastCompo_Alignment * elt; + if (s_AlignmentCmp(leftlist, rightlist) < 0) { + elt = leftlist; + leftlist = leftlist->next; + } else { + elt = rightlist; + rightlist = rightlist->next; + } + *tail = elt; + tail = &elt->next; + } + } + *plist = list; + if (COMPO_INTENSE_DEBUG) { + assert(s_DistinctAlignmentsLength(list) == hspcnt); + assert(s_AlignmentsAreSorted(list)); + } + } +} + + +/** + * Copy a BlastCompo_Alignment, setting the next field to NULL + */ +static BlastCompo_Alignment * +s_AlignmentCopy(const BlastCompo_Alignment * align) +{ + return BlastCompo_AlignmentNew(align->score, + align->comp_adjustment_mode, + align->queryStart, + align->queryEnd, + align->queryIndex, + align->matchStart, + align->matchEnd, align->frame, + align->context); + +} + + +/** + * Given a list of alignments and a new alignment, create a new list + * of alignments that conditionally includes the new alignment. + * + * If there is an equal or higher-scoring alignment in the preexisting + * list of alignments that shares an endpoint with the new alignment, + * then preexisting list is returned. Otherwise, a new list is + * returned with the new alignment as its head and the elements of + * preexisting list that do not share an endpoint with the new + * alignment as its tail. The order of elements is preserved. + * + * Typically, a list of alignments is built one alignment at a time + * through a call to s_WithDistinctEnds. All alignments in the resulting + * list have distinct endpoints. Which items are retained in the list + * depends on the order in which they were added. 
+ * + * Note that an endpoint is a triple, specifying a frame, a location + * in the query and a location in the subject. In other words, + * alignments that are not in the same frame never share endpoints. + * + * @param p_newAlign on input the alignment that may be added to + * the list; on output NULL + * @param p_oldAlignment on input the existing list of alignments; + * on output the new list + */ +static void +s_WithDistinctEnds(BlastCompo_Alignment **p_newAlign, + BlastCompo_Alignment **p_oldAlignments, + void free_align_tracebacks(void *)) +{ + /* Deference the input parameters. */ + BlastCompo_Alignment * newAlign = *p_newAlign; + BlastCompo_Alignment * oldAlignments = *p_oldAlignments; + BlastCompo_Alignment * align; /* represents the current + alignment in loops */ + int include_new_align; /* true if the new alignment + may be added to the list */ + *p_newAlign = NULL; + include_new_align = 1; + + for (align = oldAlignments; align != NULL; align = align->next) { + if (align->frame == newAlign->frame && + ((align->queryStart == newAlign->queryStart && + align->matchStart == newAlign->matchStart) + || (align->queryEnd == newAlign->queryEnd && + align->matchEnd == newAlign->matchEnd))) { + /* At least one of the endpoints of newAlign matches an endpoint + of align. */ + if (newAlign->score <= align->score) { + /* newAlign cannot be added to the list. */ + include_new_align = 0; + break; + } + } + } + if (include_new_align) { + /* tail of the list being created */ + BlastCompo_Alignment **tail; + + tail = &newAlign->next; + align = oldAlignments; + while (align != NULL) { + /* Save align->next because align may be deleted. 
*/ + BlastCompo_Alignment * align_next = align->next; + align->next = NULL; + if (align->frame == newAlign->frame && + ((align->queryStart == newAlign->queryStart && + align->matchStart == newAlign->matchStart) + || (align->queryEnd == newAlign->queryEnd && + align->matchEnd == newAlign->matchEnd))) { + /* The alignment shares an end with newAlign; */ + /* delete it. */ + BlastCompo_AlignmentsFree(&align, free_align_tracebacks); + } else { /* The alignment does not share an end with newAlign; */ + /* add it to the output list. */ + *tail = align; + tail = &align->next; + } + align = align_next; + } /* end while align != NULL */ + *p_oldAlignments = newAlign; + } else { /* do not include_new_align */ + BlastCompo_AlignmentsFree(&newAlign, free_align_tracebacks); + } /* end else do not include newAlign */ +} + + +/** Release the data associated with this object. */ +static void s_SequenceDataRelease(BlastCompo_SequenceData * self) +{ + if (self->buffer) free(self->buffer); + self->data = NULL; self->buffer = NULL; +} + + + +/** + * Create and initialize a new s_WindowInfo. + * + * Parameters to this function correspond directly to fields of + * s_WindowInfo. + */ +static s_WindowInfo * +s_WindowInfoNew(int begin, int end, int context, + int queryOrigin, int queryLength, int query_index, + BlastCompo_Alignment * align) +{ + s_WindowInfo * window; /* new window to be returned */ + + window = malloc(sizeof(s_WindowInfo)); + if (window != NULL) { + window->subject_range.begin = begin; + window->subject_range.end = end; + window->subject_range.context = context; + window->query_range.begin = queryOrigin; + window->query_range.end = queryOrigin + queryLength; + window->query_range.context = query_index; + window->align = align; + window->hspcnt = 0; + for ( ; align != NULL; align = align->next) { + window->hspcnt++; + } + } + return window; +} + + +/** + * Free an instance of s_WindowInfo. 
+ * + * @param *window on entry the window to be freed; on exit NULL + */ +static void +s_WindowInfoFree(s_WindowInfo ** window) +{ + if (*window != NULL) { + BlastCompo_AlignmentsFree(&(*window)->align, NULL); + free(*window); + } + *window = NULL; +} + + +/** + * Join two instance of s_WindowInfo into a single window + * + * @param win1 on entry, one of the two windows to be joined; on exit + * the combined window + * @param *pwin2 on entry, the other window to be joined, on exit NULL + */ +static void +s_WindowInfoJoin(s_WindowInfo * win1, s_WindowInfo ** pwin2) +{ + /* the second window, which will be deleted when this routine exits */ + s_WindowInfo * win2 = *pwin2; + BlastCompo_Alignment *align, **tail; + /* subject ranges for the two windows */ + BlastCompo_SequenceRange * sbjct_range1 = &win1->subject_range; + BlastCompo_SequenceRange * sbjct_range2 = &win2->subject_range; + + assert(sbjct_range1->context == sbjct_range2->context); + assert(win1->query_range.context == win2->query_range.context); + + sbjct_range1->begin = MIN(sbjct_range1->begin, sbjct_range2->begin); + sbjct_range1->end = MAX(sbjct_range1->end, sbjct_range2->end); + win1->hspcnt += win2->hspcnt; + + tail = &win1->align; + for (align = win1->align; align != NULL; align = align->next) { + tail = &align->next; + } + *tail = win2->align; + win2->align = NULL; + + s_WindowInfoFree(pwin2); +} + + +/** + * A comparison routine used to sort a list of windows, first by frame + * and then by location. 
+ */ +static int +s_LocationCompareWindows(const void * vp1, const void *vp2) +{ + /* w1 and w2 are the windows being compared */ + s_WindowInfo * w1 = *(s_WindowInfo **) vp1; + s_WindowInfo * w2 = *(s_WindowInfo **) vp2; + /* the subject ranges of the two windows */ + BlastCompo_SequenceRange * sr1 = &w1->subject_range; + BlastCompo_SequenceRange * sr2 = &w2->subject_range; + /* the query indices of the two windows */ + /* the query ranges of the two windows */ + BlastCompo_SequenceRange * qr1 = &w1->query_range; + BlastCompo_SequenceRange * qr2 = &w2->query_range; + + int result; /* result of the comparison */ + if (0 == (result = CMP(qr1->context, qr2->context)) && + 0 == (result = CMP(sr1->context, sr2->context)) && + 0 == (result = CMP(sr1->begin, sr2->begin)) && + 0 == (result = CMP(sr1->end, sr2->end)) && + 0 == (result = CMP(qr1->begin, qr2->begin))) { + result = CMP(qr1->end, qr2->end); + } + return result; +} + + +/** + * A comparison routine used to sort a list of windows by position in + * the subject, ignoring strand and frame. Ties are broken + * deterministically. 
+ */ +static int +s_SubjectCompareWindows(const void * vp1, const void *vp2) +{ + /* w1 and w2 are the windows being compared */ + s_WindowInfo * w1 = *(s_WindowInfo **) vp1; + s_WindowInfo * w2 = *(s_WindowInfo **) vp2; + /* the subject ranges of the two windows */ + BlastCompo_SequenceRange * sr1 = &w1->subject_range; + BlastCompo_SequenceRange * sr2 = &w2->subject_range; + /* the query ranges of the two windows */ + BlastCompo_SequenceRange * qr1 = &w1->query_range; + BlastCompo_SequenceRange * qr2 = &w2->query_range; + + int result; /* result of the comparison */ + if (0 == (result = CMP(sr1->begin, sr2->begin)) && + 0 == (result = CMP(sr1->end, sr2->end)) && + 0 == (result = CMP(sr1->context, sr2->context)) && + 0 == (result = CMP(qr1->begin, qr2->begin)) && + 0 == (result = CMP(qr1->end, qr2->end))) { + result = CMP(qr1->context, qr2->context); + } + return result; +} + + + +/** + * Read a list of alignments from a translated search and create a + * new array of pointers to s_WindowInfo so that each alignment is + * contained in exactly one window. See s_WindowsFromAligns for the + * meaning of the parameters. (@sa s_WindowsFromAligns). 
+ * + * @return 0 on success, -1 on out-of-memory + */ +static int +s_WindowsFromTranslatedAligns(BlastCompo_Alignment * alignments, + BlastCompo_QueryInfo * query_info, + int hspcnt, int border, int sequence_length, + s_WindowInfo ***pwindows, int * nWindows) +{ + int k; /* iteration index */ + s_WindowInfo ** windows; /* the output list of windows */ + int length_joined; /* the current length of the + list of joined windows */ + BlastCompo_Alignment * align; /* represents the current + alignment in the main loop */ + *nWindows = 0; + windows = *pwindows = calloc(hspcnt, sizeof(s_WindowInfo*)); + *nWindows = hspcnt; + if (windows == NULL) + goto error_return; + + for (align = alignments, k = 0; + align != NULL; + align = align->next, k++) { + int frame; /* translation frame */ + int query_index; /* index of the query contained in the + current HSP */ + int query_origin; /* start of the current query in the + concatenated query */ + int query_length; /* length of the current query */ + int translated_length; /* length of the translation of the entire + nucleotide sequence in this frame */ + int begin, end; /* interval in amino acid coordinates of + the translated window */ + /* copy of the current alignment to add to the window */ + BlastCompo_Alignment * align_copy; + frame = align->frame; + query_index = align->queryIndex; + query_origin = query_info[query_index].origin; + query_length = query_info[query_index].seq.length; + translated_length = (sequence_length - ABS(frame) + 1)/3; + + begin = MAX(0, align->matchStart - border); + end = MIN(translated_length, align->matchEnd + border); + align_copy = s_AlignmentCopy(align); + if (align_copy == NULL) + goto error_return; + windows[k] = + s_WindowInfoNew(begin, end, frame, query_origin, query_length, + query_index, align_copy); + if (windows[k] == NULL) + goto error_return; + } + qsort(windows, hspcnt, sizeof(BlastCompo_SequenceRange*), + s_LocationCompareWindows); + + /* Join windows that overlap or are too close 
together. */ + length_joined = 0; + for (k = 0; k < hspcnt; k++) { /* for all windows in the + original list */ + s_WindowInfo * window; /* window at this value of k */ + s_WindowInfo * nextWindow; /* window at the next + value of k, or NULL if + no such window + exists */ + window = windows[k]; + nextWindow = ( k + 1 < hspcnt ) ? windows[k+1] : NULL; + + if(nextWindow != NULL && + window->subject_range.context == + nextWindow->subject_range.context && + window->query_range.context == nextWindow->query_range.context && + window->subject_range.end >= nextWindow->subject_range.begin) { + /* Join the current window with the next window. Do not add the + current window to the output list. */ + s_WindowInfoJoin(nextWindow, &windows[k]); + } else { + /* Don't join the current window with the next window. Add the + current window to the output list instead */ + windows[length_joined] = window; + length_joined++; + } /* end else don't join the current window with the next window */ + } /* end for all windows in the original list */ + *nWindows = length_joined; + + for (k = length_joined; k < hspcnt; k++) { + windows[k] = NULL; + } + for (k = 0; k < length_joined; k++) { + s_DistinctAlignmentsSort(&windows[k]->align, windows[k]->hspcnt); + } + qsort(windows, *nWindows, sizeof(BlastCompo_SequenceRange*), + s_SubjectCompareWindows); + return 0; /* normal return */ + +error_return: + for (k = 0; k < *nWindows; k++) { + if (windows[k] != NULL) + s_WindowInfoFree(&windows[k]); + } + free(windows); + *pwindows = NULL; + return -1; +} + + +/** + * Read a list of alignments from a protein search and create a + * new array of pointers to s_WindowInfo so that each alignment is + * contained in exactly one window. See s_WindowsFromAligns for the + * meaning of the parameters. (@sa s_WindowsFromAligns). 
+ * + * @return 0 on success, -1 on out-of-memory + */ +static int +s_WindowsFromProteinAligns(BlastCompo_Alignment * alignments, + BlastCompo_QueryInfo * query_info, + int numQueries, + int sequence_length, + s_WindowInfo ***pwindows, + int * nWindows) +{ + BlastCompo_Alignment * align; + int query_index; /* index of the query */ + int query_origin; /* start of an individual query in the + concatenated query */ + int query_length; /* length of an individual query */ + int window_index; /* index of a window in the window list */ + + /* new list of windows */ + s_WindowInfo ** windows = + calloc(numQueries, sizeof(s_WindowInfo*)); + *nWindows = 0; + if (windows == NULL) + goto error_return; + *nWindows = numQueries; + for (align = alignments; align != NULL; align = align->next) { + BlastCompo_Alignment * copiedAlign; + + query_index = align->queryIndex; + query_origin = query_info[query_index].origin; + query_length = query_info[query_index].seq.length; + + if (windows[query_index] == NULL) { + windows[query_index] = + s_WindowInfoNew(0, sequence_length, 0, query_origin, + query_length, query_index, NULL); + if (windows[query_index] == NULL) + goto error_return; + } + copiedAlign = s_AlignmentCopy(align); + if (copiedAlign == NULL) + goto error_return; + copiedAlign->next = windows[query_index]->align; + windows[query_index]->align = copiedAlign; + windows[query_index]->hspcnt++; + } + window_index = 0; + for (query_index = 0; query_index < numQueries; query_index++) { + if (windows[query_index] != NULL) { + windows[window_index] = windows[query_index]; + s_AlignmentsRev(&windows[window_index]->align); + window_index++; + } + } + /* shrink to fit */ + { + s_WindowInfo ** new_windows = + realloc(windows, window_index * sizeof(BlastCompo_SequenceRange*)); + if (new_windows == NULL) { + goto error_return; + } else { + windows = new_windows; + *nWindows = window_index; + } + } + qsort(windows, *nWindows, sizeof(BlastCompo_SequenceRange*), + s_SubjectCompareWindows); + 
*pwindows = windows; + /* Normal return */ + return 0; + +error_return: + for (window_index = 0; window_index < *nWindows; window_index++) { + s_WindowInfoFree(&windows[window_index]); + } + free(windows); + return -1; +} + + +/** + * Read a list of alignments from a search (protein or translated) and + * create a new array of pointers to s_WindowInfo so that each + * alignment is contained in exactly one window. + * + * @param alignments a list of alignments from a translated + * search + * @param query_info information about the query/queries used + * in the search + * @param hspcnt number of alignments + * @param numQueries number of queries + * @param border border around windows; windows with + * overlapping borders will be joined. + * @param sequence_length length of the subject sequence, in + * nucleotides for translated searches or + * in amino acids for protein searches + * @param *pwindows the new array of windows + * @param nWindows the length of *pwindows + * @param subject_is_translated is the subject sequence translated? + * + * @return 0 on success, -1 on out-of-memory + */ +static int +s_WindowsFromAligns(BlastCompo_Alignment * alignments, + BlastCompo_QueryInfo * query_info, int hspcnt, + int numQueries, int border, int sequence_length, + s_WindowInfo ***pwindows, int * nWindows, + int subject_is_translated) +{ + if (subject_is_translated) { + return s_WindowsFromTranslatedAligns(alignments, query_info, + hspcnt, border, + sequence_length, + pwindows, nWindows); + } else { + return s_WindowsFromProteinAligns(alignments, query_info, + numQueries, sequence_length, + pwindows, nWindows); + } +} + + +/** + * Compute the amino acid composition of the subject region. + * + * @param subject_composition the computed composition. 
+ * @param subject subject sequence data + * @param subject_range the range of the given subject data in + * the complete subject sequence + * @param align an alignment of the query to the + * subject range + */ +static void +s_GetSubjectComposition(Blast_AminoAcidComposition * subject_composition, + BlastCompo_SequenceData * subject, + BlastCompo_SequenceRange * subject_range, + BlastCompo_Alignment * align) +{ + Uint1 * subject_data; /* sequence data for the subject */ + int length; /* length of the subject portion of the alignment */ + int start; /* start of the subject portion, relative to the given + range */ + int finish; /* end of the subject portion, relative to the + given range */ + int translation_frame; /* the translation frame of the subject + sequence */ + /* [left, right) is the interval of the subject to use when + * computing composition. The endpoints are offsets into the + * subject_range. */ + int left, right; + + subject_data = subject->data; + length = subject_range->end - subject_range->begin; + start = align->matchStart - subject_range->begin; + finish = align->matchEnd - subject_range->begin; + translation_frame = subject_range->context; + + if (translation_frame == 0) { + /* This is not a tblastn search; use the whole subject when + * computing the composition */ + left = 0; + right = length; + } else { + /* This is a tblastn search; use only the part of the subject. 
*/ + Blast_GetCompositionRange(&left, &right, subject_data, length, + start, finish); + } + Blast_ReadAaComposition(subject_composition, &subject_data[left], + right - left); +} + + +/** + * Compute an evalue from a score and a set of statistical parameters + */ +static double +s_EvalueFromScore(int score, double Lambda, double logK, double searchsp) +{ + return searchsp * exp(-(Lambda * score) + logK); +} + + +/** + * The number of bits by which the score of a previously computed + * alignment must exceed the score of the HSP under consideration for + * a containment relationship to be reported by the isContained + * routine. */ +#define KAPPA_BIT_TOL 2.0 + + +#define KAPPA_CONTAINED_IN_HSP(a,b,c,d,e,f) \ +((a <= c && b >= c) && (d <= f && e >= f)) +#define KAPPA_SIGN(a) ((a > 0) ? 1 : ((a < 0) ? -1 : 0)) +/** + * Return true if an alignment is contained in a previously-computed + * alignment of sufficiently high score. + * + * @param in_align the alignment to be tested + * @param alignments list of alignments + * @param lambda Karlin-Altschul statistical parameter + */ +static Boolean +s_IsContained(BlastCompo_Alignment * in_align, + BlastCompo_Alignment * alignments, + double lambda) +{ + BlastCompo_Alignment * align; /* represents the current alignment + in the main loop */ + /* Endpoints of the alignment */ + int query_offset = in_align->queryStart; + int query_end = in_align->queryEnd; + int subject_offset = in_align->matchStart; + int subject_end = in_align->matchEnd; + double score = in_align->score; + double scoreThresh = score + KAPPA_BIT_TOL * LOCAL_LN2/lambda; + + for (align = alignments; align != NULL; align = align->next ) { + /* for all elements of alignments */ + if (KAPPA_SIGN(in_align->frame) == KAPPA_SIGN(align->frame)) { + /* hsp1 and hsp2 are in the same query/subject frame */ + if (KAPPA_CONTAINED_IN_HSP + (align->queryStart, align->queryEnd, query_offset, + align->matchStart, align->matchEnd, subject_offset) && + KAPPA_CONTAINED_IN_HSP + 
(align->queryStart, align->queryEnd, query_end, + align->matchStart, align->matchEnd, subject_end) && + scoreThresh <= align->score) { + return 1; + } + } + } + return 0; +} + + +/** Free a set of Blast_RedoAlignParams */ +void +Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams) +{ + if (*pparams != NULL) { + Blast_MatrixInfoFree(&(*pparams)->matrix_info); + free((*pparams)->gapping_params); + free(*pparams); + *pparams = NULL; + } +} + +/** Create new Blast_RedoAlignParams object. The parameters of this + * function correspond directly to the fields of + * Blast_RedoAlignParams. The new Blast_RedoAlignParams object takes + * possession of *pmatrix_info and *pgapping_params, so these values + * are set to NULL on exit. */ +Blast_RedoAlignParams * +Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info, + BlastCompo_GappingParams ** pgapping_params, + int adjustParameters, int positionBased, + int subject_is_translated, + int ccat_query_length, int cutoff_s, + double cutoff_e, int do_link_hsps, double Lambda, + double logK, + const Blast_RedoAlignCallbacks * callbacks) +{ + Blast_RedoAlignParams * params = malloc(sizeof(Blast_RedoAlignParams)); + if (params) { + params->matrix_info = *pmatrix_info; + *pmatrix_info = NULL; + params->gapping_params = *pgapping_params; + *pgapping_params = NULL; + + params->adjustParameters = adjustParameters; + params->positionBased = positionBased; + params->RE_pseudocounts = kReMatrixAdjustmentPseudocounts; + params->subject_is_translated = subject_is_translated; + params->ccat_query_length = ccat_query_length; + params->cutoff_s = cutoff_s; + params->cutoff_e = cutoff_e; + params->do_link_hsps = do_link_hsps; + params->Lambda = Lambda; + params->logK = logK; + params->callbacks = callbacks; + } else { + free(*pmatrix_info); *pmatrix_info = NULL; + free(*pgapping_params); *pgapping_params = NULL; + } + return params; +} + + +/** + * Recompute all alignments for one query/subject pair using + * composition-based 
statistics or composition-based matrix adjustment. + * + * @param alignments an array of lists containing the newly + * computed alignments. There is one array + * element for each query in the original + * search + * @param params parameters used to redo the alignments + * @param incoming_aligns a list of existing alignments + * @param hspcnt length of incoming_aligns + * @param matchingSeq the database sequence + * @param ccat_query_length the length of the concatenated query + * @param query information about all queries + * @param numQueries the number of queries + * @param matrix the scoring matrix + * @param NRrecord a workspace used to adjust the composition. + * + * @return 0 on success, -1 on out-of-memory + */ +int +Blast_RedoOneMatch(BlastCompo_Alignment ** alignments, + Blast_RedoAlignParams * params, + BlastCompo_Alignment * incoming_aligns, int hspcnt, + BlastCompo_MatchingSequence * matchingSeq, + int ccat_query_length, BlastCompo_QueryInfo query_info[], + int numQueries, int ** matrix, + Blast_CompositionWorkspace * NRrecord) +{ + int status = 0; /* return status */ + s_WindowInfo **windows; /* array of windows */ + int nWindows; /* length of windows */ + int window_index; /* loop index */ + int query_index; /* index of the current query */ + /* which mode of composition adjustment is actually used? 
*/ + ECompoAdjustModes whichMode = eNoCompositionAdjustment; + + /* fields of params, as local variables */ + Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info; + int adjustParameters = params->adjustParameters; + int positionBased = params->positionBased; + int RE_rule = params->adjustParameters - 1; + int RE_pseudocounts = params->RE_pseudocounts; + int subject_is_translated = params->subject_is_translated; + double Lambda = params->Lambda; + BlastCompo_GappingParams * gapping_params = params->gapping_params; + const Blast_RedoAlignCallbacks * callbacks = params->callbacks; + + assert(adjustParameters < 2 || !positionBased); + for (query_index = 0; query_index < numQueries; query_index++) { + alignments[query_index] = NULL; + } + status = + s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries, + kWindowBorder, matchingSeq->length, &windows, + &nWindows, subject_is_translated); + if (status != 0) { + goto function_level_cleanup; + } + /* for all windows */ + for (window_index = 0; window_index < nWindows; window_index++) { + s_WindowInfo * window; /* the current window */ + BlastCompo_Alignment * in_align; /* the current alignment */ + int hsp_index; /* index of the current alignment */ + /* data for the current window */ + BlastCompo_SequenceData subject = {0,}; + BlastCompo_SequenceData * query; /* query data for this window */ + /* the composition of this query */ + Blast_AminoAcidComposition * query_composition; + + window = windows[window_index]; + status = + callbacks->get_range(matchingSeq, &window->subject_range, + &subject); + if (status != 0) { + goto window_index_loop_cleanup; + } + /* for all alignments in this window */ + for (in_align = window->align, hsp_index = 0; + in_align != NULL; + in_align = in_align->next, hsp_index++) { + query_index = in_align->queryIndex; + query = &query_info[query_index].seq; + query_composition = &query_info[query_index].composition; + /* if in_align is not contained in a higher-scoring + * 
alignment */ + if ( !s_IsContained(in_align, alignments[query_index], Lambda) ) { + BlastCompo_Alignment * newAlign; /* the new alignment */ + /* adjust_search_failed is true only if Blast_AdjustScores + * is called and returns a nonzero value */ + int adjust_search_failed = 0; + if (adjustParameters && + (subject_is_translated || hsp_index == 0)) { + Blast_AminoAcidComposition subject_composition; + s_GetSubjectComposition(&subject_composition, + &subject, + &window->subject_range, + in_align); + adjust_search_failed = + Blast_AdjustScores(matrix, query_composition, + query->length, + &subject_composition, + subject.length, + scaledMatrixInfo, RE_rule, + RE_pseudocounts, NRrecord, + &whichMode, + callbacks->calc_lambda); + if (adjust_search_failed < 0) { /* fatal error */ + status = adjust_search_failed; + goto window_index_loop_cleanup; + } + } + if ( !adjust_search_failed ) { + newAlign = + callbacks-> + redo_one_alignment(in_align, whichMode, + query, &window->query_range, + ccat_query_length, + &subject, &window->subject_range, + matchingSeq->length, + gapping_params); + s_WithDistinctEnds(&newAlign, &alignments[query_index], + callbacks->free_align_traceback); + } + } /* end if in_align is not contained...*/ + } /* end for all alignments in this window */ +window_index_loop_cleanup: + if (subject.data != NULL) + s_SequenceDataRelease(&subject); + if (status != 0) + goto function_level_cleanup; + } /* end for all windows */ +function_level_cleanup: + if (status != 0) { + for (query_index = 0; query_index < numQueries; query_index++) { + BlastCompo_AlignmentsFree(&alignments[query_index], + callbacks->free_align_traceback); + } + } + for (window_index = 0; window_index < nWindows; window_index++) { + s_WindowInfoFree(&windows[window_index]); + } + free(windows); + + return status; +} + + +/** + * Recompute all alignments for one query/subject pair using the + * Smith-Waterman algorithm and possibly also composition-based + * statistics or composition-based 
matrix adjustment. + * + * @param alignments an array of lists containing the newly + * computed alignments. There is one array + * element for each query in the original + * search + * @param params parameters used to redo the alignments + * @param incoming_aligns a list of existing alignments + * @param hspcnt length of incoming_aligns + * @param matchingSeq the database sequence + * @param query information about all queries + * @param numQueries the number of queries + * @param matrix the scoring matrix + * @param NRrecord a workspace used to adjust the composition. + * @param forbidden a workspace used to hold forbidden ranges + * for the Smith-Waterman algorithm. + * @param significantMatches an array of heaps of alignments for + * query-subject pairs that have already + * been redone; used to terminate the + * Smith-Waterman algorithm early if it is + * clear that the current match is not + * significant enough to be saved. + * + * @return 0 on success, -1 on out-of-memory + */ +int +Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments, + Blast_RedoAlignParams * params, + BlastCompo_Alignment * incoming_aligns, + int hspcnt, + BlastCompo_MatchingSequence * matchingSeq, + BlastCompo_QueryInfo query_info[], + int numQueries, + int ** matrix, + Blast_CompositionWorkspace * NRrecord, + Blast_ForbiddenRanges * forbidden, + BlastCompo_Heap * significantMatches) +{ + int status = 0; /* status return value */ + s_WindowInfo **windows = NULL; /* array of windows */ + int nWindows; /* length of windows */ + int window_index; /* loop index */ + int query_index; /* index of the current query */ + /* which mode of composition adjustment is actually used? 
*/ + ECompoAdjustModes whichMode = eNoCompositionAdjustment; + + /* fields of params, as local variables */ + Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info; + int adjustParameters = params->adjustParameters; + int positionBased = params->positionBased; + int RE_rule = params->adjustParameters - 1; + int RE_pseudocounts = params->RE_pseudocounts; + int subject_is_translated = params->subject_is_translated; + int do_link_hsps = params->do_link_hsps; + int ccat_query_length = params->ccat_query_length; + BlastCompo_GappingParams * gapping_params = params->gapping_params; + double Lambda = params->Lambda; + double logK = params->logK; + const Blast_RedoAlignCallbacks * callbacks = params->callbacks; + + int gap_open = gapping_params->gap_open; + int gap_extend = gapping_params->gap_extend; + + assert(adjustParameters < 2 || !positionBased); + for (query_index = 0; query_index < numQueries; query_index++) { + alignments[query_index] = NULL; + } + /* Find the multiple translation windows used by tblastn queries. 
*/ + status = + s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries, + kWindowBorder, matchingSeq->length, &windows, + &nWindows, subject_is_translated); + if (status != 0) + goto function_level_cleanup; + /* We are performing a Smith-Waterman alignment */ + for (window_index = 0; window_index < nWindows; window_index++) { + /* for all window */ + s_WindowInfo * window = NULL; /* the current window */ + BlastCompo_SequenceData subject = {0,}; + /* subject data for this window */ + BlastCompo_SequenceData * query; /* query data for this window */ + /* the composition of this query */ + Blast_AminoAcidComposition * query_composition; + double searchsp; /* effective search space */ + + /* adjust_search_failed is true only if Blast_AdjustScores + * is called and returns a nonzero value */ + int adjust_search_failed = FALSE; + + window = windows[window_index]; + query_index = window->query_range.context; + query = &query_info[query_index].seq; + query_composition = &query_info[query_index].composition; + searchsp = query_info[query_index].eff_search_space; + + status = callbacks->get_range(matchingSeq, &window->subject_range, + &subject); + if (status != 0) + goto window_index_loop_cleanup; + + /* For Smith-Waterman alignments, adjust the search using the + * composition of the highest scoring alignment in window */ + if (adjustParameters) { + Blast_AminoAcidComposition subject_composition; + s_GetSubjectComposition(&subject_composition, + &subject, &window->subject_range, + window->align); + adjust_search_failed = + Blast_AdjustScores(matrix, + query_composition, query->length, + &subject_composition, subject.length, + scaledMatrixInfo, + RE_rule, RE_pseudocounts, NRrecord, + &whichMode, callbacks->calc_lambda); + if (adjust_search_failed < 0) { /* fatal error */ + status = adjust_search_failed; + goto window_index_loop_cleanup; + } + } + if ( !adjust_search_failed ) { + /* BlastCompo_AdjustSearch ran without error; compute the new + alignments. 
*/ + int aSwScore; /* score computed by the + * Smith-Waterman algorithm. */ + int alignment_is_significant; /* True if the score/evalue of + * the Smith-Waterman alignment + * is significant. */ + Blast_ForbiddenRangesClear(forbidden); + do { + int matchEnd, queryEnd; /* end points of the alignments + * computed by the Smith-Waterman + * algorithm. */ + status = + Blast_SmithWatermanScoreOnly(&aSwScore, &matchEnd, + &queryEnd, + subject.data, + subject.length, + query->data, + query->length, matrix, + gap_open, gap_extend, + positionBased, + forbidden); + if (status != 0) + goto window_index_loop_cleanup; + + if (do_link_hsps) { + alignment_is_significant = aSwScore >= params->cutoff_s; + } else { + double newSwEvalue; /* evalue as computed by the + * Smith-Waterman algorithm */ + newSwEvalue = + s_EvalueFromScore(aSwScore, Lambda, logK, searchsp); + + alignment_is_significant = newSwEvalue < params->cutoff_e; + if (alignments[query_index] == NULL) { + /* this is the most significant alignment; if + * it will not be accepted, no alignments from + * this match will */ + alignment_is_significant = + alignment_is_significant && + BlastCompo_HeapWouldInsert( + &significantMatches[query_index], + newSwEvalue, aSwScore, matchingSeq->index); + } + } + if (alignment_is_significant) { + /* the redone alignment */ + BlastCompo_Alignment * newAlign; + int matchStart, queryStart; /* the start of the + * alignment in the + * match/query sequence */ + int updatedScore; /* score found by the SW + algorithm run in reverse */ + status = + Blast_SmithWatermanFindStart(&updatedScore, + &matchStart, + &queryStart, + subject.data, + subject.length, + query->data, + matrix, gap_open, + gap_extend, + matchEnd, + queryEnd, + aSwScore, + positionBased, + forbidden); + if (status != 0) { + goto window_index_loop_cleanup; + } + status = + callbacks-> + new_xdrop_align(&newAlign, &queryEnd, &matchEnd, + queryStart, matchStart, aSwScore, + query, &window->query_range, + ccat_query_length, + 
&subject, &window->subject_range, + matchingSeq->length, + gapping_params, whichMode); + if (status != 0) { + goto window_index_loop_cleanup; + } + newAlign->next = alignments[query_index]; + alignments[query_index] = newAlign; + + if (window->hspcnt > 1) { + /* We may compute more alignments; make the range + of the current alignment forbidden */ + status = + Blast_ForbiddenRangesPush(forbidden, + queryStart, queryEnd, + matchStart, matchEnd); + } + if (status != 0) { + goto window_index_loop_cleanup; + } + } + /* end if the next local alignment is significant */ + } while (alignment_is_significant && window->hspcnt > 1); + /* end do..while the next local alignment is significant, and + * the original blast search found more than one alignment. */ + } /* end if BlastCompo_AdjustSearch ran without error. */ +window_index_loop_cleanup: + if (subject.data != NULL) + s_SequenceDataRelease(&subject); + if (status != 0) + goto function_level_cleanup; + } /* end for all windows */ + +function_level_cleanup: + if (status != 0) { + for (query_index = 0; query_index < numQueries; query_index++) { + BlastCompo_AlignmentsFree(&alignments[query_index], + callbacks->free_align_traceback); + } + } + for (window_index = 0; window_index < nWindows; window_index++) { + s_WindowInfoFree(&windows[window_index]); + } + free(windows); + + return status; +} + + +/** Return true if a heuristic determines that it is unlikely to be + * worthwhile to redo a query-subject pair with the given evalue; used + * to terminate the main loop for redoing all alignments early. */ +int +BlastCompo_EarlyTermination(double evalue, + BlastCompo_Heap significantMatches[], + int numQueries) +{ + int i; + for (i = 0; i < numQueries; i++) { + if (BlastCompo_HeapFilledToCutoff(&significantMatches[i])) { + double ecutoff = significantMatches[i].ecutoff; + /* Only matches with evalue <= ethresh will be saved. 
*/ + if (evalue <= EVALUE_STRETCH * ecutoff) { + /* The evalue of this match is sufficiently small + * that we want to redo it to try to obtain an + * alignment with evalue smaller than ecutoff. */ + return FALSE; + } + } else { + return FALSE; + } + } + return TRUE; +} diff --git a/algo/blast/composition_adjustment/redo_alignment.h b/algo/blast/composition_adjustment/redo_alignment.h new file mode 100644 index 00000000..948fac72 --- /dev/null +++ b/algo/blast/composition_adjustment/redo_alignment.h @@ -0,0 +1,333 @@ +/* $Id: redo_alignment.h,v 1.1 2005/12/01 13:52:42 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file redo_alignment.h + * @author Alejandro Schaffer, E. 
Michael Gertz + * + * Definitions used to redo a set of alignments, using either + * composition matrix adjustment or the Smith-Waterman algorithm (or + * both.) + * + * Definitions with the prefix 'BlastCompo_' are primarily intended for use + * by glue code that interfaces with this module, i.e. the definitions + * need to be externally available so that glue code may be written, but + * are not intended for general use. + */ +#ifndef __REDO_ALIGNMENT__ +#define __REDO_ALIGNMENT__ + +#include <algo/blast/composition_adjustment/composition_adjustment.h> +#include <algo/blast/composition_adjustment/smith_waterman.h> +#include <algo/blast/composition_adjustment/compo_heap.h> + + +/** + * Within the composition adjustment module, an object of type + * BlastCompo_Alignment represents a distinct alignment of the query + * sequence to the current subject sequence. These objects are + * typically part of a singly linked list of distinct alignments, + * stored in the reverse of the order in which they were computed. 
+ */ +typedef struct BlastCompo_Alignment { + int score; /**< the score of this alignment */ + ECompoAdjustModes comp_adjustment_mode; /**< how the score was computed */ + int queryIndex; /**< index of the query in a concatenated query */ + int queryStart; /**< the start of the alignment in the query */ + int queryEnd; /**< one past the end of the alignment in the query */ + int matchStart; /**< the start of the alignment in the subject */ + int matchEnd; /**< one past the end of the alignment in the + subject */ + int frame; /**< the subject frame */ + void * context; /**< traceback info for a gapped alignment */ + struct BlastCompo_Alignment * next; /**< the next alignment in the + list */ +} BlastCompo_Alignment; + +NCBI_XBLAST_EXPORT +BlastCompo_Alignment * +BlastCompo_AlignmentNew(int score, + ECompoAdjustModes comp_adjustment_mode, + int queryIndex, int queryStart, int queryEnd, + int matchStart, int matchEnd, int frame, + void * context); + +void BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign, + void (*free_context)(void*)); + +/** Parameters used to compute gapped alignments */ +struct BlastCompo_GappingParams { + int gap_open; /**< penalty for opening a gap */ + int gap_extend; /**< penalty for extending a gapped alignment by + one residue */ + int decline_align; /**< penalty for declining to align two characters */ + int x_dropoff; /**< for x-drop algorithms, once a path falls below + the best score by this (positive) amount, the + path is no longer searched */ + void * context; /**< a pointer to any additional gapping parameters + that may be needed by the calling routine. */ +}; +typedef struct BlastCompo_GappingParams BlastCompo_GappingParams; + + +/** + * BlastCompo_SequenceRange - a struct whose instances represent a range + * of data in a sequence. 
*/ +typedef struct BlastCompo_SequenceRange +{ + int begin; /**< the starting index of the range */ + int end; /**< one beyond the last item in the range */ + int context; /**< integer identifier for this window, can + indicate a translation frame or an index into a + set of sequences. */ +} BlastCompo_SequenceRange; + + +/** + * BlastCompo_SequenceData - represents a string of amino acids or nucleotides + */ +typedef struct BlastCompo_SequenceData { + Uint1 * data; /**< amino acid or nucleotide data */ + int length; /**< the length of data. For amino acid data + &data[-1] is a valid address and + data[-1] == 0. */ + Uint1 * buffer; /**< if non-nil, points to memory that + must be freed when this instance of + BlastCompo_SequenceData is deleted. */ +} BlastCompo_SequenceData; + + +/** + * A BlastCompo_MatchingSequence represents a subject sequence to be aligned + * with the query. This abstract sequence is used to hide the + * complexity associated with actually obtaining and releasing the + * data for a matching sequence, e.g. reading the sequence from a DB + * or translating it from a nucleotide sequence. + * + * We draw a distinction between a sequence itself, and strings of + * data that may be obtained from the sequence. The amino + * acid/nucleotide data is represented by an object of type + * BlastCompo_SequenceData. There may be more than one instance of + * BlastCompo_SequenceData per BlastCompo_MatchingSequence, each representing a + * different range in the sequence, or a different translation frame. 
+ */ +typedef struct BlastCompo_MatchingSequence { + Int4 length; /**< length of this matching sequence */ + Int4 index; /**< index of this sequence in the database */ + void * local_data; +} BlastCompo_MatchingSequence; + + +/** Collected information about a query */ +struct BlastCompo_QueryInfo { + int origin; /**< origin of the query in a + concatenated query */ + BlastCompo_SequenceData seq; /**< sequence data for the query */ + Blast_AminoAcidComposition composition; /**< the composition of + the query */ + double eff_search_space; /**< effective search space of searches + involving this query */ +}; +typedef struct BlastCompo_QueryInfo BlastCompo_QueryInfo; + + +/** Callbacks **/ + +/** Function type: calculate the statistical parameter Lambda from a + * set of score probabilities. + * + * @param probs an array of score probabilities + * @param min_score the score corresponding to probs[0] + * @param max_score the largest score in the probs array + * @param lambda0 an initial guess for Lambda + * @return Lambda + */ +typedef double +calc_lambda_type(double * probs, int min_score, int max_score, + double lambda0); + +/** + * Function type: Get a range of data for a sequence. 
+ * + * @param sequence a sequence + * @param range the range to get + * @param data the data obtained + */ +typedef int +get_range_type(const BlastCompo_MatchingSequence * sequence, + const BlastCompo_SequenceRange * range, + BlastCompo_SequenceData * data); + +/** + * Function type: Calculate the traceback for one alignment by + * performing an x-drop alignment in both directions + * + * @param in_align the existing alignment, without traceback + * @param whichMode which mode of composition adjustment has + * been used to adjust the scoring matrix + * @param query_data query sequence data + * @param query_range range of this query in the concatenated + * query + * @param ccat_query_length total length of the concatenated query + * @param subject_data subject sequence data + * @param subject_range range of subject_data in the translated + * query, in amino acid coordinates + * @param full_subject_length length of the full subject sequence + * @param gapping_params parameters used to compute gapped + * alignments + */ +typedef BlastCompo_Alignment * +redo_one_alignment_type(BlastCompo_Alignment * in_align, + ECompoAdjustModes whichMode, + BlastCompo_SequenceData * query_data, + BlastCompo_SequenceRange * query_range, + int ccat_query_length, + BlastCompo_SequenceData * subject_data, + BlastCompo_SequenceRange * subject_range, + int full_subject_length, + BlastCompo_GappingParams * gapping_params); + +/** + * Function type: Calculate the traceback for one alignment by + * performing an x-drop alignment in the forward direction, possibly + * increasing the x-drop parameter until the desired score is + * attained. + * + * The start, end and score of the alignment should be obtained + * using the Smith-Waterman algorithm before this routine is called. + * + * @param *palign the new alignment + * @param *pqueryEnd on entry, the end of the alignment in the + * query, as computed by the Smith-Waterman + * algorithm. 
On exit, the end as computed by + * the x-drop algorithm + * @param *pmatchEnd like as *pqueryEnd, but for the subject + * sequence + * @param queryStart the starting point in the query + * @param matchStart the starting point in the subject + * @param score the score of the alignment, as computed by + * the Smith-Waterman algorithm + * @param query query sequence data + * @param query_range range of this query in the concatenated + * query + * @param ccat_query_length total length of the concatenated query + * @param subject subject sequence data + * @param subject_range range of subject_data in the translated + * query, in amino acid coordinates + * @param full_subject_length length of the full subject sequence + * @param gapping_params parameters used to compute gapped + * alignments + * @param whichMode which mode of composition adjustment has + * been used to adjust the scoring matrix + * @return 0 on success, -1 for out-of-memory error + */ +typedef int +new_xdrop_align_type(BlastCompo_Alignment **palign, + Int4 * pqueryEnd, Int4 * pmatchEnd, + Int4 queryStart, Int4 matchStart, Int4 score, + BlastCompo_SequenceData * query, + BlastCompo_SequenceRange * query_range, + Int4 ccat_query_length, + BlastCompo_SequenceData * subject, + BlastCompo_SequenceRange * subject_range, + Int4 full_subject_length, + BlastCompo_GappingParams * gapping_params, + ECompoAdjustModes whichMode); + +/** Callbacks used by Blast_RedoOneMatch and + * Blast_RedoOneMatchSmithWaterman routines */ +struct Blast_RedoAlignCallbacks { + calc_lambda_type * calc_lambda; + get_range_type * get_range; + redo_one_alignment_type * redo_one_alignment; + new_xdrop_align_type * new_xdrop_align; + void (*free_align_traceback)(void*); +}; +typedef struct Blast_RedoAlignCallbacks Blast_RedoAlignCallbacks; + +/** A parameter block for the Blast_RedoOneMatch and + * Blast_RedoOneMatchSmithWaterman routines */ +struct Blast_RedoAlignParams { + Blast_MatrixInfo * matrix_info; + BlastCompo_GappingParams * 
gapping_params; + int adjustParameters; + int positionBased; + int RE_pseudocounts; + int subject_is_translated; + int ccat_query_length; + int cutoff_s; + double cutoff_e; + int do_link_hsps; + double Lambda; + double logK; + const Blast_RedoAlignCallbacks * callbacks; +}; +typedef struct Blast_RedoAlignParams Blast_RedoAlignParams; + + +NCBI_XBLAST_EXPORT +Blast_RedoAlignParams * +Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info, + BlastCompo_GappingParams **pgapping_params, + int adjustParameters, int positionBased, + int subject_is_translated, + int ccat_query_length, int cutoff_s, + double cutoff_e, int do_link_hsps, double Lambda, + double logK, + const Blast_RedoAlignCallbacks * callbacks); + +NCBI_XBLAST_EXPORT +void Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams); + +NCBI_XBLAST_EXPORT +int Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments, + Blast_RedoAlignParams * params, + BlastCompo_Alignment * in_aligns, + int hspcnt, + BlastCompo_MatchingSequence * matchingSeq, + BlastCompo_QueryInfo query[], + int numQueries, + int ** matrix, + Blast_CompositionWorkspace * NRrecord, + Blast_ForbiddenRanges * forbidden, + BlastCompo_Heap * significantMatches); + +NCBI_XBLAST_EXPORT +int Blast_RedoOneMatch(BlastCompo_Alignment ** alignments, + Blast_RedoAlignParams * params, + BlastCompo_Alignment * incoming_aligns, + int hspcnt, + BlastCompo_MatchingSequence * matchingSeq, + int ccat_query_length, + BlastCompo_QueryInfo query[], + int numQueries, + int ** matrix, + Blast_CompositionWorkspace * NRrecord); + +NCBI_XBLAST_EXPORT +int BlastCompo_EarlyTermination(double evalue, + BlastCompo_Heap significantMatches[], + int numQueries); + +#endif diff --git a/algo/blast/composition_adjustment/smith_waterman.c b/algo/blast/composition_adjustment/smith_waterman.c new file mode 100644 index 00000000..66b7f9cf --- /dev/null +++ b/algo/blast/composition_adjustment/smith_waterman.c @@ -0,0 +1,715 @@ +/* 
=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** + * @file smith_waterman.c + * + * Routines for computing rigorous, Smith-Waterman alignments. + */ +#ifndef SKIP_DOXYGEN_PROCESSING +static char const rcsid[] = + "$Id: smith_waterman.c,v 1.1 2005/12/01 13:48:09 gertz Exp $"; +#endif /* SKIP_DOXYGEN_PROCESSING */ + +#include <algo/blast/core/ncbi_std.h> +#include <algo/blast/composition_adjustment/composition_constants.h> +#include <algo/blast/composition_adjustment/smith_waterman.h> + +/** A structure used internally by the Smith-Waterman algorithm to + * represent gaps */ +typedef struct SwGapInfo { + int noGap; + int gapExists; +} SwGapInfo; + + +/** + * Compute the score and right-hand endpoints of the locally optimal + * Smith-Waterman alignment. 
+ * + * @param *score the computed score + * @param *matchSeqEnd the right-hand end of the alignment in the + * database sequence + * @param *queryEnd the right-hand end of the alignment in the + * query sequence + * @param matchSeq the database sequence data + * @param matchSeqLength length of matchSeq + * @param query the query sequence data + * @param queryLength length of query + * @param matrix amino-acid scoring matrix + * @param gapOpen penalty for opening a gap + * @param gapExtend penalty for extending a gap by one amino acid + * @param positionSpecific determines whether matrix is position + * specific or not + */ +static int +BLbasicSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd, + const Uint1 * matchSeq, int matchSeqLength, + const Uint1 * query, int queryLength, + int **matrix, int gapOpen, int gapExtend, + int positionSpecific) +{ + int bestScore; /* best score seen so far */ + int newScore; /* score of next entry */ + int bestMatchSeqPos, bestQueryPos; /* position ending best score in + matchSeq and query sequences */ + SwGapInfo *scoreVector; /* keeps one row of the + Smith-Waterman matrix; overwrite + old row with new row */ + int *matrixRow; /* one row of score matrix */ + int newGapCost; /* cost to have a gap of one character */ + int prevScoreNoGapMatchSeq; /* score one row and column up with + no gaps */ + int prevScoreGapMatchSeq; /* score if a gap already started in + matchSeq */ + int continueGapScore; /* score for continuing a gap in query */ + int matchSeqPos, queryPos; /* positions in matchSeq and query */ + + scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo)); + if (scoreVector == NULL) { + return -1; + } + bestMatchSeqPos = 0; + bestQueryPos = 0; + bestScore = 0; + newGapCost = gapOpen + gapExtend; + for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { + scoreVector[matchSeqPos].noGap = 0; + scoreVector[matchSeqPos].gapExists = -gapOpen; + } + for (queryPos = 0; queryPos <
queryLength; queryPos++) { + if (positionSpecific) + matrixRow = matrix[queryPos]; + else + matrixRow = matrix[query[queryPos]]; + newScore = 0; + prevScoreNoGapMatchSeq = 0; + prevScoreGapMatchSeq = -(gapOpen); + for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { + /* testing scores with a gap in matchSeq, either starting a + * new gap or extending an existing gap*/ + if ((newScore = newScore - newGapCost) > + (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) + prevScoreGapMatchSeq = newScore; + /* testing scores with a gap in query, either starting a + * new gap or extending an existing gap*/ + if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > + (continueGapScore = + scoreVector[matchSeqPos].gapExists - gapExtend)) + continueGapScore = newScore; + /* compute new score extending one position in matchSeq + * and query */ + newScore = + prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; + if (newScore < 0) + newScore = 0; /*Smith-Waterman locality condition*/ + /*test two alternatives*/ + if (newScore < prevScoreGapMatchSeq) + newScore = prevScoreGapMatchSeq; + if (newScore < continueGapScore) + newScore = continueGapScore; + prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; + scoreVector[matchSeqPos].noGap = newScore; + scoreVector[matchSeqPos].gapExists = continueGapScore; + if (newScore > bestScore) { + bestScore = newScore; + bestQueryPos = queryPos; + bestMatchSeqPos = matchSeqPos; + } + } + } + free(scoreVector); + if (bestScore < 0) + bestScore = 0; + *matchSeqEnd = bestMatchSeqPos; + *queryEnd = bestQueryPos; + *score = bestScore; + + return 0; +} + + +/** + * Find the left-hand endpoints of the locally optimal Smith-Waterman + * alignment given the score and right-hand endpoints computed by + * BLbasicSmithWatermanScoreOnly. + * + * @param *score_out the score of the optimal alignment -- should + * equal score_in.
+ * @param *matchSeqStart the left-hand endpoint of the alignment in + * the database sequence + * @param *queryStart the left-hand endpoint of the alignment + * in the query sequence + * @param matchSeq the database sequence data + * @param matchSeqLength length of matchSeq + * @param query the query sequence data + * @param matrix amino-acid scoring matrix + * @param gapOpen penalty for opening a gap + * @param gapExtend penalty for extending a gap by one amino acid + * @param matchSeqEnd right-hand endpoint of the alignment in + * the database sequence + * @param queryEnd right-hand endpoint of the alignment in + * the query + * @param score_in the score of the alignment + * @param positionSpecific determines whether matrix is position + * specific or not + */ +static int +BLSmithWatermanFindStart(int *score_out, + int *matchSeqStart, int *queryStart, + const Uint1 * matchSeq, int matchSeqLength, + const Uint1 *query, + int **matrix, int gapOpen, int gapExtend, + int matchSeqEnd, int queryEnd, int score_in, + int positionSpecific) +{ + int bestScore; /* best score seen so far*/ + int newScore; /* score of next entry*/ + int bestMatchSeqPos, bestQueryPos; /*position starting best score in + matchSeq and query sequences */ + SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman + matrix; overwrite old row with new row */ + int *matrixRow; /* one row of score matrix */ + int newGapCost; /* cost to have a gap of one character */ + int prevScoreNoGapMatchSeq; /* score one row and column up + with no gaps*/ + int prevScoreGapMatchSeq; /* score if a gap already started in + matchSeq */ + int continueGapScore; /* score for continuing a gap in query */ + int matchSeqPos, queryPos; /* positions in matchSeq and query */ + + scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo)); + if (scoreVector == NULL) { + return -1; + } + bestMatchSeqPos = 0; + bestQueryPos = 0; + bestScore = 0; + newGapCost = gapOpen + gapExtend; + for (matchSeqPos = 0;
matchSeqPos < matchSeqLength; matchSeqPos++) { + scoreVector[matchSeqPos].noGap = 0; + scoreVector[matchSeqPos].gapExists = -(gapOpen); + } + for (queryPos = queryEnd; queryPos >= 0; queryPos--) { + if (positionSpecific) + matrixRow = matrix[queryPos]; + else + matrixRow = matrix[query[queryPos]]; + newScore = 0; + prevScoreNoGapMatchSeq = 0; + prevScoreGapMatchSeq = -(gapOpen); + for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) { + /* testing scores with a gap in matchSeq, either starting + * a new gap or extending an existing gap */ + if ((newScore = newScore - newGapCost) > + (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) + prevScoreGapMatchSeq = newScore; + /* testing scores with a gap in query, either starting a + * new gap or extending an existing gap */ + if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > + (continueGapScore = + scoreVector[matchSeqPos].gapExists - gapExtend)) + continueGapScore = newScore; + /* compute new score extending one position in matchSeq + * and query */ + newScore = + prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; + if (newScore < 0) + newScore = 0; /* Smith-Waterman locality condition */ + /* test two alternatives */ + if (newScore < prevScoreGapMatchSeq) + newScore = prevScoreGapMatchSeq; + if (newScore < continueGapScore) + newScore = continueGapScore; + prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; + scoreVector[matchSeqPos].noGap = newScore; + scoreVector[matchSeqPos].gapExists = continueGapScore; + if (newScore > bestScore) { + bestScore = newScore; + bestQueryPos = queryPos; + bestMatchSeqPos = matchSeqPos; + } + if (bestScore >= score_in) + break; + } + if (bestScore >= score_in) + break; + } + free(scoreVector); + if (bestScore < 0) + bestScore = 0; + *matchSeqStart = bestMatchSeqPos; + *queryStart = bestQueryPos; + *score_out = bestScore; + + return 0; +} + + +/** + * Compute the score and right-hand endpoints of the locally optimal + * Smith-Waterman 
alignment, subject to the restriction that some + * ranges are forbidden. + * + * @param *score the computed score + * @param *matchSeqEnd the right-hand end of the alignment in the + * database sequence + * @param *queryEnd the right-hand end of the alignment in the + * query sequence + * @param matchSeq the database sequence data + * @param matchSeqLength length of matchSeq + * @param query the query sequence data + * @param queryLength length of query + * @param matrix amino-acid scoring matrix + * @param gapOpen penalty for opening a gap + * @param gapExtend penalty for extending a gap by one amino acid + * @param numForbidden number of forbidden ranges [in] + * @param forbiddenRanges lists areas that should not be aligned [in] + * @param positionSpecific determines whether matrix is position + * specific or not + */ +static int +BLspecialSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd, + const Uint1 * matchSeq, int matchSeqLength, + const Uint1 *query, int queryLength, + int **matrix, int gapOpen, int gapExtend, + const int *numForbidden, + int ** forbiddenRanges, + int positionSpecific) +{ + int bestScore; /* best score seen so far */ + int newScore; /* score of next entry*/ + int bestMatchSeqPos, bestQueryPos; /*position ending best score in + matchSeq and database sequences */ + SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman + matrix overwrite old row with new row */ + int *matrixRow; /* one row of score matrix */ + int newGapCost; /* cost to have a gap of one character */ + int prevScoreNoGapMatchSeq; /* score one row and column up + with no gaps*/ + int prevScoreGapMatchSeq; /* score if a gap already started in + matchSeq */ + int continueGapScore; /* score for continuing a gap in query */ + int matchSeqPos, queryPos; /* positions in matchSeq and query */ + int forbidden; /* is this position forbidden? 
*/ + int f; /* index over forbidden positions */ + + scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo)); + if (scoreVector == NULL) { + return -1; + } + bestMatchSeqPos = 0; + bestQueryPos = 0; + bestScore = 0; + newGapCost = gapOpen + gapExtend; + for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { + scoreVector[matchSeqPos].noGap = 0; + scoreVector[matchSeqPos].gapExists = -(gapOpen); + } + for (queryPos = 0; queryPos < queryLength; queryPos++) { + if (positionSpecific) + matrixRow = matrix[queryPos]; + else + matrixRow = matrix[query[queryPos]]; + newScore = 0; + prevScoreNoGapMatchSeq = 0; + prevScoreGapMatchSeq = -(gapOpen); + for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { + /* testing scores with a gap in matchSeq, either starting + * a new gap or extending an existing gap */ + if ((newScore = newScore - newGapCost) > + (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) + prevScoreGapMatchSeq = newScore; + /* testing scores with a gap in query, either starting a + * new gap or extending an existing gap */ + if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > + (continueGapScore = + scoreVector[matchSeqPos].gapExists - gapExtend)) + continueGapScore = newScore; + /* compute new score extending one position in matchSeq + * and query */ + forbidden = FALSE; + for (f = 0; f < numForbidden[queryPos]; f++) { + if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) && + (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) { + forbidden = TRUE; + break; + } + } + if (forbidden) + newScore = COMPO_SCORE_MIN; + else + newScore = + prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; + if (newScore < 0) + newScore = 0; /* Smith-Waterman locality condition */ + /* test two alternatives */ + if (newScore < prevScoreGapMatchSeq) + newScore = prevScoreGapMatchSeq; + if (newScore < continueGapScore) + newScore = continueGapScore; + prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; + 
scoreVector[matchSeqPos].noGap = newScore; + scoreVector[matchSeqPos].gapExists = continueGapScore; + if (newScore > bestScore) { + bestScore = newScore; + bestQueryPos = queryPos; + bestMatchSeqPos = matchSeqPos; + } + } + } + free(scoreVector); + if (bestScore < 0) + bestScore = 0; + *matchSeqEnd = bestMatchSeqPos; + *queryEnd = bestQueryPos; + *score = bestScore; + + return 0; +} + + +/** + * Find the left-hand endpoints of the locally optimal Smith-Waterman + * alignment, subject to the restriction that certain ranges may not + * be aligned, given the score and right-hand endpoints computed by + * BLspecialSmithWatermanScoreOnly. + * + * @param *score_out the score of the optimal alignment -- should + * equal score_in. + * @param *matchSeqStart the left-hand endpoint of the alignment in + * the database sequence + * @param *queryStart the left-hand endpoint of the alignment + * in the query sequence + * @param matchSeq the database sequence data + * @param matchSeqLength length of matchSeq + * @param query the query sequence data + * @param matrix amino-acid scoring matrix + * @param gapOpen penalty for opening a gap + * @param gapExtend penalty for extending a gap by one amino acid + * @param matchSeqEnd right-hand endpoint of the alignment in + * the database sequence + * @param queryEnd right-hand endpoint of the alignment in + * the query + * @param score_in the score of the alignment + * @param numForbidden number of forbidden ranges for each query position + * @param forbiddenRanges for each query position, the ranges of database positions that should not be aligned + * @param positionSpecific determines whether matrix is position + * specific or not + */ +static int +BLspecialSmithWatermanFindStart(int * score_out, + int *matchSeqStart, int *queryStart, + const Uint1 * matchSeq, int matchSeqLength, + const Uint1 *query, int **matrix, + int gapOpen, int gapExtend, int matchSeqEnd, + int queryEnd, int score_in, + const int *numForbidden, + int ** forbiddenRanges, + int positionSpecific) +{ + int bestScore; /* best score
seen so far */ + int newScore; /* score of next entry */ + int bestMatchSeqPos, bestQueryPos; /* position starting best score in + matchSeq and database sequences */ + SwGapInfo *scoreVector; /* keeps one row of the + Smith-Waterman matrix; overwrite + old row with new row*/ + int *matrixRow; /* one row of score matrix */ + int newGapCost; /* cost to have a gap of one character */ + int prevScoreNoGapMatchSeq; /* score one row and column up + with no gaps*/ + int prevScoreGapMatchSeq; /* score if a gap already started in + matchSeq */ + int continueGapScore; /* score for continuing a gap in query */ + int matchSeqPos, queryPos; /* positions in matchSeq and query */ + int forbidden; /* is this position forbidden? */ + int f; /* index over forbidden positions */ + + scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo)); + if (scoreVector == NULL) { + return -1; + } + bestMatchSeqPos = 0; + bestQueryPos = 0; + bestScore = 0; + newGapCost = gapOpen + gapExtend; + for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { + scoreVector[matchSeqPos].noGap = 0; + scoreVector[matchSeqPos].gapExists = -(gapOpen); + } + for (queryPos = queryEnd; queryPos >= 0; queryPos--) { + if (positionSpecific) + matrixRow = matrix[queryPos]; + else + matrixRow = matrix[query[queryPos]]; + newScore = 0; + prevScoreNoGapMatchSeq = 0; + prevScoreGapMatchSeq = -(gapOpen); + for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) { + /* testing scores with a gap in matchSeq, either starting a + * new gap or extending an existing gap*/ + if ((newScore = newScore - newGapCost) > + (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) + prevScoreGapMatchSeq = newScore; + /* testing scores with a gap in query, either starting a + * new gap or extending an existing gap*/ + if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > + (continueGapScore = + scoreVector[matchSeqPos].gapExists - gapExtend)) + continueGapScore = newScore; + /* compute new 
score extending one position in matchSeq + * and query */ + forbidden = FALSE; + for (f = 0; f < numForbidden[queryPos]; f++) { + if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) && + (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) { + forbidden = TRUE; + break; + } + } + if (forbidden) + newScore = COMPO_SCORE_MIN; + else + newScore = + prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; + if (newScore < 0) + newScore = 0; /* Smith-Waterman locality condition */ + /* test two alternatives */ + if (newScore < prevScoreGapMatchSeq) + newScore = prevScoreGapMatchSeq; + if (newScore < continueGapScore) + newScore = continueGapScore; + prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; + scoreVector[matchSeqPos].noGap = newScore; + scoreVector[matchSeqPos].gapExists = continueGapScore; + if (newScore > bestScore) { + bestScore = newScore; + bestQueryPos = queryPos; + bestMatchSeqPos = matchSeqPos; + } + if (bestScore >= score_in) + break; + } + if (bestScore >= score_in) + break; + } + free(scoreVector); + if (bestScore < 0) + bestScore = 0; + *matchSeqStart = bestMatchSeqPos; + *queryStart = bestQueryPos; + *score_out = bestScore; + + return 0; +} + + +/** + * Release the storage associated with the fields of self, but do not + * delete self + * + * @param self an instance of Blast_ForbiddenRanges [in][out] + */ +void +Blast_ForbiddenRangesRelease(Blast_ForbiddenRanges * self) +{ + int f; + if (self->ranges) { + for (f = 0; f < self->capacity; f++) free(self->ranges[f]); + } + free(self->ranges); self->ranges = NULL; + free(self->numForbidden); self->numForbidden = NULL; +} + + +/** + * Initialize a new, empty Blast_ForbiddenRanges + * + * @param self object to be initialized + * @param capacity the number of ranges that may be stored + * (must be at least as long as the length + * of the query) + */ +int +Blast_ForbiddenRangesInitialize(Blast_ForbiddenRanges * self, + int capacity) +{ + int f; + self->capacity = capacity; + self->numForbidden = 
NULL; + self->ranges = NULL; + self->isEmpty = TRUE; + + self->numForbidden = (int *) calloc(capacity, sizeof(int)); + if (self->numForbidden == NULL) + goto error_return; + self->ranges = (int **) calloc(capacity, sizeof(int *)); + if (self->ranges == NULL) + goto error_return; + for (f = 0; f < capacity; f++) { + self->numForbidden[f] = 0; + self->ranges[f] = (int *) malloc(2 * sizeof(int)); + if (self->ranges[f] == NULL) + goto error_return; + self->ranges[f][0] = 0; + self->ranges[f][1] = 0; + } + return 0; +error_return: + Blast_ForbiddenRangesRelease(self); + return -1; +} + + +/** Reset self to be empty */ +void +Blast_ForbiddenRangesClear(Blast_ForbiddenRanges * self) +{ + int f; + for (f = 0; f < self->capacity; f++) { + self->numForbidden[f] = 0; + } + self->isEmpty = TRUE; +} + + +/** Add some ranges to self + * @param self an instance of Blast_ForbiddenRanges [in][out] + * @param queryStart start of the alignment in the query sequence + * @param queryEnd one past the end of the alignment in the query sequence + * @param matchStart start of the alignment in the subject sequence + * @param matchEnd end of the alignment in the subject sequence, + * stored as the upper bound of the forbidden range + */ +int +Blast_ForbiddenRangesPush(Blast_ForbiddenRanges * self, + int queryStart, + int queryEnd, + int matchStart, + int matchEnd) +{ + int f; + for (f = queryStart; f < queryEnd; f++) { + int last = 2 * self->numForbidden[f]; + if (0 != last) { /* we must resize the array */ + int * new_ranges = + realloc(self->ranges[f], (last + 2) * sizeof(int)); + if (new_ranges == NULL) + return -1; + self->ranges[f] = new_ranges; + } + self->ranges[f][last] = matchStart; + self->ranges[f][last + 1] = matchEnd; + + self->numForbidden[f]++; + } + self->isEmpty = FALSE; + + return 0; +} + + +/** + * Calls BLbasicSmithWatermanScoreOnly if forbiddenRanges is empty and + * calls BLspecialSmithWatermanScoreOnly otherwise.
See + * BLspecialSmithWatermanScoreOnly for the meaning of the parameters + * to this routine. + */ +int +Blast_SmithWatermanScoreOnly(int *score, + int *matchSeqEnd, int *queryEnd, + const Uint1 * subject_data, int subject_length, + const Uint1 * query_data, int query_length, + int **matrix, + int gapOpen, + int gapExtend, + int positionSpecific, + const Blast_ForbiddenRanges * forbiddenRanges ) +{ + if (forbiddenRanges->isEmpty) { + return BLbasicSmithWatermanScoreOnly(score, matchSeqEnd, + queryEnd, subject_data, + subject_length, + query_data, query_length, + matrix, gapOpen, + gapExtend, + positionSpecific); + } else { + return BLspecialSmithWatermanScoreOnly(score, matchSeqEnd, + queryEnd, subject_data, + subject_length, + query_data, + query_length, matrix, + gapOpen, gapExtend, + forbiddenRanges->numForbidden, + forbiddenRanges->ranges, + positionSpecific); + } +} + + +/** + * Calls BLSmithWatermanFindStart if forbiddenRanges is empty and + * calls BLspecialSmithWatermanFindStart otherwise. See + * BLspecialSmithWatermanFindStart for the meaning of the parameters + * to this routine. 
+ */ +int +Blast_SmithWatermanFindStart(int * score_out, + int *matchSeqStart, + int *queryStart, + const Uint1 * subject_data, int subject_length, + const Uint1 * query_data, + int **matrix, + int gapOpen, + int gapExtend, + int matchSeqEnd, + int queryEnd, + int score_in, + int positionSpecific, + const Blast_ForbiddenRanges * forbiddenRanges) +{ + if (forbiddenRanges->isEmpty) { + return BLSmithWatermanFindStart(score_out, matchSeqStart, + queryStart, subject_data, + subject_length, query_data, + matrix, gapOpen, gapExtend, + matchSeqEnd, queryEnd, + score_in, positionSpecific); + } else { + return BLspecialSmithWatermanFindStart(score_out, + matchSeqStart, + queryStart, + subject_data, + subject_length, + query_data, matrix, + gapOpen, gapExtend, + matchSeqEnd, queryEnd, + score_in, + forbiddenRanges->numForbidden, + forbiddenRanges->ranges, + positionSpecific); + } +} diff --git a/algo/blast/composition_adjustment/smith_waterman.h b/algo/blast/composition_adjustment/smith_waterman.h new file mode 100644 index 00000000..b206c4bd --- /dev/null +++ b/algo/blast/composition_adjustment/smith_waterman.h @@ -0,0 +1,103 @@ +/* $Id: smith_waterman.h,v 1.1 2005/12/01 13:52:20 gertz Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. 
+ * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * ===========================================================================*/ +/** + * @file smith_waterman.h + * @author Alejandro Schaffer, E. Michael Gertz + * + * Definitions for computing Smith-Waterman alignments + */ +#ifndef __SMITH_WATERMAN__ +#define __SMITH_WATERMAN__ + +#include <algo/blast/core/blast_export.h> +#include <algo/blast/core/ncbi_std.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * An instance of Blast_ForbiddenRanges is used by the Smith-Waterman + * algorithm to represent ranges in the database that are not to be + * aligned. + */ +typedef struct Blast_ForbiddenRanges { + int isEmpty; /**< True if there are no forbidden ranges */ + int *numForbidden; /**< how many forbidden ranges at each + query position */ + int **ranges; /**< forbidden database ranges for each query + position */ + int capacity; /**< length of the query sequence */ +} Blast_ForbiddenRanges; + +NCBI_XBLAST_EXPORT +int Blast_ForbiddenRangesInitialize(Blast_ForbiddenRanges * self, + int capacity); + +NCBI_XBLAST_EXPORT +void Blast_ForbiddenRangesClear(Blast_ForbiddenRanges * self); + +NCBI_XBLAST_EXPORT +int Blast_ForbiddenRangesPush(Blast_ForbiddenRanges * self, + int queryStart, int queryEnd, + int matchStart, int matchEnd); + +NCBI_XBLAST_EXPORT +void Blast_ForbiddenRangesRelease(Blast_ForbiddenRanges * self); + +NCBI_XBLAST_EXPORT +int Blast_SmithWatermanFindStart(int * score_out, + int *matchSeqStart, + int *queryStart, + const Uint1 * subject_data, + int subject_length, + const Uint1 * query_data, + int **matrix, + int gapOpen, + int gapExtend, + int matchSeqEnd, + int
queryEnd, + int score_in, + int positionSpecific, + const Blast_ForbiddenRanges * + forbiddenRanges); + +NCBI_XBLAST_EXPORT +int Blast_SmithWatermanScoreOnly(int *score, + int *matchSeqEnd, int *queryEnd, + const Uint1 * subject_data, + int subject_length, + const Uint1 * query_data, + int query_length, int **matrix, + int gapOpen, int gapExtend, + int positionSpecific, + const Blast_ForbiddenRanges * + forbiddenRanges); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/blast/core/aa_ungapped.c b/algo/blast/core/aa_ungapped.c index f7544f3a..e1514540 100644 --- a/algo/blast/core/aa_ungapped.c +++ b/algo/blast/core/aa_ungapped.c @@ -1,4 +1,4 @@ -/* $Id: aa_ungapped.c,v 1.44 2005/04/06 13:42:01 camacho Exp $ +/* $Id: aa_ungapped.c,v 1.45 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -30,7 +30,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: aa_ungapped.c,v 1.44 2005/04/06 13:42:01 camacho Exp $"; + "$Id: aa_ungapped.c,v 1.45 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/aa_ungapped.h> diff --git a/algo/blast/core/aa_ungapped.h b/algo/blast/core/aa_ungapped.h index a3803d27..20580d57 100644 --- a/algo/blast/core/aa_ungapped.h +++ b/algo/blast/core/aa_ungapped.h @@ -1,4 +1,4 @@ -/* $Id: aa_ungapped.h,v 1.23 2005/03/28 21:22:50 dondosha Exp $ +/* $Id: aa_ungapped.h,v 1.24 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_def.h b/algo/blast/core/blast_def.h index 5a2f322d..b7bfebba 100644 --- a/algo/blast/core/blast_def.h +++ b/algo/blast/core/blast_def.h @@ -1,4 +1,4 @@ -/* $Id: blast_def.h,v 1.61 2005/06/27 17:58:05 camacho Exp $ +/* $Id: blast_def.h,v 1.64 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. 
This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -122,24 +121,29 @@ typedef struct BlastSeqLoc { /** Structure for keeping the query masking information */ typedef struct BlastMaskLoc { - Int4 total_size; /**< Total size of the BlastSeqLoc array below. Inside the - engine equal to number of contexts in the BlastQueryInfo - structure. For lower case mask in a translated search, - total size is at first equal to number of query - sequences, but then expanded to number of contexts - (total number of translated frames), i.e. 6 times number - of queries. */ - BlastSeqLoc** seqloc_array; /**< array of mask locations. */ + /** Total size of the BlastSeqLoc array below. This is always the number + of queries times the number of contexts. Note that in the case of + translated query searches, these locations must be provided in protein + coordinates to BLAST_MainSetUp. + @sa BLAST_GetNumberOfContexts + @sa BlastMaskLocDNAToProtein + */ + Int4 total_size; + + /** Array of masked locations. + Every query is allocated the number of contexts associated with the + program. In the case of nucleotide searches, the strand(s) to search + dictate which elements of the array for a given query are filled. For + translated searches, this should also be the same (by design) but the + C toolkit API does NOT implement this, it rather fills all elements + for all queries with masked locations in protein coordinates (if any). + The C++ API does follow the convention which populates each element, only + if so dictated by the strand(s) to search for each query. + @sa BLAST_GetNumberOfContexts + */ + BlastSeqLoc** seqloc_array; } BlastMaskLoc; - -/** Encapsulates masking/filtering information.
*/ -typedef struct BlastMaskInformation { - BlastMaskLoc* filter_slp; /**< masking locations. */ - Boolean mask_at_hash; /**< if TRUE masking used only for building lookup table. */ -} BlastMaskInformation; - - /** Structure to hold a sequence. */ typedef struct BLAST_SequenceBlk { Uint1* sequence; /**< Sequence used for search (could be translation). */ diff --git a/algo/blast/core/blast_diagnostics.c b/algo/blast/core/blast_diagnostics.c index 496dee14..cec16621 100644 --- a/algo/blast/core/blast_diagnostics.c +++ b/algo/blast/core/blast_diagnostics.c @@ -1,4 +1,4 @@ -/* $Id: blast_diagnostics.c,v 1.7 2005/06/09 17:06:14 dondosha Exp $ +/* $Id: blast_diagnostics.c,v 1.8 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_diagnostics.c,v 1.7 2005/06/09 17:06:14 dondosha Exp $"; + "$Id: blast_diagnostics.c,v 1.8 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_diagnostics.h> diff --git a/algo/blast/core/blast_diagnostics.h b/algo/blast/core/blast_diagnostics.h index 997cc579..3f6cf53c 100644 --- a/algo/blast/core/blast_diagnostics.h +++ b/algo/blast/core/blast_diagnostics.h @@ -1,4 +1,4 @@ -/* $Id: blast_diagnostics.h,v 1.8 2005/01/24 14:23:05 camacho Exp $ +/* $Id: blast_diagnostics.h,v 1.9 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ diff --git a/algo/blast/core/blast_dust.c b/algo/blast/core/blast_dust.c index d9c8788e..7d311f18 100644 --- a/algo/blast/core/blast_dust.c +++ b/algo/blast/core/blast_dust.c @@ -1,4 +1,4 @@ -/* $Id: blast_dust.c,v 1.35 2005/07/21 13:52:38 camacho Exp $ +/* $Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -37,7 +37,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_dust.c,v 1.35 2005/07/21 13:52:38 camacho Exp $"; + "$Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_dust.h> @@ -331,6 +331,7 @@ dust_triplet_find (Uint1* seq_start, Int4 icur, Int4 max, Uint1* s1) static Int2 GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg) { + BlastSeqLoc* tail = NULL; /* pointer to tail of loc linked list */ if (!loc) return -1; @@ -341,7 +342,9 @@ GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg) if (nreg > 0) { Int4 i; for (i = 0; reg && i < nreg; i++) { - BlastSeqLocNew(loc, reg->from, reg->to); + /* Cache the tail of the list to avoid the overhead of traversing the + * list when appending to it */ + tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to); reg = reg->next; } } diff --git a/algo/blast/core/blast_dust.h b/algo/blast/core/blast_dust.h index b2d82570..cab3dca2 100644 --- a/algo/blast/core/blast_dust.h +++ b/algo/blast/core/blast_dust.h @@ -1,4 +1,4 @@ -/* $Id: blast_dust.h,v 1.13 2004/08/10 14:52:00 ivanov Exp $ +/* $Id: blast_dust.h,v 1.14 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_engine.c b/algo/blast/core/blast_engine.c index ae74288c..c3825ab1 100644 --- a/algo/blast/core/blast_engine.c +++ b/algo/blast/core/blast_engine.c @@ -1,4 +1,4 @@ -/* $Id: blast_engine.c,v 1.198 2005/08/15 16:11:20 dondosha Exp $ +/* $Id: blast_engine.c,v 1.203 2005/11/22 13:44:13 coulouri Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -23,8 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky - * */ /** @file blast_engine.c @@ -57,7 +55,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_engine.c,v 1.198 2005/08/15 16:11:20 dondosha Exp $"; + "$Id: blast_engine.c,v 1.203 2005/11/22 13:44:13 coulouri Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_engine.h> @@ -75,8 +73,8 @@ static char const rcsid[] = NCBI_XBLAST_EXPORT const int kBlastMajorVersion = 2; NCBI_XBLAST_EXPORT const int kBlastMinorVersion = 2; -NCBI_XBLAST_EXPORT const int kBlastPatchVersion = 12; -NCBI_XBLAST_EXPORT const char* kBlastReleaseDate = "Aug-07-2005"; +NCBI_XBLAST_EXPORT const int kBlastPatchVersion = 13; +NCBI_XBLAST_EXPORT const char* kBlastReleaseDate = "Nov-27-2005"; /** Structure to be passed to s_BlastSearchEngineCore, containing pointers to various preallocated structures and arrays. */ @@ -385,10 +383,7 @@ s_BlastSearchEngineCore(EBlastProgramType program_number, BLAST_SequenceBlk* que /* For nucleotide search, if match score is = 2, the odd scores are rounded down to the nearest even number. */ - if (program_number == eBlastTypeBlastn && - score_params->options->reward == 2) { - Blast_HSPListAdjustOddBlastnScores(hsp_list); - } + Blast_HSPListAdjustOddBlastnScores(hsp_list, score_options->gapped_calculation, gap_align->sbp); Blast_HSPListSortByScore(hsp_list); @@ -804,7 +799,6 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number, seq_arg.seq, lookup_wrap, gap_align, score_params, word_params, ext_params, hit_params, db_options, diagnostics, aux_struct, &hsp_list); - if (status) break; @@ -844,7 +838,9 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number, } /* Save the results. 
*/ - BlastHSPStreamWrite(hsp_stream, &hsp_list); + status = BlastHSPStreamWrite(hsp_stream, &hsp_list); + if (status != 0) + break; } BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg); @@ -866,15 +862,20 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number, Int2 Blast_RunPreliminarySearch(EBlastProgramType program, - BLAST_SequenceBlk* query, BlastQueryInfo* query_info, - const BlastSeqSrc* seq_src, const BlastScoringOptions* score_options, - BlastScoreBlk* sbp, LookupTableWrap* lookup_wrap, + BLAST_SequenceBlk* query, + BlastQueryInfo* query_info, + const BlastSeqSrc* seq_src, + const BlastScoringOptions* score_options, + BlastScoreBlk* sbp, + LookupTableWrap* lookup_wrap, const BlastInitialWordOptions* word_options, const BlastExtensionOptions* ext_options, const BlastHitSavingOptions* hit_options, const BlastEffectiveLengthsOptions* eff_len_options, - const PSIBlastOptions* psi_options, const BlastDatabaseOptions* db_options, - BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics) + const PSIBlastOptions* psi_options, + const BlastDatabaseOptions* db_options, + BlastHSPStream* hsp_stream, + BlastDiagnostics* diagnostics) { Int2 status = 0; BlastScoringParameters* score_params = NULL;/**< Scoring parameters */ diff --git a/algo/blast/core/blast_engine.h b/algo/blast/core/blast_engine.h index d08f94fd..7e59d99d 100644 --- a/algo/blast/core/blast_engine.h +++ b/algo/blast/core/blast_engine.h @@ -1,4 +1,4 @@ -/* $Id: blast_engine.h,v 1.51 2005/06/28 12:29:24 ivanov Exp $ +/* $Id: blast_engine.h,v 1.52 2005/08/31 17:36:28 jcherry Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -93,38 +93,6 @@ Blast_RunFullSearch(EBlastProgramType program_number, BlastHSPStream* hsp_stream, const BlastRPSInfo* rps_info, BlastDiagnostics* diagnostics, BlastHSPResults** results); -/** The high level function performing an RPS BLAST search - * @param program_number Type of BLAST program [in] - * @param 
query The query sequence [in] - * @param query_info Additional query information [in] - * @param seq_src Structure containing BLAST database [in] - * @param sbp Scoring and statistical parameters [in] - * @param score_options Hit scoring options [in] - * @param lookup_wrap The lookup table, constructed earlier [in] - * @param word_options Options for processing initial word hits [in] - * @param ext_options Options and parameters for the gapped extension [in] - * @param hit_options Options for saving the HSPs [in] - * @param eff_len_options Options for setting effective lengths [in] - * @param psi_options Options specific to PSI-BLAST [in] - * @param hsp_stream Placeholder for saving results [in] - * @param diagnostics Return statistics containing numbers of hits on - * different stages of the search [out] - * @param results Structure holding all saved results [in] [out] - */ -Int4 -BLAST_RPSSearchEngine(EBlastProgramType program_number, - BLAST_SequenceBlk* query, BlastQueryInfo* query_info, - const BlastSeqSrc* seq_src, BlastScoreBlk* sbp, - const BlastScoringOptions* score_options, - LookupTableWrap* lookup_wrap, - const BlastInitialWordOptions* word_options, - const BlastExtensionOptions* ext_options, - const BlastHitSavingOptions* hit_options, - const BlastEffectiveLengthsOptions* eff_len_options, - const PSIBlastOptions* psi_options, - BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics, - BlastHSPResults** results); - /** Perform the preliminary stage of the BLAST search. 
* @param program_number Type of BLAST program [in] * @param query The query sequence [in] diff --git a/algo/blast/core/blast_extend.c b/algo/blast/core/blast_extend.c index ec45fe4f..51eec8a4 100644 --- a/algo/blast/core/blast_extend.c +++ b/algo/blast/core/blast_extend.c @@ -1,4 +1,4 @@ -/* $Id: blast_extend.c,v 1.87 2005/06/23 19:06:04 madden Exp $ +/* $Id: blast_extend.c,v 1.90 2005/12/05 16:36:50 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,6 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -23,8 +22,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky - * */ /** @file blast_extend.c @@ -33,7 +30,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_extend.c,v 1.87 2005/06/23 19:06:04 madden Exp $"; + "$Id: blast_extend.c,v 1.90 2005/12/05 16:36:50 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_extend.h> @@ -89,6 +86,7 @@ score_compare_match(const void* v1, const void* v2) { BlastInitHSP* h1,* h2; + int result = 0; h1 = (BlastInitHSP*) v1; h2 = (BlastInitHSP*) v2; @@ -102,32 +100,19 @@ score_compare_match(const void* v1, const void* v2) else if (h2->ungapped_data == NULL) return -1; - if (h1->ungapped_data->score < h2->ungapped_data->score) - return 1; - if (h1->ungapped_data->score > h2->ungapped_data->score) - return -1; - - - /* Tie breaks: starting offset in subject; then length - * (equivalent to ending offset in subject), then starting - * offset in query. 
- */ - if (h1->ungapped_data->s_start < h2->ungapped_data->s_start) - return 1; - if (h1->ungapped_data->s_start > h2->ungapped_data->s_start ) - return -1; - - if (h1->ungapped_data->length < h2->ungapped_data->length) - return 1; - if (h1->ungapped_data->length > h2->ungapped_data->length) - return -1; + if (0 == (result = BLAST_CMP(h2->ungapped_data->score, + h1->ungapped_data->score)) && + 0 == (result = BLAST_CMP(h1->ungapped_data->s_start, + h2->ungapped_data->s_start)) && + 0 == (result = BLAST_CMP(h2->ungapped_data->length, + h1->ungapped_data->length)) && + 0 == (result = BLAST_CMP(h1->ungapped_data->q_start, + h2->ungapped_data->q_start))) { + result = BLAST_CMP(h2->ungapped_data->length, + h1->ungapped_data->length); + } - if( h1->ungapped_data->q_start < h2->ungapped_data->q_start ) - return 1; - if( h1->ungapped_data->q_start > h2->ungapped_data->q_start ) - return -1; - - return 0; + return result; } void Blast_InitHitListSortByScore(BlastInitHitList* init_hitlist) @@ -296,7 +281,7 @@ static Int2 s_NuclUngappedExtend(BLAST_SequenceBlk* query, BLAST_SequenceBlk* subject, Int4** matrix, Int4 q_off, Int4 s_off, Int4 X, - BlastUngappedData** ungapped_data) + BlastUngappedData* ungapped_data) { Uint1* q; Int4 sum, score; @@ -343,14 +328,8 @@ s_NuclUngappedExtend(BLAST_SequenceBlk* query, } } - if (ungapped_data) { - if ((*ungapped_data = (BlastUngappedData*) - malloc(sizeof(BlastUngappedData))) == NULL) - return -1; - (*ungapped_data)->q_start = q_beg - query->sequence; - (*ungapped_data)->s_start = - s_off - (q_off - (*ungapped_data)->q_start); - } + ungapped_data->q_start = q_beg - query->sequence; + ungapped_data->s_start = s_off - (q_off - ungapped_data->q_start); if (q_avail < s_avail) { sf = subject0 + (s_off + q_avail)/COMPRESSION_RATIO; @@ -381,10 +360,8 @@ s_NuclUngappedExtend(BLAST_SequenceBlk* query, base--; } - if (ungapped_data) { - (*ungapped_data)->length = q_end - q_beg; - (*ungapped_data)->score = score; - } + ungapped_data->length = q_end 
- q_beg; + ungapped_data->score = score; return 0; } @@ -428,6 +405,7 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query, Int4 diag, real_diag; Int4 s_pos; BlastUngappedData* ungapped_data; + BlastUngappedData dummy_ungapped_data; Int4 window_size = word_params->options->window_size; Boolean hit_ready; Boolean new_hit = FALSE, second_hit = FALSE; @@ -465,8 +443,9 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query, if (hit_ready) { if (word_params->options->ungapped_extension) { /* Perform ungapped extension */ + ungapped_data = &dummy_ungapped_data; s_NuclUngappedExtend(query, subject, matrix, q_off, s_off, - -word_params->x_dropoff, &ungapped_data); + -word_params->x_dropoff, ungapped_data); last_hit = ungapped_data->length + ungapped_data->s_start + diag_table->offset; @@ -474,13 +453,18 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query, ungapped_data = NULL; last_hit = s_pos; } - if (!ungapped_data || - ungapped_data->score >= word_params->cutoff_score) { + if (ungapped_data == NULL) { BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data); /* Set the "saved" flag for this hit */ hit_saved = ~LAST_HIT_MASK; + } else if (ungapped_data->score >= word_params->cutoff_score) { + BlastUngappedData *final_data = (BlastUngappedData *)malloc( + sizeof(BlastUngappedData)); + *final_data = *ungapped_data; + BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data); + /* Set the "saved" flag for this hit */ + hit_saved = ~LAST_HIT_MASK; } else { - sfree(ungapped_data); /* Unset the "saved" flag for this hit */ hit_saved = 0; } @@ -567,6 +551,7 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query, Int4 stack_top; Int4 window_size; Boolean hit_ready = FALSE, two_hits; + BlastUngappedData dummy_ungapped_data; BlastUngappedData* ungapped_data = NULL; window_size = word_params->options->window_size; @@ -607,21 +592,27 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query, if (hit_ready) { if (word_params->options->ungapped_extension) { 
/* Perform ungapped extension */ + ungapped_data = &dummy_ungapped_data; s_NuclUngappedExtend(query, subject, matrix, q_off, s_off, - -word_params->x_dropoff, &ungapped_data); + -word_params->x_dropoff, ungapped_data); last_hit = ungapped_data->length + ungapped_data->s_start; } else { ungapped_data = NULL; last_hit = s_end; } - if (!ungapped_data || - ungapped_data->score >= word_params->cutoff_score) { + if (ungapped_data == NULL) { BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data); /* Set the "saved" flag for this hit */ hit_saved = ~LAST_HIT_MASK; + } else if (ungapped_data->score >= word_params->cutoff_score) { + BlastUngappedData *final_data = (BlastUngappedData *)malloc( + sizeof(BlastUngappedData)); + *final_data = *ungapped_data; + BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data); + /* Set the "saved" flag for this hit */ + hit_saved = ~LAST_HIT_MASK; } else { - sfree(ungapped_data); /* Unset the "saved" flag for this hit */ hit_saved = 0; } @@ -669,20 +660,24 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query, hit_ready = TRUE; if (word_params->options->ungapped_extension) { /* Perform ungapped extension */ + ungapped_data = &dummy_ungapped_data; s_NuclUngappedExtend(query, subject, matrix, q_off, s_off, - -word_params->x_dropoff, &ungapped_data); + -word_params->x_dropoff, ungapped_data); stack[stack_top].level = (ungapped_data->length + ungapped_data->s_start); } else { ungapped_data = NULL; } - if (!ungapped_data || - ungapped_data->score >= word_params->cutoff_score) { - BLAST_SaveInitialHit(init_hitlist, q_off, s_off, - ungapped_data); + if (ungapped_data == NULL) { + BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data); + stack[stack_top].level |= ~LAST_HIT_MASK; + } else if (ungapped_data->score >= word_params->cutoff_score) { + BlastUngappedData *final_data = (BlastUngappedData *)malloc( + sizeof(BlastUngappedData)); + *final_data = *ungapped_data; + BLAST_SaveInitialHit(init_hitlist, q_off, s_off, 
final_data); stack[stack_top].level |= ~LAST_HIT_MASK; } else { - sfree(ungapped_data); /* Set hit length back to 0 after ungapped extension failure */ stack[stack_top].level &= LAST_HIT_MASK; diff --git a/algo/blast/core/blast_filter.c b/algo/blast/core/blast_filter.c index 255aa088..2ad89ad9 100644 --- a/algo/blast/core/blast_filter.c +++ b/algo/blast/core/blast_filter.c @@ -1,4 +1,4 @@ -/* $Id: blast_filter.c,v 1.73 2005/07/19 13:43:30 madden Exp $ +/* $Id: blast_filter.c,v 1.78 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,6 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -23,8 +22,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky - * */ /** @file blast_filter.c @@ -33,7 +30,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_filter.c,v 1.73 2005/07/19 13:43:30 madden Exp $"; + "$Id: blast_filter.c,v 1.78 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_def.h> @@ -397,75 +394,127 @@ BlastFilteringOptionsFromString(EBlastProgramType program_number, const char* in BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to) { BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc)); - SSeqRange* seq_range = (SSeqRange*) malloc(sizeof(SSeqRange)); - - seq_range->left = from; - seq_range->right = to; - loc->ssr = seq_range; - - if (head) - { - if (*head) - { - BlastSeqLoc* tmp = *head; - while (tmp->next) - tmp = tmp->next; - tmp->next = loc; - } - else - { - *head = loc; - } + if ( !loc ) { + return NULL; } - - return loc; + loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange)); + loc->ssr->left = from; + loc->ssr->right = to; + + return BlastSeqLocAppend(head, loc); } -BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc) +BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node) { - SSeqRange* seq_range; - BlastSeqLoc* next_loc; - - while (loc) { - next_loc = loc->next; - seq_range = loc->ssr; - sfree(seq_range); - sfree(loc); - loc = next_loc; - } - return NULL; + if ( !node ) { + return NULL; + } + + if (head) + { + if (*head) + { + BlastSeqLoc* tmp = *head; + while (tmp->next) + tmp = tmp->next; + tmp->next = node; + } + else + { + *head = node; + } + } + + return node; } /** Makes a copy of the BlastSeqLoc and also a copy of the * SSRange element. Does not copy BlastSeqLoc that is pointed * to by "next". 
- * @param from the object to be copied [in] + * @param source the object to be copied [in] * @return another BlastSeqLoc* */ +static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source) +{ + if ( !source ) { + return NULL; + } + ASSERT(source->ssr); + return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right); +} -static BlastSeqLoc* s_BlastSeqLocDup(BlastSeqLoc* from) +/** Prepend node to the head of the list and return the new head of the list */ +static BlastSeqLoc* s_BlastSeqLocPrepend(BlastSeqLoc* head, BlastSeqLoc* node) { - BlastSeqLoc* to; - SSeqRange* seq_range; + if ( !node ) { + return NULL; + } + node->next = head; + return node; +} - if (from == NULL) - return NULL; +/** Reverse elements in the list + * @param head pointer to pointer to the head of the list. After this call, + * this is set to NULL [in|out] + * @return the new head of the list or NULL if argument is NULL + */ +static BlastSeqLoc* s_BlastSeqLocListReverse(BlastSeqLoc** head) +{ + BlastSeqLoc* retval = NULL; /* return value */ + BlastSeqLoc* itr = NULL; /* iterator */ + + if ( !head ) { + return NULL; + } - seq_range = from->ssr; - ASSERT(seq_range); + for (itr = *head; itr; itr = itr->next) { + retval = s_BlastSeqLocPrepend(retval, s_BlastSeqLocNodeDup(itr)); + } + *head = BlastSeqLocFree(*head); + return retval; +} - to = BlastSeqLocNew(NULL, seq_range->left, seq_range->right); +BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc) +{ + if ( !loc ) { + return NULL; + } + sfree(loc->ssr); + sfree(loc); + return NULL; +} - return to; +BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc) +{ + while (loc) { + BlastSeqLoc* next_loc = loc->next; + loc = BlastSeqLocNodeFree(loc); + loc = next_loc; + } + return NULL; +} + +BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head) +{ + BlastSeqLoc* retval = NULL; + BlastSeqLoc* retval_tail = NULL; + + for (; head; head = head->next) { + retval_tail = BlastSeqLocAppend(retval_tail ? 
&retval_tail : &retval, + s_BlastSeqLocNodeDup(head)); + } + + return retval; } BlastMaskLoc* BlastMaskLocNew(Int4 total) { - BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc)); - retval->total_size = total; - if (total > 0) - retval->seqloc_array = (BlastSeqLoc **) calloc(total, sizeof(BlastSeqLoc *)); - return retval; + BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc)); + retval->total_size = total; + if (total > 0) + retval->seqloc_array = (BlastSeqLoc **) calloc(total, + sizeof(BlastSeqLoc *)); + return retval; } BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc) @@ -485,61 +534,27 @@ BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc) return NULL; } -/** Calculates length of the DNA query from the BlastQueryInfo structure that - * contains context information for translated frames for a set of queries. - * @param query_info Query information containing data for all contexts [in] - * @param query_index Which query to find DNA length for? - * @return DNA length of the query, calculated as sum of 3 protein frame lengths, - * plus 2, because 2 last nucleotide residues do not have a - * corresponding codon. 
- */ -static Int4 -s_GetTranslatedQueryDNALength(const BlastQueryInfo* query_info, Int4 query_index) -{ - Int4 start_context = NUM_FRAMES*query_index; - Int4 dna_length = 2; - Int4 index; - - /* Make sure that query index is within appropriate range, and that this is - really a translated search */ - ASSERT(query_index < query_info->num_queries); - ASSERT(start_context < query_info->last_context); - - /* If only reverse strand is searched, then forward strand contexts don't - have lengths information */ - if (query_info->contexts[start_context].query_length == 0) - start_context += 3; - - for (index = start_context; index < start_context + 3; ++index) - dna_length += query_info->contexts[index].query_length; - - return dna_length; -} - Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc, const BlastQueryInfo* query_info) { - BlastSeqLoc** prot_seqloc_array; Uint4 seq_index; if (!mask_loc) return 0; - /* Check that the number of sequences in BlastQueryInfo is the same as the - size of the DNA mask locations array in the BlastMaskLoc. */ - ASSERT(mask_loc->total_size == query_info->num_queries); - - mask_loc->total_size *= NUM_FRAMES; - prot_seqloc_array = - (BlastSeqLoc**) calloc(mask_loc->total_size, sizeof(BlastSeqLoc*)); + /* Check that the array size in BlastMaskLoc corresponds to the number + of contexts in BlastQueryInfo. 
*/ + ASSERT(mask_loc->total_size == query_info->last_context + 1); /* Loop over multiple DNA sequences */ for (seq_index = 0; seq_index < (Uint4)query_info->num_queries; ++seq_index) { - BlastSeqLoc** prot_seqloc = - &(prot_seqloc_array[NUM_FRAMES*seq_index]); - BlastSeqLoc* dna_seqloc = mask_loc->seqloc_array[seq_index]; - Int4 dna_length = s_GetTranslatedQueryDNALength(query_info, seq_index); + const Uint4 kCtxIndex = NUM_FRAMES * seq_index; + BlastSeqLoc* dna_seqloc = mask_loc->seqloc_array[kCtxIndex]; + BlastSeqLoc** prot_seqloc = &(mask_loc->seqloc_array[kCtxIndex]); + Int4 dna_length = BlastQueryInfoGetQueryLength(query_info, + eBlastTypeBlastx, + seq_index); Int4 context; /* Reproduce this mask for all 6 frames, with translated coordinates */ @@ -565,8 +580,6 @@ Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc, } BlastSeqLocFree(dna_seqloc); } - sfree(mask_loc->seqloc_array); - mask_loc->seqloc_array = prot_seqloc_array; return 0; } @@ -592,7 +605,9 @@ Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc, { Int4 frame_start = index*NUM_FRAMES; Int4 frame_index; - Int4 dna_length = s_GetTranslatedQueryDNALength(query_info, index); + Int4 dna_length = BlastQueryInfoGetQueryLength(query_info, + eBlastTypeBlastx, + index); /* Loop over all frames of one DNA sequence */ for (frame_index=frame_start; frame_index<(frame_start+NUM_FRAMES); frame_index++) { @@ -684,64 +699,48 @@ s_BlastSeqLocSort (BlastSeqLoc* list, return list; } -/* This will go in place of CombineSeqLocs to combine filtered locations */ -Int2 -CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out, - Int4 link_value) +BlastSeqLoc* +BlastSeqLocCombine(BlastSeqLoc* mask_loc, Int4 link_value) { - Int2 status=0; /* return value. */ - Int4 start, stop; /* USed to merge overlapping SeqLoc's. 
*/ - SSeqRange* ssr = NULL; - BlastSeqLoc* loc_head=NULL,* last_loc=NULL,* loc_var=NULL; - BlastSeqLoc* new_loc = NULL; + BlastSeqLoc* retval = NULL; + BlastSeqLoc* retval_tail = NULL; + Int4 start, stop; /* Used to merge overlapping SeqLoc's. */ + BlastSeqLoc* loc_head=NULL,* loc_var=NULL; - if (!mask_loc) { - *mask_loc_out = NULL; - return 0; + if ( !mask_loc ) { + return NULL; } - /* Put all the SeqLoc's into one big linked list. */ - loc_var = mask_loc; - loc_head = last_loc = s_BlastSeqLocDup(loc_var); - while (loc_var->next) - { - last_loc->next = s_BlastSeqLocDup(loc_var->next); - last_loc = last_loc->next; - loc_var = loc_var->next; + /* Copy the BlastSeqLoc-s and sort them by starting position. */ + loc_head = loc_var = s_BlastSeqLocSort(BlastSeqLocListDup(mask_loc), + s_SeqRangeSortByStartPosition); + if ( !loc_head ) { + return NULL; } - - /* Sort them by starting position. */ - loc_head = (BlastSeqLoc*) - s_BlastSeqLocSort (loc_head, s_SeqRangeSortByStartPosition); - - ssr = (SSeqRange*) loc_head->ssr; - start = ssr->left; - stop = ssr->right; - loc_var = loc_head; - ssr = NULL; - - while (loc_var) { - if (loc_var->next) - ssr = loc_var->next->ssr; - if (ssr && ((stop + link_value) > ssr->left)) { - stop = MAX(stop, ssr->right); - } else { - BlastSeqLocNew(&new_loc, start, stop); - if (loc_var->next) { - start = ssr->left; - stop = ssr->right; - } - } - loc_var = loc_var->next; - ssr = NULL; + start = loc_head->ssr->left; + stop = loc_head->ssr->right; + + for (; loc_var; loc_var = loc_var->next) { + SSeqRange* ssr = loc_var->next ? loc_var->next->ssr : NULL; + + if (ssr && ((stop + link_value) > ssr->left)) { + stop = MAX(stop, ssr->right); + } else { + /* Cache the tail of the list to avoid the overhead of traversing the + * list when appending to it */ + retval_tail = BlastSeqLocNew((retval_tail ? 
&retval_tail : &retval), + start, stop); + if (loc_var->next) { + start = ssr->left; + stop = ssr->right; + } + } } - *mask_loc_out = new_loc; - - /* Free memory allocated for the temporary list of SeqLocs */ + /* Free memory allocated for the temporary list of BlastSeqLoc-s */ BlastSeqLocFree(loc_head); - return status; + return retval; } Int2 @@ -750,8 +749,9 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number, const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask) { Int4 context; - BlastSeqLoc* loc,* last_loc = NULL,* start_loc = NULL; const Boolean kIsNucl = (program_number == eBlastTypeBlastn); + BlastSeqLoc* tail = NULL; /* Pointer to the tail of the complement_mask + linked list */ if (complement_mask == NULL) return -1; @@ -763,52 +763,39 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number, Boolean first = TRUE; /* Specifies beginning of query. */ Boolean last_interval_open=TRUE; /* if TRUE last interval needs to be closed. */ - Boolean reverse = FALSE; /* Sequence on minus strand. */ - Int4 index; /* loop index */ Int4 start_offset, end_offset, filter_start, filter_end; Int4 left=0, right; /* Used for left/right extent of a region. */ + BlastSeqLoc* loc = NULL; - start_offset = query_info->contexts[context].query_offset; - end_offset = query_info->contexts[context].query_length + start_offset - 1; - - /* For blastn: check if this strand is not searched at all */ - if (end_offset < start_offset) + if (query_info->contexts[context].query_length <= 0) { continue; - index = BlastGetMaskLocIndexFromContext(kIsNucl, context); - reverse = BlastIsReverseStrand(kIsNucl, context); - - - /* mask_loc NULL is simply the case that NULL was passed in, which we take to - mean that nothing on query is masked. 
*/ - if (mask_loc == NULL || mask_loc->seqloc_array[index] == NULL) - { - /* No masks for this context */ - if (!last_loc) - last_loc = BlastSeqLocNew(complement_mask, start_offset, end_offset); - else - last_loc = BlastSeqLocNew(&last_loc, start_offset, end_offset); + } + + start_offset = query_info->contexts[context].query_offset; + end_offset = query_info->contexts[context].query_length + + start_offset - 1; + ASSERT(start_offset <= end_offset); + + /* mask_loc NULL is simply the case that NULL was passed in, which we + take to mean that nothing on query is masked. */ + if (mask_loc == NULL || mask_loc->seqloc_array[context] == NULL) { + /* Cache the tail of the list to avoid the overhead of traversing the + * list when appending to it */ + tail = BlastSeqLocNew(tail ? &tail : complement_mask, + start_offset, end_offset); continue; } - if (reverse) { - BlastSeqLoc* prev_loc = NULL; - /* Reverse the order of the locations */ - for (start_loc = mask_loc->seqloc_array[index]; start_loc; - start_loc = start_loc->next) { - loc = s_BlastSeqLocDup(start_loc); - loc->next = prev_loc; - prev_loc = loc; - } - /* Save where this list starts, so it can be freed later */ - start_loc = loc; - } else { - loc = mask_loc->seqloc_array[index]; + if (BlastIsReverseStrand(kIsNucl, context)) { + mask_loc->seqloc_array[context] = + s_BlastSeqLocListReverse(&mask_loc->seqloc_array[context]); } + loc = mask_loc->seqloc_array[context]; first = TRUE; for ( ; loc; loc = loc->next) { SSeqRange* seq_range = loc->ssr; - if (reverse) { + if (BlastIsReverseStrand(kIsNucl, context)) { filter_start = end_offset - seq_range->right; filter_end = end_offset - seq_range->left; } else { @@ -837,10 +824,9 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number, right = filter_start - 1; - if (!last_loc) - last_loc = BlastSeqLocNew(complement_mask, left, right); - else - last_loc = BlastSeqLocNew(&last_loc, left, right); + /* Cache the tail of the list to avoid the overhead of traversing the 
+ * list when appending to it */ + tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right); if (filter_end >= end_offset) { /* last masked region at end of sequence */ last_interval_open = FALSE; @@ -850,17 +836,12 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number, } } - if (reverse) { - start_loc = BlastSeqLocFree(start_loc); - } - if (last_interval_open) { /* Need to finish SSeqRange* for last interval. */ right = end_offset; - if (!last_loc) - last_loc = BlastSeqLocNew(complement_mask, left, right); - else - last_loc = BlastSeqLocNew(&last_loc, left, right); + /* Cache the tail of the list to avoid the overhead of traversing the + * list when appending to it */ + tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right); } } return 0; @@ -868,51 +849,44 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number, Int2 -BlastSetUp_Filter(EBlastProgramType program_number, Uint1* sequence, Int4 length, - Int4 offset, const SBlastFilterOptions* filter_options, BlastSeqLoc* *seqloc_retval, - Blast_Message* *blast_message) +BlastSetUp_Filter(EBlastProgramType program_number, + Uint1* sequence, + Int4 length, + Int4 offset, + const SBlastFilterOptions* filter_options, + BlastSeqLoc** seqloc_retval, + Blast_Message* *blast_message) { - Int2 seqloc_num=0; Int2 status=0; /* return value. 
*/ - BlastSeqLoc* seg_loc = NULL; - ASSERT(filter_options); - ASSERT(seqloc_retval); + ASSERT(filter_options); + ASSERT(seqloc_retval); - *seqloc_retval = NULL; + *seqloc_retval = NULL; - status = SBlastFilterOptionsValidate(program_number, filter_options, blast_message); - if (status) - return status; + status = SBlastFilterOptionsValidate(program_number, filter_options, + blast_message); + if (status) + return status; if (filter_options->segOptions) { - SSegOptions* seg_options = filter_options->segOptions; - SegParameters* sparamsp=NULL; - - sparamsp = SegParametersNewAa(); - sparamsp->overlaps = TRUE; - if (seg_options->window > 0) - sparamsp->window = seg_options->window; - if (seg_options->locut > 0.0) - sparamsp->locut = seg_options->locut; - if (seg_options->hicut > 0.0) - sparamsp->hicut = seg_options->hicut; - - SeqBufferSeg(sequence, length, offset, sparamsp, &seg_loc); + SSegOptions* seg_options = filter_options->segOptions; + SegParameters* sparamsp=NULL; + + sparamsp = SegParametersNewAa(); + sparamsp->overlaps = TRUE; + if (seg_options->window > 0) + sparamsp->window = seg_options->window; + if (seg_options->locut > 0.0) + sparamsp->locut = seg_options->locut; + if (seg_options->hicut > 0.0) + sparamsp->hicut = seg_options->hicut; + + status = SeqBufferSeg(sequence, length, offset, sparamsp, + seqloc_retval); SegParametersFree(sparamsp); sparamsp = NULL; - seqloc_num++; - } - - if (seqloc_num) - { - BlastSeqLoc* seqloc_list=NULL; /* Holds all SeqLoc's for - return. 
*/ - if (seg_loc) - seqloc_list = seg_loc; - - *seqloc_retval = seqloc_list; } return status; @@ -938,139 +912,136 @@ BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length) } static Int2 -s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, Int4 context, EBlastProgramType program_number, const SBlastFilterOptions* filter_options, BlastSeqLoc* *filter_out, Blast_Message* *blast_message) +s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk, + const BlastQueryInfo* query_info, + Int4 context, + EBlastProgramType program_number, + const SBlastFilterOptions* filter_options, + BlastSeqLoc* *filter_out, + Blast_Message* *blast_message) { - Int2 status = 0; - Int4 query_length = 0; /* Length of query described by SeqLocPtr. */ - Int4 context_offset; - BlastSeqLoc *lcase_mask_slp = NULL; /* Auxiliary locations for lower-case masking */ - BlastSeqLoc *filter_slp = NULL; /* SeqLocPtr computed for filtering. */ - BlastSeqLoc *filter_slp_combined; /* Used to hold combined SeqLoc's */ - Uint1 *buffer; /* holds sequence for plus strand or protein. */ - - const Boolean kIsNucl = (program_number == eBlastTypeBlastn); - Int4 index = BlastGetMaskLocIndexFromContext(kIsNucl, context); - - context_offset = query_info->contexts[context].query_offset; - buffer = &query_blk->sequence[context_offset]; + Int2 status = 0; + Int4 query_length = 0; /* Length of query described by SeqLocPtr. */ + Int4 context_offset; + BlastSeqLoc *filter_slp = NULL; /* SeqLocPtr computed for filtering. */ + Uint1 *buffer; /* holds sequence for plus strand or protein. 
*/ + + const Boolean kIsNucl = (program_number == eBlastTypeBlastn); - if ((query_length = query_info->contexts[context].query_length) <= 0) - return 0; + context_offset = query_info->contexts[context].query_offset; + buffer = &query_blk->sequence[context_offset]; - if ((status = BlastSetUp_Filter(program_number, buffer, - query_length, 0, filter_options, &filter_slp, blast_message))) - return status; + if ((query_length = query_info->contexts[context].query_length) <= 0) { + return 0; + } - if (BlastIsReverseStrand(kIsNucl, context) == TRUE) - { /* Reverse this as it's on minus strand. */ - BlastSeqLoc *filter_slp_rev = BlastSeqLocReverse(filter_slp, query_length); - filter_slp = BlastSeqLocFree(filter_slp); - filter_slp = filter_slp_rev; - } + status = BlastSetUp_Filter(program_number, + buffer, + query_length, + 0, + filter_options, + &filter_slp, + blast_message); + if (status) + return status; + + if (BlastIsReverseStrand(kIsNucl, context) == TRUE) + { /* Reverse this as it's on minus strand. */ + BlastSeqLoc* tmp = BlastSeqLocReverse(filter_slp, query_length); + filter_slp = BlastSeqLocFree(filter_slp); + filter_slp = tmp; + } - /* Extract the mask locations corresponding to this query - (frame, strand), detach it from other masks. - NB: for translated search the mask locations are expected in - protein coordinates. The nucleotide locations must be converted - to protein coordinates prior to the call to BLAST_MainSetUp. - */ - lcase_mask_slp = NULL; + /* Extract the mask locations corresponding to this query + (frame, strand), detach it from other masks. + NB: for translated search the mask locations are expected in + protein coordinates. The nucleotide locations must be converted + to protein coordinates prior to the call to BLAST_MainSetUp. 
+ */ + { + /* Auxiliary locations for lower-case masking or any other masking + * which occurred outside of CORE BLAST */ + BlastSeqLoc *lcase_mask_slp = NULL; if (query_blk->lcase_mask && query_blk->lcase_mask->seqloc_array) { - lcase_mask_slp = query_blk->lcase_mask->seqloc_array[index]; - /* Set location list to NULL, to allow safe memory deallocation, - ownership transferred to filter_slp below. */ - query_blk->lcase_mask->seqloc_array[index] = NULL; - } - - /* Attach the lower case mask locations to the filter locations and combine them */ - if (lcase_mask_slp) { - if (filter_slp) { - BlastSeqLoc *loc; /* Iterator variable */ - for (loc = filter_slp; loc->next; loc = loc->next); - loc->next = lcase_mask_slp; - } else { - filter_slp = lcase_mask_slp; - } + ASSERT(context < query_blk->lcase_mask->total_size); + lcase_mask_slp = query_blk->lcase_mask->seqloc_array[context]; + /* Set location list to NULL, to allow safe memory deallocation, + ownership transferred to filter_slp below. */ + query_blk->lcase_mask->seqloc_array[context] = NULL; } - filter_slp_combined = NULL; - CombineMaskLocations(filter_slp, &filter_slp_combined, 0); - *filter_out = filter_slp_combined; + /* Attach the lower case mask locations to the filter locations and + combine them */ + BlastSeqLocAppend(&filter_slp, lcase_mask_slp); + } - filter_slp = BlastSeqLocFree(filter_slp); + *filter_out = BlastSeqLocCombine(filter_slp, 0); + filter_slp = BlastSeqLocFree(filter_slp); return 0; } - Int2 -BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, EBlastProgramType program_number, const SBlastFilterOptions* filter_options, BlastMaskLoc** filter_maskloc, Blast_Message * *blast_message) +BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, + const BlastQueryInfo* query_info, + EBlastProgramType program_number, + const SBlastFilterOptions* filter_options, + BlastMaskLoc** filter_maskloc, + Blast_Message** blast_message) { - Int2 status = 0; Int4 
context = 0; /* loop variable. */ - const Boolean kIsNucl = (program_number == eBlastTypeBlastn); - Boolean no_forward_strand = (query_info->first_context > 0); /* filtering needed on reverse strand. */ + const int kNumContexts = query_info->last_context + 1; ASSERT(query_info && query_blk && filter_maskloc); - *filter_maskloc = BlastMaskLocNew(query_info->last_context+1); + ASSERT(kNumContexts == + query_info->num_queries*BLAST_GetNumberOfContexts(program_number)); + *filter_maskloc = BlastMaskLocNew(kNumContexts); for (context = query_info->first_context; context <= query_info->last_context; ++context) { - - Boolean reverse = BlastIsReverseStrand(kIsNucl, context); - - /* For each query, check if forward strand is present */ - if (query_info->contexts[context].query_length <= 0) - { - if (kIsNucl && (context & 1) == 0) /* Needed only for blastn, or does this not apply FIXME */ - no_forward_strand = TRUE; /* No plus strand, we cannot simply infer locations by going from plus to minus */ - continue; + + BlastSeqLoc *filter_per_context = NULL; + status = s_GetFilteringLocationsForOneContext(query_blk, + query_info, + context, + program_number, + filter_options, + &filter_per_context, + blast_message); + if (status) { + Blast_MessageWrite(blast_message, eBlastSevError, 2, 1, + "Failure at filtering"); + return status; } - else if (!reverse) /* This is a plus strand, safe to set no_forward_strand to FALSE as clearly there is one. 
*/ - no_forward_strand = FALSE; - if (!reverse || no_forward_strand) - { - BlastSeqLoc *filter_per_context = NULL; /* Used to hold combined SeqLoc's */ - Int4 filter_index = BlastGetMaskLocIndexFromContext(kIsNucl, context); - if ((status=s_GetFilteringLocationsForOneContext(query_blk, query_info, context, program_number, filter_options, &filter_per_context, blast_message))) - { - Blast_MessageWrite(blast_message, eBlastSevError, 2, 1, - "Failure at filtering"); - return status; - } - - /* NB: for translated searches filter locations are returned in - protein coordinates, because the DNA lengths of sequences are - not available here. The caller must take care of converting - them back to nucleotide coordinates. */ - (*filter_maskloc)->seqloc_array[filter_index] = filter_per_context; - } + /* NB: for translated searches filter locations are returned in + protein coordinates, because the DNA lengths of sequences are + not available here. The caller must take care of converting + them back to nucleotide coordinates. */ + (*filter_maskloc)->seqloc_array[context] = filter_per_context; } - return 0; } -Int2 +void Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na, const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset) { - SSeqRange *loc = NULL; - Int2 status = 0; - Int4 index, start, stop; - const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask; - + ASSERT(buffer); for (; mask_loc; mask_loc = mask_loc->next) { - loc = (SSeqRange *) mask_loc->ssr; + + Int4 index, start, stop; + const Uint1 kMaskingLetter = is_na ? 
kNuclMask : kProtMask; + if (reverse) { - start = length - 1 - loc->right; - stop = length - 1 - loc->left; + start = length - 1 - mask_loc->ssr->right; + stop = length - 1 - mask_loc->ssr->left; } else { - start = loc->left; - stop = loc->right; + start = mask_loc->ssr->left; + stop = mask_loc->ssr->right; } start -= offset; @@ -1079,47 +1050,38 @@ Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na, for (index = start; index <= stop; index++) buffer[index] = kMaskingLetter; } - - return status; } -Int2 -BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number) +void +BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, + const BlastQueryInfo* query_info, + const BlastMaskLoc *filter_maskloc, + EBlastProgramType program_number) { const Boolean kIsNucl = (program_number == eBlastTypeBlastn); Int4 context; /* loop variable. */ - Int2 status=0; + + ASSERT(query_blk); + ASSERT(query_info); + ASSERT(filter_maskloc); for (context = query_info->first_context; context <= query_info->last_context; ++context) { - BlastSeqLoc *filter_per_context = NULL; /* Used to hold combined SeqLoc's */ - Boolean reverse = BlastIsReverseStrand(kIsNucl, context); - Int4 query_length; - Int4 context_offset; - Int4 maskloc_index; - Uint1 *buffer; /* holds sequence */ + Int4 query_length = 0; + Int4 context_offset = 0; + Uint1 *buffer = NULL; /* holds sequence */ /* For each query, check if forward strand is present */ - if ((query_length = query_info->contexts[context].query_length) <= 0) + if ( (query_length = query_info->contexts[context].query_length) <= 0) continue; context_offset = query_info->contexts[context].query_offset; buffer = &query_blk->sequence[context_offset]; + ASSERT(buffer); - maskloc_index = BlastGetMaskLocIndexFromContext(kIsNucl, context); - filter_per_context = filter_maskloc->seqloc_array[maskloc_index]; - - if (buffer) { - - if ((status = - 
Blast_MaskTheResidues(buffer, query_length, kIsNucl, - filter_per_context, reverse, 0))) - { - return status; - } - } + Blast_MaskTheResidues(buffer, query_length, kIsNucl, + filter_maskloc->seqloc_array[context], + BlastIsReverseStrand(kIsNucl, context), 0); } - - return 0; } diff --git a/algo/blast/core/blast_filter.h b/algo/blast/core/blast_filter.h index d03e1531..45240ffc 100644 --- a/algo/blast/core/blast_filter.h +++ b/algo/blast/core/blast_filter.h @@ -1,4 +1,4 @@ -/* $Id: blast_filter.h,v 1.32 2005/07/13 16:47:34 bealer Exp $ +/* $Id: blast_filter.h,v 1.34 2005/09/20 00:02:47 camacho Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -75,6 +75,22 @@ extern const Uint1 kProtMask; NCBI_XBLAST_EXPORT BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to); +/** Appends the BlastSeqLoc to the list of BlastSeqLoc-s pointed to by head. + * @param head Pointer to the head of the linked list of BlastSeqLoc-s [in] + * @param node Pointer to the node to be added to the list. If this is NULL, + * this function does nothing. [in] + * @returns pointer to the second argument to this function (i.e.: tail of the + * list) + */ +BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node); + +/** Deallocate a single BlastSeqLoc structure and its contents, without + * following its next pointer + * @param node structure to deallocate [in] + * @return NULL + */ +BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* node); + /** Deallocate all BlastSeqLoc objects in a chain. * @param loc object to be freed [in] * @return NULL pointer returned. 
@@ -82,13 +98,35 @@ BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to); NCBI_XBLAST_EXPORT BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc); +/** Make a deep copy of the linked list of BlastSeqLoc-s pointed to by its + * argument + * @param head head of the linked list [in] + * @return NULL on NULL input or memory allocation failure, else a copy of the + * list and its contents + */ +BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head); + /** Converts reverse strand coordinates to forward strand. * @param filter_in BlastSeqLoc to be reversed [in] * @param query_length length of query [in] * @return reversed BlastSeqLoc */ NCBI_XBLAST_EXPORT -BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length); +BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in, + Int4 query_length); + +/** Go through all mask locations in one sequence, + * combine any that overlap. Deallocate the memory for the locations that + * were on the list, produce a new (merged) list of locations. + * @param mask_loc The list of masks to be merged [in] + * @param link_value Largest gap size between locations for which they + * should be linked together [in] + * @return The new (merged) list of masks or NULL if mask_loc is NULL or memory + * allocation failure. +*/ +NCBI_XBLAST_EXPORT +BlastSeqLoc* +BlastSeqLocCombine(BlastSeqLoc* mask_loc, Int4 link_value); /** Deallocate memory for a BlastMaskLoc structure * as well as the BlastSeqLoc's pointed to. @@ -98,8 +136,9 @@ BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length) NCBI_XBLAST_EXPORT BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc); -/** Allocate memory for a BlastMaskLoc, also allocates array for BlastSeqLoc* of length total. - * @param total which context (i.e., strand) [in] +/** Allocate memory for a BlastMaskLoc. 
+ * @param total number of contexts for which SSeqLocs should be allocated + * (result of number of queries * number of contexts for given program) [in] * @return Pointer to the allocated BlastMaskLoc structure. */ NCBI_XBLAST_EXPORT @@ -110,6 +149,12 @@ BlastMaskLoc* BlastMaskLocNew(Int4 total); * lists. * @param mask_loc Mask locations structure [in|out] * @param query_info Query information structure, containing contexts data [in] + * Note: This function does NOT take into consideration the strands requested + * to be searched, which is INCONSISTENT with what the C++ API does (this + * function is not called from the C++ API, only from the C API). Therefore, + * this function should either 1) be moved out of the CORE or 2) modified to + * take into consideration the strand specified for the nucleotide + * query/queries. */ Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc, const BlastQueryInfo* query_info); @@ -122,19 +167,6 @@ Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc, Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc, const BlastQueryInfo* query_info); -/** Go through all mask locations in one sequence, - * combine any that overlap. Deallocate the memory for the locations that - * were on the list, produce a new (merged) list of locations. - * @param mask_loc The list of masks to be merged [in] - * @param mask_loc_out The new (merged) list of masks. 
[out] - * @param link_value Largest gap size between locations fow which they - * should be linked together [in] -*/ -NCBI_XBLAST_EXPORT -Int2 -CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out, - Int4 link_value); - /** This function takes the list of mask locations (i.e., regions that * should not be searched or not added to lookup table) and makes up a set * of SSeqRange*'s in the concatenated sequence built from a set of queries, @@ -150,14 +182,13 @@ CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out, * @param complement_mask Linked list of SSeqRange*s in the concatenated * sequence to be indexed in the lookup table . [out] */ -NCBI_XBLAST_EXPORT Int2 BLAST_ComplementMaskLocations(EBlastProgramType program_number, const BlastQueryInfo* query_info, const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask); -/** Runs filtering functions, according to the filtering options, returns - * SeqLocPtr. Should combine all SeqLocs so they are non-redundant. +/** Runs seg filtering functions, according to the filtering options, returns + * BlastSeqLoc*. Should combine all SeqLocs so they are non-redundant. * @param program_number Type of BLAST program [in] * @param sequence The sequence or part of the sequence to be filtered [in] * @param length Length of the (sub)sequence [in] @@ -188,23 +219,26 @@ BlastSetUp_Filter(EBlastProgramType program_number, */ NCBI_XBLAST_EXPORT Int2 -BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, - EBlastProgramType program_number, const SBlastFilterOptions* filter_options, - BlastMaskLoc** filter_out, Blast_Message* *blast_message); +BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, + const BlastQueryInfo* query_info, + EBlastProgramType program_number, + const SBlastFilterOptions* filter_options, + BlastMaskLoc** filter_out, + Blast_Message* *blast_message); /** Masks the letters in buffer. 
* This is a low-level routine and takes a raw buffer which it assumes * to be in ncbistdaa (protein) or blastna (nucleotide). - * @param buffer the sequence to be masked (will be modified). [out] + * @param buffer the sequence to be masked (will be modified, cannot be NULL or + * undefined behavior will result).[in|out] * @param length length of the sequence to be masked . [in] * @param is_na nucleotide if TRUE [in] * @param mask_loc the BlastSeqLoc to use for masking [in] * @param reverse minus strand if TRUE [in] * @param offset how far along sequence is 1st residuse in buffer [in] - * */ NCBI_XBLAST_EXPORT -Int2 +void Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na, const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset); @@ -215,9 +249,11 @@ Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na, * @param program_number one of blastn,blastp,blastx,etc. [in] */ NCBI_XBLAST_EXPORT -Int2 -BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, - const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number); +void +BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, + const BlastQueryInfo* query_info, + const BlastMaskLoc *filter_maskloc, + EBlastProgramType program_number); /** Produces SBlastFilterOptions from a string that has been traditionally supported * in blast. 
@@ -229,8 +265,10 @@ BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_i */ NCBI_XBLAST_EXPORT Int2 -BlastFilteringOptionsFromString(EBlastProgramType program_number, const char* instructions, - SBlastFilterOptions* *filtering_options, Blast_Message* *blast_message); +BlastFilteringOptionsFromString(EBlastProgramType program_number, + const char* instructions, + SBlastFilterOptions* *filtering_options, + Blast_Message* *blast_message); #ifdef __cplusplus } diff --git a/algo/blast/core/blast_gapalign.c b/algo/blast/core/blast_gapalign.c index 203d0c88..99e7dee6 100644 --- a/algo/blast/core/blast_gapalign.c +++ b/algo/blast/core/blast_gapalign.c @@ -1,4 +1,4 @@ -/* $Id: blast_gapalign.c,v 1.159 2005/08/22 17:57:09 madden Exp $ +/* $Id: blast_gapalign.c,v 1.163 2005/11/30 18:29:14 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -33,7 +32,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_gapalign.c,v 1.159 2005/08/22 17:57:09 madden Exp $"; + "$Id: blast_gapalign.c,v 1.163 2005/11/30 18:29:14 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_options.h> @@ -205,7 +204,8 @@ s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params, if (gap_open == 0 && gap_extend == 0) gap_extend = reward / 2 + penalty; - max_d = (Int4) (max_dbseq_length / GREEDY_MAX_COST_FRACTION + 1); + max_d = MIN(GREEDY_MAX_COST, + max_dbseq_length / GREEDY_MAX_COST_FRACTION + 1); gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem)); @@ -253,8 +253,10 @@ s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params, for (i = 1; i <= max_cost; i++) gamp->last_seq2_off_affine[i] = gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6; - if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) + if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) { s_BlastGreedyAlignsFree(gamp); + return NULL; + } } gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff)); @@ -280,6 +282,7 @@ BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align) if (gap_align->greedy_align_mem) s_BlastGreedyAlignsFree(gap_align->greedy_align_mem); GapStateFree(gap_align->state_struct); + sfree(gap_align->dp_mem); sfree(gap_align); return NULL; @@ -306,7 +309,16 @@ BLAST_GapAlignStructNew(const BlastScoringParameters* score_params, gap_align->gap_x_dropoff = ext_params->gap_x_dropoff; - if (ext_params->options->ePrelimGapExt != eDynProgExt) { + if (ext_params->options->ePrelimGapExt == eDynProgExt) { + /* allocate structures for ordinary dynamic programming */ + gap_align->dp_mem_alloc = 1000; + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + if 
(!gap_align->dp_mem) + gap_align = BLAST_GapAlignStructFree(gap_align); + } + else { + /* allocate structures for greedy dynamic programming */ max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN); gap_align->greedy_align_mem = s_BlastGreedyAlignMemAlloc(score_params, ext_params, @@ -331,14 +343,10 @@ enum { SCRIPT_SUB = eGapAlignSub, /**< Substitution */ SCRIPT_GAP_IN_A = eGapAlignDel, /**< Deletion */ SCRIPT_GAP_IN_B = eGapAlignIns, /**< Insertion */ - SCRIPT_DECLINE = eGapAlignDecline, /**< Decline to align */ SCRIPT_OP_MASK = 0x07, /**< Mask for edit script operations */ - SCRIPT_EXTEND_DECLINE= 0x08, /**< continue declining alignment */ SCRIPT_EXTEND_GAP_A = 0x10, /**< continue a gap in A */ - SCRIPT_DECLINE_A = 0x20, /**< continue declining alignment for A */ SCRIPT_EXTEND_GAP_B = 0x40, /**< continue a gap in B */ - SCRIPT_DECLINE_B = 0x80 /**< continue declining alignment for B */ }; /** Low level function to perform dynamic programming gapped extension @@ -376,13 +384,10 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, Uint1* b_ptr; BlastGapDP* score_array; - Int4 score_array_size; - Int4 score_array_origin; Int4 gap_open; Int4 gap_extend; Int4 gap_open_extend; - Int4 decline_penalty; Int4 x_dropoff; Int4 best_score; @@ -393,9 +398,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, Int4 score; Int4 score_gap_row; Int4 score_gap_col; - Int4 score_decline; Int4 next_score; - Int4 next_score_decline; GapStateArrayStruct* state_struct; Uint1* edit_script_row; @@ -416,7 +419,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, gap_open = score_params->gap_open; gap_extend = score_params->gap_extend; gap_open_extend = gap_open + gap_extend; - decline_penalty = score_params->decline_align; x_dropoff = gap_align->gap_x_dropoff; if (x_dropoff < gap_open_extend) @@ -448,30 +450,29 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, the alignment can only go x_dropoff/gap_extend positions at most before 
failing the X dropoff criterion */ - if (gap_extend > 0) { + if (gap_extend > 0) num_extra_cells = x_dropoff / gap_extend + 3; - state_struct = s_GapGetState(&gap_align->state_struct, - num_extra_cells); - score_array_size = 2 * num_extra_cells; - } - else { - num_extra_cells = 0; - state_struct = s_GapGetState(&gap_align->state_struct, N + 3); - score_array_size = N + 3; + else + num_extra_cells = N + 3; + + if (num_extra_cells > gap_align->dp_mem_alloc) { + gap_align->dp_mem_alloc = MAX(num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + sfree(gap_align->dp_mem); + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); } + state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells); + edit_script[0] = state_struct->state_array; edit_start_offset[0] = 0; edit_script_row = state_struct->state_array; - score_array_size = MAX(100, score_array_size); - score_array_origin = 0; - score = -gap_open_extend; - score_array = (BlastGapDP*)malloc(score_array_size * sizeof(BlastGapDP)); + score_array = gap_align->dp_mem; score_array[0].best = 0; score_array[0].best_gap = -gap_open_extend; - score_array[0].best_decline = -gap_open_extend - decline_penalty; for (i = 1; i <= N; i++) { if (score < -x_dropoff) @@ -479,7 +480,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, score_array[i].best = score; score_array[i].best_gap = score - gap_open_extend; - score_array[i].best_decline = score - gap_open_extend - decline_penalty; score -= gap_extend; edit_script_row[i] = SCRIPT_GAP_IN_A; } @@ -554,21 +554,13 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, score = MININT; score_gap_row = MININT; - score_decline = MININT; last_b_index = first_b_index; for (b_index = first_b_index; b_index < b_size; b_index++) { - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures */ - - Int4 s_index = b_index - score_array_origin; - b_ptr += b_increment; - 
score_gap_col = score_array[s_index].best_gap; - next_score = score_array[s_index].best + matrix_row[ *b_ptr ]; - next_score_decline = score_array[s_index].best_decline; + score_gap_col = score_array[b_index].best_gap; + next_score = score_array[b_index].best + matrix_row[ *b_ptr ]; /* script, script_row and script_col contain the actions specified by the dynamic programming. @@ -578,36 +570,17 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, this inner loop is exactly the same as the one in Blast_SemiGappedAlign() */ - if (score_decline > score) { - script = SCRIPT_DECLINE; - score = score_decline; - } - else { - script = SCRIPT_SUB; - } - - if (score_gap_col < score_decline) { - score_gap_col = score_decline; - script_col = SCRIPT_DECLINE_B; - } - else { - script_col = SCRIPT_EXTEND_GAP_B; - if (score < score_gap_col) { - script = SCRIPT_GAP_IN_B; - score = score_gap_col; - } - } + script = SCRIPT_SUB; + script_col = SCRIPT_EXTEND_GAP_B; + script_row = SCRIPT_EXTEND_GAP_A; - if (score_gap_row < score_decline) { - score_gap_row = score_decline; - script_row = SCRIPT_DECLINE_A; + if (score < score_gap_col) { + script = SCRIPT_GAP_IN_B; + score = score_gap_col; } - else { - script_row = SCRIPT_EXTEND_GAP_A; - if (score < score_gap_row) { - script = SCRIPT_GAP_IN_A; - score = score_gap_row; - } + if (score < score_gap_row) { + script = SCRIPT_GAP_IN_A; + score = score_gap_row; } if (best_score - score > x_dropoff) { @@ -615,7 +588,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, if (first_b_index == b_index) first_b_index++; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; @@ -628,10 +601,10 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, score_gap_row -= gap_extend; score_gap_col -= gap_extend; if (score_gap_col < (score - gap_open_extend)) { - score_array[s_index].best_gap = score - gap_open_extend; + score_array[b_index].best_gap = score - 
gap_open_extend; } else { - score_array[s_index].best_gap = score_gap_col; + score_array[b_index].best_gap = score_gap_col; script += script_col; } @@ -640,52 +613,35 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, else script += script_row; - if (score_decline < (score - gap_open)) { - score_array[s_index].best_decline = score - - gap_open - decline_penalty; - } - else { - score_array[s_index].best_decline = score_decline - - decline_penalty; - script += SCRIPT_EXTEND_DECLINE; - } - score_array[s_index].best = score; + score_array[b_index].best = score; } score = next_score; - score_decline = next_score_decline; edit_script_row[b_index] = script; } if (first_b_index == b_size) break; - if (last_b_index + num_extra_cells + 3 >= - score_array_size + score_array_origin) { - BlastGapDP *new_array; - score_array_size = 2 * score_array_size; - new_array = (BlastGapDP *)malloc(score_array_size * - sizeof(BlastGapDP)); - memcpy(new_array, - score_array + (first_b_index - score_array_origin), - (last_b_index - first_b_index + 1) * sizeof(BlastGapDP)); - sfree(score_array); - score_array = new_array; - score_array_origin = first_b_index; + if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) { + + gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + score_array = (BlastGapDP *)realloc(score_array, + gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + gap_align->dp_mem = score_array; } + if (last_b_index < b_size - 1) { b_size = last_b_index + 1; } else { while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = score_gap_row; - score_array[s_index].best_gap = score_gap_row - gap_open_extend; - score_array[s_index].best_decline = score_gap_row - gap_open - - decline_penalty; + score_array[b_size].best = score_gap_row; + score_array[b_size].best_gap = score_gap_row - gap_open_extend; score_gap_row -= gap_extend; 
edit_script_row[b_size] = SCRIPT_GAP_IN_A; b_size++; @@ -698,11 +654,8 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, state_struct->used += MAX(b_index, b_size) - orig_b_index + 1; if (b_size <= N) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = MININT; - score_array[s_index].best_gap = MININT; - score_array[s_index].best_decline = MININT; + score_array[b_size].best = MININT; + score_array[b_size].best_gap = MININT; b_size++; } } @@ -715,7 +668,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, b_index = *b_offset; script = SCRIPT_SUB; - for (i = 0; a_index > 0 || b_index > 0; i++) { + while (a_index > 0 || b_index > 0) { /* Retrieve the next action to perform. Rows of the traceback array do not necessarily start at offset zero of B, so a correction is needed @@ -729,22 +682,12 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, script = next_script & SCRIPT_OP_MASK; if (next_script & SCRIPT_EXTEND_GAP_A) script = SCRIPT_GAP_IN_A; - else if (next_script & SCRIPT_DECLINE_A) - script = SCRIPT_DECLINE; break; case SCRIPT_GAP_IN_B: script = next_script & SCRIPT_OP_MASK; if (next_script & SCRIPT_EXTEND_GAP_B) script = SCRIPT_GAP_IN_B; - else if (next_script & SCRIPT_DECLINE_B) - script = SCRIPT_DECLINE; - break; - - case SCRIPT_DECLINE: - script = next_script & SCRIPT_OP_MASK; - if (next_script & SCRIPT_EXTEND_DECLINE) - script = SCRIPT_DECLINE; break; default: @@ -767,7 +710,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, sfree(edit_start_offset); sfree(edit_script); - sfree(score_array); return best_score; } @@ -802,13 +744,10 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, Uint1* b_ptr; BlastGapDP* score_array; - Int4 score_array_size; - Int4 score_array_origin; Int4 gap_open; /* alignment penalty variables */ Int4 gap_extend; Int4 gap_open_extend; - Int4 decline_penalty; Int4 x_dropoff; Int4** matrix = NULL; /* pointers to the score matrix */ @@ -818,9 +757,7 
@@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4 score; /* score tracking variables */ Int4 score_gap_row; Int4 score_gap_col; - Int4 score_decline; Int4 next_score; - Int4 next_score_decline; Int4 best_score; Int4 num_extra_cells; @@ -840,7 +777,6 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, gap_open = score_params->gap_open; gap_extend = score_params->gap_extend; gap_open_extend = gap_open + gap_extend; - decline_penalty = score_params->decline_align; x_dropoff = gap_align->gap_x_dropoff; if (x_dropoff < gap_open_extend) @@ -855,23 +791,23 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, set of DP computations. The initial window size is determined by the number of cells needed to fail the x-dropoff test */ - if (gap_extend > 0) { + if (gap_extend > 0) num_extra_cells = x_dropoff / gap_extend + 3; - score_array_size = 2 * num_extra_cells; - } - else { - num_extra_cells = 0; - score_array_size = N + 3; + else + num_extra_cells = N + 3; + + if (num_extra_cells > gap_align->dp_mem_alloc) { + gap_align->dp_mem_alloc = MAX(num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + sfree(gap_align->dp_mem); + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); } - score_array_size = MAX(100, score_array_size); - score_array_origin = 0; - + score_array = gap_align->dp_mem; score = -gap_open_extend; - score_array = (BlastGapDP*)malloc(score_array_size * sizeof(BlastGapDP)); score_array[0].best = 0; score_array[0].best_gap = -gap_open_extend; - score_array[0].best_decline = -gap_open_extend - decline_penalty; for (i = 1; i <= N; i++) { if (score < -x_dropoff) @@ -879,7 +815,6 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, score_array[i].best = score; score_array[i].best_gap = score - gap_open_extend; - score_array[i].best_decline = score - gap_open_extend - decline_penalty; score -= gap_extend; } @@ -919,48 +854,18 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, /* 
initialize running-score variables */ score = MININT; score_gap_row = MININT; - score_decline = MININT; last_b_index = first_b_index; for (b_index = first_b_index; b_index < b_size; b_index++) { - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures */ - - Int4 s_index = b_index - score_array_origin; - - /* Note that this formulation of dynamic programming - requires looking at an offset into B[] that is one - beyond b_index. Since b_index can equal the length - of B[], this means that a sentinel byte for B is - *required* by this aligner */ - b_ptr += b_increment; - score_gap_col = score_array[s_index].best_gap; - next_score = score_array[s_index].best + matrix_row[ *b_ptr ]; - next_score_decline = score_array[s_index].best_decline; - - /* decline the alignment if that improves the score */ + score_gap_col = score_array[b_index].best_gap; + next_score = score_array[b_index].best + matrix_row[ *b_ptr ]; - score = MAX(score, score_decline); - - /* decline the best row score if that improves it; - if not, make it the new high score if it's - an improvement */ - - if (score_gap_col < score_decline) - score_gap_col = score_decline; - else if (score < score_gap_col) + if (score < score_gap_col) score = score_gap_col; - /* decline the best column score if that improves it; - if not, make it the new high score if it's - an improvement */ - - if (score_gap_row < score_decline) - score_gap_row = score_decline; - else if (score < score_gap_row) + if (score < score_gap_row) score = score_gap_row; if (best_score - score > x_dropoff) { @@ -978,7 +883,7 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (b_index == first_b_index) first_b_index++; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; @@ -989,24 +894,20 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, } /* If starting a gap at this position will improve - the best row, column, 
or declined alignment score, - update them to reflect that. */ + the best row, or column, score, update them to + reflect that. */ score_gap_row -= gap_extend; score_gap_col -= gap_extend; - score_array[s_index].best_gap = MAX(score - gap_open_extend, + score_array[b_index].best_gap = MAX(score - gap_open_extend, score_gap_col); score_gap_row = MAX(score - gap_open_extend, score_gap_row); - - score_array[s_index].best_decline = - MAX(score_decline, score - gap_open) - decline_penalty; - score_array[s_index].best = score; + score_array[b_index].best = score; } score = next_score; - score_decline = next_score_decline; } - + /* Finish aligning if the best scores for all positions of B will fail the X-dropoff test, i.e. the inner loop bounds have converged to each other */ @@ -1014,21 +915,16 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_size) break; - /* enlarge the window for score data if necessary. If the - window expands, move its origin to first_b_index */ - - if (last_b_index + num_extra_cells + 3 >= - score_array_size + score_array_origin) { - BlastGapDP *new_array; - score_array_size = 2 * score_array_size; - new_array = (BlastGapDP *)malloc(score_array_size * - sizeof(BlastGapDP)); - memcpy(new_array, - score_array + (first_b_index - score_array_origin), - (last_b_index - first_b_index + 1) * sizeof(BlastGapDP)); - sfree(score_array); - score_array = new_array; - score_array_origin = first_b_index; + /* enlarge the window for score data if necessary */ + + if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) { + + gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + score_array = (BlastGapDP *)realloc(score_array, + gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + gap_align->dp_mem = score_array; } if (last_b_index < b_size - 1) { @@ -1045,33 +941,20 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, The next inner loop will have larger bounds */ 
while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) { - - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures. */ - - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = score_gap_row; - score_array[s_index].best_gap = score_gap_row - gap_open_extend; - score_array[s_index].best_decline = score_gap_row - gap_open - - decline_penalty; + score_array[b_size].best = score_gap_row; + score_array[b_size].best_gap = score_gap_row - gap_open_extend; score_gap_row -= gap_extend; b_size++; } } if (b_size <= N) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = MININT; - score_array[s_index].best_gap = MININT; - score_array[s_index].best_decline = MININT; + score_array[b_size].best = MININT; + score_array[b_size].best_gap = MININT; b_size++; } } - sfree(score_array); return best_score; } @@ -1115,9 +998,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4 a_index; Int4 b_index, b_size, first_b_index, last_b_index; - BlastGapSmallDP* score_array; - Int4 score_array_size; - Int4 score_array_origin; + BlastGapDP* score_array; Int4 gap_open; /* alignment penalty variables */ Int4 gap_extend; @@ -1190,31 +1071,31 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, /* allocate storage for the first row of the traceback array. 
Because row elements correspond to gaps in A, - the alignment can only go x_dropoff/gap_extend positions - at most before failing the X dropoff criterion */ + the alignment can only go at most x_dropoff/gap_extend + positions, in all three frames, before failing the + X dropoff criterion */ - if (gap_extend > 0) { + if (gap_extend > 0) num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5); - state_struct = s_GapGetState(&gap_align->state_struct, - num_extra_cells); - score_array_size = 3 * num_extra_cells; - } - else { - num_extra_cells = 0; - state_struct = s_GapGetState(&gap_align->state_struct, N + 5); - score_array_size = N + 5; + else + num_extra_cells = N + 5; + + if (num_extra_cells > gap_align->dp_mem_alloc) { + gap_align->dp_mem_alloc = MAX(num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + sfree(gap_align->dp_mem); + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); } + state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells); + edit_script[0] = state_struct->state_array; edit_start_offset[0] = 0; edit_script_row = state_struct->state_array; - score_array_size = MAX(CODON_LENGTH * 100, score_array_size); - score_array_origin = 0; - + score_array = gap_align->dp_mem; score = -gap_open_extend; - score_array = (BlastGapSmallDP*)malloc(score_array_size * - sizeof(BlastGapSmallDP)); score_array[0].best = 0; score_array[0].best_gap = -gap_open_extend; @@ -1324,12 +1205,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, while (b_index < b_size) { - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures */ - - Int4 s_index = b_index - score_array_origin; - /* FRAME 0 */ score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; @@ -1351,9 +1226,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, } score += matrix_row[ B[ b_index * increment ] ]; - score_other_frame1 = MAX(score_col1, 
score_array[s_index].best); - score_col1 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame1 = MAX(score_col1, score_array[b_index].best); + score_col1 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; if (score < MAX(score_gap_col, score_row1)) { if (score_gap_col > score_row1) { @@ -1369,12 +1244,12 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row1 -= gap_extend; } } @@ -1383,11 +1258,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -1403,11 +1278,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, score_gap_col -= gap_extend; if (score < score_gap_col) { - score_array[s_index].best_gap = score_gap_col; + score_array[b_index].best_gap = score_gap_col; script |= SCRIPT_EXTEND_GAP_A; } else { - score_array[s_index].best_gap = score; + score_array[b_index].best_gap = score; } } } @@ -1423,7 +1298,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, /* FRAME 1 */ - s_index++; score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; score = MAX(score, score_col2); if (score == score_col2) { @@ -1442,9 +1316,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, script = SCRIPT_NEXT_PLUS_TWO_FRAMES; 
} score += matrix_row[ B[ b_index * increment ] ]; - score_other_frame2 = MAX(score_col2, score_array[s_index].best); - score_col2 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame2 = MAX(score_col2, score_array[b_index].best); + score_col2 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; if (score < MAX(score_gap_col, score_row2)) { score = MAX(score_gap_col, score_row2); @@ -1452,7 +1326,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { if (score == score_gap_col) @@ -1461,8 +1335,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B; last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row2 -= gap_extend; } } @@ -1471,11 +1345,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -1490,11 +1364,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, score_gap_col -= gap_extend; if (score < score_gap_col) { - score_array[s_index].best_gap = score_gap_col; + score_array[b_index].best_gap = score_gap_col; script |= SCRIPT_EXTEND_GAP_A; } else { - score_array[s_index].best_gap = score; + score_array[b_index].best_gap = score; } } } @@ -1510,7 +1384,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, /* FRAME 2 */ - 
s_index++; score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; score = MAX(score, score_col3); if (score == score_col3) { @@ -1530,9 +1403,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, } score += matrix_row[ B[ b_index * increment ] ]; score_other_frame1 = score_other_frame2; - score_other_frame2 = MAX(score_col3, score_array[s_index].best); - score_col3 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame2 = MAX(score_col3, score_array[b_index].best); + score_col3 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; if (score < MAX(score_gap_col, score_row3)) { score = MAX(score_gap_col, score_row3); @@ -1540,7 +1413,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { if (score == score_gap_col) @@ -1549,8 +1422,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B; last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row3 -= gap_extend; } } @@ -1559,11 +1432,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -1578,11 +1451,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, score_gap_col -= gap_extend; if (score < score_gap_col) { - score_array[s_index].best_gap = score_gap_col; + 
score_array[b_index].best_gap = score_gap_col; script |= SCRIPT_EXTEND_GAP_A; } else { - score_array[s_index].best_gap = score; + score_array[b_index].best_gap = score; } } } @@ -1596,22 +1469,16 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_size) break; - /* Enlarge the window for score data, if necessary. If the - window must be enlarged, move the window origin to - first_b_index */ - - if (last_b_index + num_extra_cells + 5 >= - score_array_size + score_array_origin) { - BlastGapSmallDP *new_array; - score_array_size = 2 * score_array_size; - new_array = (BlastGapSmallDP *)malloc(score_array_size * - sizeof(BlastGapSmallDP)); - memcpy(new_array, - score_array + (first_b_index - score_array_origin), - (last_b_index - first_b_index + 1) * sizeof(BlastGapSmallDP)); - sfree(score_array); - score_array = new_array; - score_array_origin = first_b_index; + /* Enlarge the window for score data if necessary */ + + if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) { + + gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + score_array = (BlastGapDP *)realloc(score_array, + gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + gap_align->dp_mem = score_array; } if (last_b_index < b_size - 1) { @@ -1634,22 +1501,20 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, score = MAX(score, score_row3); while (score >= (best_score - x_dropoff) && b_size < N + 1) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = score_row1; - score_array[s_index].best_gap = score_row1 - gap_open_extend; + score_array[b_size].best = score_row1; + score_array[b_size].best_gap = score_row1 - gap_open_extend; score_row1 -= gap_extend; edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B; - score_array[s_index+1].best = score_row2; - score_array[s_index+1].best_gap = score_row2 - gap_open_extend; + score_array[b_size+1].best = 
score_row2; + score_array[b_size+1].best_gap = score_row2 - gap_open_extend; score_row2 -= gap_extend; edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B; - score_array[s_index+2].best = score_row3; - score_array[s_index+2].best_gap = score_row3 - gap_open_extend; + score_array[b_size+2].best = score_row3; + score_array[b_size+2].best_gap = score_row3 - gap_open_extend; score_row3 -= gap_extend; edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B; @@ -1668,9 +1533,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, last_b_index = MIN(b_size + 4, N + 3); while (b_size < last_b_index) { - Int4 s_index = b_size - score_array_origin; - score_array[s_index].best = MININT; - score_array[s_index].best_gap = MININT; + score_array[b_size].best = MININT; + score_array[b_size].best_gap = MININT; b_size++; } } @@ -1717,7 +1581,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N, sfree(edit_start_offset); sfree(edit_script); - sfree(score_array); if (!reversed) *b_offset -= 2; @@ -1758,9 +1621,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4 shift_penalty; Int4 x_dropoff; - BlastGapSmallDP* score_array; - Int4 score_array_size; - Int4 score_array_origin; + BlastGapDP* score_array; Int4 num_extra_cells; Int4** matrix = NULL; /* pointers to the score matrix */ @@ -1812,21 +1673,21 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, set of DP computations. 
The initial window size is determined by the number of cells needed to fail the x-dropoff test */ - if (gap_extend > 0) { + if (gap_extend > 0) num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5); - score_array_size = 3 * num_extra_cells; - } - else { - num_extra_cells = 0; - score_array_size = N + 5; + else + num_extra_cells = N + 5; + + if (num_extra_cells > gap_align->dp_mem_alloc) { + gap_align->dp_mem_alloc = MAX(num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + sfree(gap_align->dp_mem); + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); } - score_array_size = MAX(CODON_LENGTH*100, score_array_size); - score_array_origin = 0; - + score_array = gap_align->dp_mem; score = -gap_open_extend; - score_array = (BlastGapSmallDP*)malloc(score_array_size * - sizeof(BlastGapSmallDP)); score_array[0].best = 0; score_array[0].best_gap = -gap_open_extend; @@ -1891,21 +1752,15 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, while (b_index < b_size) { - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures */ - - Int4 s_index = b_index - score_array_origin; - /* FRAME 0 */ /* Pick the best score among all frames */ score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; score = MAX(score, score_col1) + matrix_row[ B[ b_index * increment ] ]; - score_other_frame1 = MAX(score_col1, score_array[s_index].best); - score_col1 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame1 = MAX(score_col1, score_array[b_index].best); + score_col1 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; /* Use the row and column scores if they improve the score overall */ @@ -1927,13 +1782,13 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; 
} else { /* update the row and column running scores */ last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row1 -= gap_extend; } } @@ -1946,7 +1801,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { /* The current best score exceeds the @@ -1954,7 +1809,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, improve on the current optimal score */ last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -1967,7 +1822,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, score -= gap_open_extend; score_row1 -= gap_extend; score_row1 = MAX(score, score_row1); - score_array[s_index].best_gap = MAX(score, + score_array[b_index].best_gap = MAX(score, score_gap_col - gap_extend); } } @@ -1990,13 +1845,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, same as the preceeding code. 
The only real difference is the updating of the other_frame best scores */ - s_index++; score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; score = MAX(score, score_col2) + matrix_row[ B[ b_index * increment ] ]; - score_other_frame2 = MAX(score_col2, score_array[s_index].best); - score_col2 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame2 = MAX(score_col2, score_array[b_index].best); + score_col2 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; if (score < MAX(score_gap_col, score_row2)) { score = MAX(score_gap_col, score_row2); @@ -2004,12 +1858,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row2 -= gap_extend; } } @@ -2018,11 +1872,11 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -2031,7 +1885,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, score -= gap_open_extend; score_row2 -= gap_extend; score_row2 = MAX(score, score_row2); - score_array[s_index].best_gap = MAX(score, + score_array[b_index].best_gap = MAX(score, score_gap_col - gap_extend); } } @@ -2046,14 +1900,13 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, /* FRAME 2 */ - s_index++; score = MAX(score_other_frame1, score_other_frame2) - shift_penalty; score = 
MAX(score, score_col3) + matrix_row[ B[ b_index * increment ] ]; score_other_frame1 = score_other_frame2; - score_other_frame2 = MAX(score_col3, score_array[s_index].best); - score_col3 = score_array[s_index].best; - score_gap_col = score_array[s_index].best_gap; + score_other_frame2 = MAX(score_col3, score_array[b_index].best); + score_col3 = score_array[b_index].best; + score_gap_col = score_array[b_index].best_gap; if (score < MAX(score_gap_col, score_row3)) { score = MAX(score_gap_col, score_row3); @@ -2061,12 +1914,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; - score_array[s_index].best_gap = score_gap_col - gap_extend; + score_array[b_index].best = score; + score_array[b_index].best_gap = score_gap_col - gap_extend; score_row3 -= gap_extend; } } @@ -2075,11 +1928,11 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_index) first_b_index = b_index + 1; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; - score_array[s_index].best = score; + score_array[b_index].best = score; if (score > best_score) { best_score = score; *a_offset = a_index; @@ -2088,7 +1941,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, score -= gap_open_extend; score_row3 -= gap_extend; score_row3 = MAX(score, score_row3); - score_array[s_index].best_gap = MAX(score, + score_array[b_index].best_gap = MAX(score, score_gap_col - gap_extend); } } @@ -2102,22 +1955,16 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, if (first_b_index == b_size) break; - /* Enlarge the window for score data, if necessary. 
If the - window must be enlarged, move the window origin to - first_b_index */ - - if (b_size + num_extra_cells + 5 >= - score_array_size + score_array_origin) { - BlastGapSmallDP *new_array; - score_array_size = 2 * score_array_size; - new_array = (BlastGapSmallDP *)malloc(score_array_size * - sizeof(BlastGapSmallDP)); - memcpy(new_array, - score_array + (first_b_index - score_array_origin), - (b_size - first_b_index + 1) * sizeof(BlastGapSmallDP)); - sfree(score_array); - score_array = new_array; - score_array_origin = first_b_index; + /* Enlarge the window for score data, if necessary */ + + if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) { + + gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + score_array = (BlastGapDP *)realloc(score_array, + gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + gap_align->dp_mem = score_array; } if (last_b_index < b_size - 1) { @@ -2140,18 +1987,16 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, score = MAX(score, score_row3); while (score >= (best_score - x_dropoff) && b_size < N + 1) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = score_row1; - score_array[s_index].best_gap = score_row1 - gap_open_extend; + score_array[b_size].best = score_row1; + score_array[b_size].best_gap = score_row1 - gap_open_extend; score_row1 -= gap_extend; - score_array[s_index+1].best = score_row2; - score_array[s_index+1].best_gap = score_row2 - gap_open_extend; + score_array[b_size+1].best = score_row2; + score_array[b_size+1].best_gap = score_row2 - gap_open_extend; score_row2 -= gap_extend; - score_array[s_index+2].best = score_row3; - score_array[s_index+2].best_gap = score_row3 - gap_open_extend; + score_array[b_size+2].best = score_row3; + score_array[b_size+2].best_gap = score_row3 - gap_open_extend; score_row3 -= gap_extend; b_size += 3; @@ -2163,9 +2008,8 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, b_size = 
MIN(b_size, N + 1); last_b_index = MIN(b_size + 4, N + 3); while (b_size < last_b_index) { - Int4 s_index = b_size - score_array_origin; - score_array[s_index].best = MININT; - score_array[s_index].best_gap = MININT; + score_array[b_size].best = MININT; + score_array[b_size].best_gap = MININT; b_size++; } } @@ -2174,7 +2018,6 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N, /* The sequence was shifted, so length should be adjusted as well */ *b_offset -= 2; } - sfree(score_array); return best_score; } @@ -2620,9 +2463,7 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, Int4 b_index, b_size, first_b_index, last_b_index, b_increment; Uint1* b_ptr; - BlastGapSmallDP* score_array; - Int4 score_array_size; - Int4 score_array_origin; + BlastGapDP* score_array; Int4 num_extra_cells; Int4 gap_open; /* alignment penalty variables */ @@ -2666,21 +2507,21 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, set of DP computations. The initial window size is determined by the number of cells needed to fail the x-dropoff test */ - if (gap_extend > 0) { + if (gap_extend > 0) num_extra_cells = x_dropoff / gap_extend + 3; - score_array_size = 2 * num_extra_cells; - } - else { - num_extra_cells = 0; - score_array_size = N + 3; + else + num_extra_cells = N + 3; + + if (num_extra_cells > gap_align->dp_mem_alloc) { + gap_align->dp_mem_alloc = MAX(num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + sfree(gap_align->dp_mem); + gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); } + score_array = gap_align->dp_mem; score = -gap_open_extend; - score_array_size = MAX(100, score_array_size); - score_array_origin = 0; - - score_array = (BlastGapSmallDP*)malloc(score_array_size * - sizeof(BlastGapSmallDP)); score = -gap_open_extend; score_array[0].best = 0; score_array[0].best_gap = -gap_open_extend; @@ -2733,15 +2574,9 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, for (b_index = first_b_index; 
b_index < b_size; b_index++) { - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures */ - - Int4 s_index = b_index - score_array_origin; - b_ptr += b_increment; - score_gap_col = score_array[s_index].best_gap; - next_score = score_array[s_index].best + matrix_row[ *b_ptr ]; + score_gap_col = score_array[b_index].best_gap; + next_score = score_array[b_index].best + matrix_row[ *b_ptr ]; if (score < score_gap_col) score = score_gap_col; @@ -2764,7 +2599,7 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, if (b_index == first_b_index) first_b_index++; else - score_array[s_index].best = MININT; + score_array[b_index].best = MININT; } else { last_b_index = b_index; @@ -2775,16 +2610,16 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, } /* If starting a gap at this position will improve - the best row, or column, score, update them to + the best row or column score, update them to reflect that. */ score_gap_row -= gap_extend; score_gap_col -= gap_extend; - score_array[s_index].best_gap = MAX(score - gap_open_extend, + score_array[b_index].best_gap = MAX(score - gap_open_extend, score_gap_col); score_gap_row = MAX(score - gap_open_extend, score_gap_row); - score_array[s_index].best = score; + score_array[b_index].best = score; } score = next_score; @@ -2797,18 +2632,14 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, if (first_b_index == b_size) break; - if (last_b_index + num_extra_cells + 3 >= - score_array_size + score_array_origin) { - BlastGapSmallDP *new_array; - score_array_size = 2 * score_array_size; - new_array = (BlastGapSmallDP *)malloc(score_array_size * - sizeof(BlastGapSmallDP)); - memcpy(new_array, - score_array + (first_b_index - score_array_origin), - (last_b_index - first_b_index + 1) * sizeof(BlastGapSmallDP)); - sfree(score_array); - score_array = new_array; - score_array_origin = first_b_index; + if (last_b_index + num_extra_cells + 3 >= 
gap_align->dp_mem_alloc) { + + gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100, + 2 * gap_align->dp_mem_alloc); + score_array = (BlastGapDP *)realloc(score_array, + gap_align->dp_mem_alloc * + sizeof(BlastGapDP)); + gap_align->dp_mem = score_array; } if (last_b_index < b_size - 1) { @@ -2825,30 +2656,20 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M, The next inner loop will have larger bounds */ while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) { - - /* convert the current B offset into an offset - suitable for the current array of auxiliary - structures. */ - - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = score_gap_row; - score_array[s_index].best_gap = score_gap_row - gap_open_extend; + score_array[b_size].best = score_gap_row; + score_array[b_size].best_gap = score_gap_row - gap_open_extend; score_gap_row -= gap_extend; b_size++; } } if (b_size <= N) { - Int4 s_index = b_size - score_array_origin; - - score_array[s_index].best = MININT; - score_array[s_index].best_gap = MININT; + score_array[b_size].best = MININT; + score_array[b_size].best_gap = MININT; b_size++; } } - sfree(score_array); return best_score; } diff --git a/algo/blast/core/blast_gapalign.h b/algo/blast/core/blast_gapalign.h index 49525522..9b3767f6 100644 --- a/algo/blast/core/blast_gapalign.h +++ b/algo/blast/core/blast_gapalign.h @@ -1,4 +1,4 @@ -/* $Id: blast_gapalign.h,v 1.62 2005/04/27 19:47:57 dondosha Exp $ +/* $Id: blast_gapalign.h,v 1.63 2005/11/30 18:30:00 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -53,6 +53,15 @@ extern "C" { /** Split subject sequences if longer than this */ #define MAX_DBSEQ_LEN 5000000 +/** Auxiliary structure for dynamic programming gapped extension */ +typedef struct { + Int4 best; /**< score of best path that ends in a match + at this position */ + Int4 best_gap; /**< score of best path that ends in a gap + 
at this position */ +} BlastGapDP; + + /** Structure supporting the gapped alignment */ typedef struct BlastGapAlignStruct { Boolean positionBased; /**< Is this PSI-BLAST? */ @@ -63,6 +72,8 @@ typedef struct BlastGapAlignStruct { GapPrelimEditBlock *rev_prelim_tback; /**< traceback from right extensions */ SGreedyAlignMem* greedy_align_mem;/**< Preallocated memory for the greedy gapped extension */ + BlastGapDP* dp_mem; /**< scratch structures for dynamic programming */ + Int4 dp_mem_alloc; /**< current number of structures allocated */ BlastScoreBlk* sbp; /**< Pointer to the scoring information block */ Int4 gap_x_dropoff; /**< X-dropoff parameter to use */ Int4 query_start; /**< query start offset of current alignment */ diff --git a/algo/blast/core/blast_gapalign_priv.h b/algo/blast/core/blast_gapalign_priv.h index 83f628d6..e5975689 100644 --- a/algo/blast/core/blast_gapalign_priv.h +++ b/algo/blast/core/blast_gapalign_priv.h @@ -1,7 +1,7 @@ #ifndef ALGO_BLAST_CORE___BLAST_GAPALIGN_PRI__H #define ALGO_BLAST_CORE___BLAST_GAPALIGN_PRI__H -/* $Id: blast_gapalign_priv.h,v 1.11 2005/05/02 13:07:34 madden Exp $ +/* $Id: blast_gapalign_priv.h,v 1.12 2005/11/30 18:25:03 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -39,26 +39,6 @@ extern "C" { #endif -/** Auxiliary structure for dynamic programming gapped extension */ -typedef struct BlastGapDP { - Int4 best; /**< score of best path that ends in a match - at this position */ - Int4 best_gap; /**< score of best path that ends in a gap - at this position */ - Int4 best_decline; /**< score of best path that ends in a decline - at this position */ -} BlastGapDP; - -/** Reduced version of BlastGapDP, for alignments that - * don't use a decline penalty - */ -typedef struct { - Int4 best; /**< score of best path that ends in a match - at this position */ - Int4 best_gap; /**< score of best path that ends in a gap - at this position */ -} 
BlastGapSmallDP; - Int4 ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset, Int4* b_offset, GapPrelimEditBlock *edit_block, @@ -159,6 +139,9 @@ void RPSPsiMatrixDetach(BlastScoreBlk* sbp); * =========================================================================== * * $Log: blast_gapalign_priv.h,v $ + * Revision 1.12 2005/11/30 18:25:03 papadopo + * move BlastGapDP, remove BlastGapSmallDP + * * Revision 1.11 2005/05/02 13:07:34 madden * Remove Blast_CheckHSPsForCommonEndpoints * diff --git a/algo/blast/core/blast_hits.c b/algo/blast/core/blast_hits.c index 9480e947..9a72ee04 100644 --- a/algo/blast/core/blast_hits.c +++ b/algo/blast/core/blast_hits.c @@ -1,4 +1,4 @@ -/* $Id: blast_hits.c,v 1.169 2005/08/15 16:11:20 dondosha Exp $ +/* $Id: blast_hits.c,v 1.173 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -33,7 +32,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_hits.c,v 1.169 2005/08/15 16:11:20 dondosha Exp $"; + "$Id: blast_hits.c,v 1.173 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_options.h> @@ -1599,6 +1598,7 @@ Int2 Blast_HSPListGetEvalues(const BlastQueryInfo* query_info, ASSERT(hsp != NULL); ASSERT(scaling_factor != 0.0); + ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0); /* Divide Lambda by the scaling factor, so e-value is calculated correctly from a scaled score. This is needed only @@ -1638,6 +1638,7 @@ Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list, for (index=0; index<hsp_list->hspcnt; index++) { hsp = hsp_list->hsp_array[index]; ASSERT(hsp != NULL); + ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0); hsp->bit_score = (hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) / NCBIMATH_LN2; @@ -2189,6 +2190,8 @@ Blast_HSPListReevaluateWithAmbiguities(EBlastProgramType program, /* Sort the HSP array by score (scores may have changed!) 
*/ Blast_HSPListSortByScore(hsp_list); + Blast_HSPListAdjustOddBlastnScores(hsp_list, gapped, sbp); + return status; } @@ -2440,12 +2443,18 @@ void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset) } } -void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list) +void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list, Boolean gapped_calculation, BlastScoreBlk* sbp) { int index; if (!hsp_list || hsp_list->hspcnt == 0) return; + + if (gapped_calculation == FALSE) + return; + + if (sbp->round_down == FALSE) + return; for (index = 0; index < hsp_list->hspcnt; ++index) { hsp_list->hsp_array[index]->score -= @@ -2985,9 +2994,6 @@ Blast_HSPResultsSaveRPSHSPList(EBlastProgramType program, Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* results, BlastHSPList* hsp_list, const SBlastHitsParameters* blasthit_params) { - Int2 status = 0; - BlastHSP* hsp; - if (!hsp_list) return 0; @@ -3002,46 +3008,59 @@ Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* res /* Rearrange HSPs into multiple hit lists if more than one query */ if (results->num_queries > 1) { + BlastHSP* hsp; BlastHSPList** hsp_list_array; BlastHSPList* tmp_hsp_list; Int4 index; hsp_list_array = calloc(results->num_queries, sizeof(BlastHSPList*)); + if (hsp_list_array == NULL) + return -1; for (index = 0; index < hsp_list->hspcnt; index++) { + Boolean can_insert = TRUE; Int4 query_index; hsp = hsp_list->hsp_array[index]; query_index = Blast_GetQueryIndexFromContext(hsp->context, program); - tmp_hsp_list = hsp_list_array[query_index]; - if (!tmp_hsp_list) { + if (!(tmp_hsp_list = hsp_list_array[query_index])) { hsp_list_array[query_index] = tmp_hsp_list = Blast_HSPListNew(blasthit_params->options->hsp_num_max); + if (tmp_hsp_list == NULL) + { + sfree(hsp_list_array); + return -1; + } tmp_hsp_list->oid = hsp_list->oid; } - if (!tmp_hsp_list || tmp_hsp_list->do_not_reallocate) { - tmp_hsp_list = NULL; - } else if (tmp_hsp_list->hspcnt >= 
tmp_hsp_list->allocated) { - BlastHSP** new_hsp_array; - Int4 new_size = - MIN(2*tmp_hsp_list->allocated, tmp_hsp_list->hsp_max); - if (new_size == tmp_hsp_list->hsp_max) - tmp_hsp_list->do_not_reallocate = TRUE; + if (tmp_hsp_list->hspcnt >= tmp_hsp_list->allocated) { + if (tmp_hsp_list->do_not_reallocate == FALSE) { + BlastHSP** new_hsp_array; + Int4 new_size = + MIN(2*tmp_hsp_list->allocated, tmp_hsp_list->hsp_max); + if (new_size == tmp_hsp_list->hsp_max) + tmp_hsp_list->do_not_reallocate = TRUE; - new_hsp_array = realloc(tmp_hsp_list->hsp_array, + new_hsp_array = realloc(tmp_hsp_list->hsp_array, new_size*sizeof(BlastHSP*)); - if (!new_hsp_array) { - tmp_hsp_list->do_not_reallocate = TRUE; - tmp_hsp_list = NULL; - } else { - tmp_hsp_list->hsp_array = new_hsp_array; - tmp_hsp_list->allocated = new_size; + if (!new_hsp_array) { + tmp_hsp_list->do_not_reallocate = TRUE; + can_insert = FALSE; + } else { + tmp_hsp_list->hsp_array = new_hsp_array; + tmp_hsp_list->allocated = new_size; + } + } + else + { + can_insert = FALSE; } } - if (tmp_hsp_list) { + if (can_insert) { tmp_hsp_list->hsp_array[tmp_hsp_list->hspcnt++] = hsp; } else { + /* FIXME: what if this is not the least significant HSP?? 
*/ /* Cannot add more HSPs; free the memory */ hsp_list->hsp_array[index] = Blast_HSPFree(hsp); } @@ -3077,7 +3096,7 @@ Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* res Blast_HSPListFree(hsp_list); } - return status; + return 0; } Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results, diff --git a/algo/blast/core/blast_hits.h b/algo/blast/core/blast_hits.h index d771faf3..f458e6fe 100644 --- a/algo/blast/core/blast_hits.h +++ b/algo/blast/core/blast_hits.h @@ -1,4 +1,4 @@ -/* $Id: blast_hits.h,v 1.83 2005/08/15 16:09:58 dondosha Exp $ +/* $Id: blast_hits.h,v 1.84 2005/09/27 14:42:20 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -521,10 +521,12 @@ void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset); * random alignments are dominated by runs of exact matches, which all have even * scores. This makes it impossible to estimate statistical parameters correctly * for odd scores. Hence the raw score formula is adjusted - all scores are - * rounded down to the nearest even value. + * rounded down to the nearest even value in order to provide a conservative estimate. * @param hsp_list HSP list structure to adjust scores for. [in] [out] + * @param gapped_calculation not an ungapped alignment [in] + * @param sbp used for round_down Boolean */ -void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list); +void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list, Boolean gapped_calculation, BlastScoreBlk* sbp); /** Check if HSP list is sorted by score. 
* @param hsp_list The list to check [in] diff --git a/algo/blast/core/blast_inline.h b/algo/blast/core/blast_inline.h index d3ad23c6..000fc097 100644 --- a/algo/blast/core/blast_inline.h +++ b/algo/blast/core/blast_inline.h @@ -1,4 +1,4 @@ -/* $Id: blast_inline.h,v 1.8 2005/06/23 16:18:46 camacho Exp $ +/* $Id: blast_inline.h,v 1.9 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_itree.c b/algo/blast/core/blast_itree.c index e43e3866..ab0745fa 100644 --- a/algo/blast/core/blast_itree.c +++ b/algo/blast/core/blast_itree.c @@ -1,4 +1,4 @@ -/* $Id: blast_itree.c,v 1.10 2005/04/27 14:52:08 papadopo Exp $ +/* $Id: blast_itree.c,v 1.11 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -33,7 +33,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_itree.c,v 1.10 2005/04/27 14:52:08 papadopo Exp $"; + "$Id: blast_itree.c,v 1.11 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include "blast_itree.h" diff --git a/algo/blast/core/blast_itree.h b/algo/blast/core/blast_itree.h index 9e19fcf1..1bec4856 100644 --- a/algo/blast/core/blast_itree.h +++ b/algo/blast/core/blast_itree.h @@ -1,4 +1,4 @@ -/* $Id: blast_itree.h,v 1.4 2005/04/27 14:52:08 papadopo Exp $ +/* $Id: blast_itree.h,v 1.5 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
diff --git a/algo/blast/core/blast_kappa.c b/algo/blast/core/blast_kappa.c index dddc0313..2af2194e 100644 --- a/algo/blast/core/blast_kappa.c +++ b/algo/blast/core/blast_kappa.c @@ -1,4 +1,4 @@ -/* $Id: blast_kappa.c,v 1.59 2005/07/21 13:51:19 camacho Exp $ +/* $Id: blast_kappa.c,v 1.62 2005/12/02 17:16:51 madden Exp $ * ========================================================================== * * PUBLIC DOMAIN NOTICE @@ -28,15 +28,16 @@ */ /** @file blast_kappa.c - * Utilities for doing Smith-Waterman alignments and adjusting the scoring + * Utilities for doing Smith-Waterman alignments and adjusting the scoring * system for each match in blastpgp */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = - "$Id: blast_kappa.c,v 1.59 2005/07/21 13:51:19 camacho Exp $"; +static char const rcsid[] = +"$Id: blast_kappa.c,v 1.62 2005/12/02 17:16:51 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ +#include <float.h> #include <algo/blast/core/blast_def.h> #include <algo/blast/core/blast_hits.h> #include <algo/blast/core/blast_stat.h> @@ -52,13 +53,12 @@ static char const rcsid[] = #include "blast_posit.h" #include "blast_hits_priv.h" -/** by what factor might initially reported E-value exceed true Evalue */ -#define EVALUE_STRETCH 5 - -/** For translated subject sequences, the number of amino acids to - include before and after the existing aligned segment when - generating a composition-based scoring system. */ -#define KAPPA_WINDOW_BORDER 200 +#include <algo/blast/composition_adjustment/nlm_linear_algebra.h> +#include <algo/blast/composition_adjustment/composition_constants.h> +#include <algo/blast/composition_adjustment/composition_adjustment.h> +#include <algo/blast/composition_adjustment/compo_heap.h> +#include <algo/blast/composition_adjustment/smith_waterman.h> +#include <algo/blast/composition_adjustment/redo_alignment.h> /** * Scale the scores in an HSP list and reset the bit scores. 
@@ -70,20 +70,21 @@ static char const rcsid[] = * @todo rename to something which is more intention revealing, merge with * function of the same name in blast_traceback.c */ +/* WHY */ static void s_HSPListRescaleScores(BlastHSPList * hsp_list, - double lambda, - double logK, - double scoreDivisor) + double lambda, + double logK, + double scoreDivisor) { - int hsp_index; - for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) { - BlastHSP * hsp = hsp_list->hsp_array[hsp_index]; - - hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor); - /* Compute the bit score using the newly computed scaled score. */ - hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2; - } + int hsp_index; + for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) { + BlastHSP * hsp = hsp_list->hsp_array[hsp_index]; + + hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor); + /* Compute the bit score using the newly computed scaled score. */ + hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2; + } } /** @@ -97,121 +98,78 @@ s_HSPListRescaleScores(BlastHSPList * hsp_list, * @param hspcnt length of hsp_array */ static void -HitlistReapContained( - BlastHSP * hsp_array[], - Int4 * hspcnt) +s_HitlistReapContained( + BlastHSP * hsp_array[], + Int4 * hspcnt) { - Int4 iread; /* iteration index used to read the hitlist */ - Int4 iwrite; /* iteration index used to write to the hitlist */ - Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */ - - old_hspcnt = *hspcnt; - - for( iread = 1; iread < *hspcnt; iread++ ) { - /* for all HSPs in the hitlist */ - Int4 ireadBack; /* iterator over indices less than iread */ - BlastHSP *hsp1; /* an HSP that is a candidate for deletion */ - - hsp1 = hsp_array[iread]; - for( ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++ ) { - /* for all HSPs before hsp1 in the hitlist and while hsp1 has not - been deleted */ - BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array - * than 
hsp1 */ - hsp2 = hsp_array[ireadBack]; - - if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */ - continue; - } - if(SIGN(hsp2->query.frame) == SIGN(hsp1->query.frame) && - SIGN(hsp2->subject.frame) == SIGN(hsp1->subject.frame)) { - /* hsp1 and hsp2 are in the same query/subject frame. */ - if(CONTAINED_IN_HSP - (hsp2->query.offset, hsp2->query.end, hsp1->query.offset, - hsp2->subject.offset, hsp2->subject.end, - hsp1->subject.offset) && - CONTAINED_IN_HSP - (hsp2->query.offset, hsp2->query.end, hsp1->query.end, - hsp2->subject.offset, hsp2->subject.end, - hsp1->subject.end) && - hsp1->score <= hsp2->score) { - hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]); - } - } /* end if hsp1 and hsp2 are in the same query/subject frame */ - } /* end for all HSPs before hsp1 in the hitlist */ - } /* end for all HSPs in the hitlist */ - - /* Condense the hsp_array, removing any NULL items. */ - iwrite = 0; - for( iread = 0; iread < *hspcnt; iread++ ) { - if( hsp_array[iread] != NULL ) { - hsp_array[iwrite++] = hsp_array[iread]; - } - } - *hspcnt = iwrite; - /* Fill the remaining memory in hsp_array with NULL pointers. 
*/ - for( ; iwrite < old_hspcnt; iwrite++ ) { - hsp_array[iwrite] = NULL; - } + Int4 iread; /* iteration index used to read the hitlist */ + Int4 iwrite; /* iteration index used to write to the hitlist */ + Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */ + + old_hspcnt = *hspcnt; + + for (iread = 1; iread < *hspcnt; iread++) { + /* for all HSPs in the hitlist */ + Int4 ireadBack; /* iterator over indices less than iread */ + BlastHSP *hsp1; /* an HSP that is a candidate for deletion */ + + hsp1 = hsp_array[iread]; + for (ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++) { + /* for all HSPs before hsp1 in the hitlist and while hsp1 has not + been deleted */ + BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array + * than hsp1 */ + hsp2 = hsp_array[ireadBack]; + + if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */ + continue; + } + if (SIGN(hsp2->query.frame) == SIGN(hsp1->query.frame) && + SIGN(hsp2->subject.frame) == SIGN(hsp1->subject.frame)) { + /* hsp1 and hsp2 are in the same query/subject frame. */ + if (CONTAINED_IN_HSP + (hsp2->query.offset, hsp2->query.end, hsp1->query.offset, + hsp2->subject.offset, hsp2->subject.end, + hsp1->subject.offset) && + CONTAINED_IN_HSP + (hsp2->query.offset, hsp2->query.end, hsp1->query.end, + hsp2->subject.offset, hsp2->subject.end, + hsp1->subject.end) && + hsp1->score <= hsp2->score) { + hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]); + } + } /* end if hsp1 and hsp2 are in the same query/subject frame */ + } /* end for all HSPs before hsp1 in the hitlist */ + } /* end for all HSPs in the hitlist */ + + /* Condense the hsp_array, removing any NULL items. */ + iwrite = 0; + for (iread = 0; iread < *hspcnt; iread++) { + if (hsp_array[iread] != NULL) { + hsp_array[iwrite++] = hsp_array[iread]; + } + } + *hspcnt = iwrite; + /* Fill the remaining memory in hsp_array with NULL pointers. 
*/ + for ( ; iwrite < old_hspcnt; iwrite++) { + hsp_array[iwrite] = NULL; + } } -/** - * An object of type Kappa_DistinctAlignment represents a distinct - * alignment of the query sequence to the current subject sequence. - * These objects are typically part of a singly linked list of - * distinct alignments, stored in the reverse of the order in which - * they were computed. - */ -typedef struct Kappa_DistinctAlignment { - Int4 score; /**< the score of this alignment */ - Int4 queryStart; /**< the start of the alignment in the query */ - Int4 queryEnd; /**< one past the end of the alignment in the query */ - Int4 matchStart; /**< the start of the alignment in the subject */ - Int4 matchEnd; /**< one past the end of the alignment in the - subject */ - Int4 frame; /**< the subject frame */ - GapEditScript * editScript; /**< the alignment info for a gapped - alignment */ - struct Kappa_DistinctAlignment * next; /**< the next alignment in the - list */ -} Kappa_DistinctAlignment; - - -/** - * Recursively free all alignments in the singly linked list whose - * head is *palign. Set *palign to NULL. - * - * @param palign pointer to the head of a singly linked list - * of alignments. - */ -static void -Kappa_DistinctAlignmentsFree(Kappa_DistinctAlignment ** palign) +static void s_FreeEditScript(void * edit_script) { - Kappa_DistinctAlignment * align; /* represents the current - alignment in loops */ - align = *palign; *palign = NULL; - while(align != NULL) { - /* Save the value of align->next, because align is to be deleted. 
*/ - Kappa_DistinctAlignment * align_next = align->next; - align_next = align->next; - - if(align->editScript) { - GapEditScriptDelete(align->editScript); - } - sfree(align); - - align = align_next; - } + if (edit_script != NULL) + GapEditScriptDelete(edit_script); } /** - * Converts a list of objects of type Kappa_DistinctAlignment to an + * Converts a list of objects of type BlastCompo_Alignment to an * new object of type BlastHSPList and returns the result. Conversion * in this direction is lossless. The list passed to this routine is * freed to ensure that there is no aliasing of fields between the - * list of Kappa_DistinctAlignments and the new hitlist. + * list of BlastCompo_Alignments and the new hitlist. * * @param alignments A list of distinct alignments; freed before return [in] * @param oid Ordinal id of a database sequence [in] @@ -219,1454 +177,97 @@ Kappa_DistinctAlignmentsFree(Kappa_DistinctAlignment ** palign) */ static BlastHSPList * s_HSPListFromDistinctAlignments( - Kappa_DistinctAlignment ** alignments, - int oid) -{ - const int unknown_value = 0; - BlastHSPList * hsp_list = Blast_HSPListNew(0); - Kappa_DistinctAlignment * align; - - hsp_list->oid = oid; - - for(align = *alignments; NULL != align; align = align->next) { - BlastHSP * new_hsp = NULL; - - Blast_HSPInit(align->queryStart, align->queryEnd, - align->matchStart, align->matchEnd, - unknown_value, unknown_value, - 0, 0, align->frame, align->score, - &align->editScript, &new_hsp); - - /* At this point, the subject and possibly the query sequence have - * been filtered; since it is not clear that num_ident of the - * filtered sequences, rather than the original, is desired, - * explictly leave num_ident blank. 
*/ - new_hsp->num_ident = 0; - - Blast_HSPListSaveHSP(hsp_list, new_hsp); - } - Kappa_DistinctAlignmentsFree(alignments); - Blast_HSPListSortByScore(hsp_list); - - return hsp_list; -} - - -/** - * Given a list of alignments and a new alignment, create a new list - * of alignments that conditionally includes the new alignment. - * - * If there is an equal or higher-scoring alignment in the preexisting - * list of alignments that shares an endpoint with the new alignment, - * then preexisting list is returned. Otherwise, a new list is - * returned with the new alignment as its head and the elements of - * preexisting list that do not share an endpoint with the new - * alignment as its tail. The order of elements is preserved. - * - * Typically, a list of alignments is built one alignment at a time - * through a call to withDistinctEnds. All alignments in the resulting - * list have distinct endpoints. Which items are retained in the list - * depends on the order in which they were added. - * - * Note that an endpoint is a triple, specifying a frame, a location - * in the query and a location in the subject. In other words, - * alignments that are not in the same frame never share endpoints. - * - * @param p_newAlign on input the alignment that may be added to - * the list; on output NULL - * @param p_oldAlignments on input the existing list of alignments; - * on output the new list - */ -static void -withDistinctEnds( - Kappa_DistinctAlignment **p_newAlign, - Kappa_DistinctAlignment **p_oldAlignments) -{ - /* Deference the input parameters. 
*/ - Kappa_DistinctAlignment * newAlign = *p_newAlign; - Kappa_DistinctAlignment * oldAlignments = *p_oldAlignments; - Kappa_DistinctAlignment * align; /* represents the current - alignment in loops */ - Boolean include_new_align; /* true if the new alignment - may be added to the list */ - *p_newAlign = NULL; - include_new_align = 1; - - for(align = oldAlignments; align != NULL; align = align->next) { - if(align->frame == newAlign->frame && - ( ( align->queryStart == newAlign->queryStart - && align->matchStart == newAlign->matchStart) - || ( align->queryEnd == newAlign->queryEnd - && align->matchEnd == newAlign->matchEnd))) { - /* At least one of the endpoints of newAlign matches an endpoint - of align. */ - if( newAlign->score <= align->score ) { - /* newAlign cannot be added to the list. */ - include_new_align = 0; - break; - } - } - } - - if(include_new_align) { - Kappa_DistinctAlignment **tail; /* tail of the list being created */ - - tail = &newAlign->next; - align = oldAlignments; - while(align != NULL) { - /* Save align->next because align may be deleted. */ - Kappa_DistinctAlignment * align_next = align->next; - align->next = NULL; - if(align->frame == newAlign->frame && - ( ( align->queryStart == newAlign->queryStart - && align->matchStart == newAlign->matchStart) - || ( align->queryEnd == newAlign->queryEnd - && align->matchEnd == newAlign->matchEnd))) { - /* The alignment shares an end with newAlign; */ - /* delete the alignment. */ - Kappa_DistinctAlignmentsFree(&align); - } else { /* The alignment does not share an end with newAlign; */ - /* add it to the output list. 
*/ - *tail = align; - tail = &align->next; - } - align = align_next; - } /* end while align != NULL */ - *p_oldAlignments = newAlign; - } else { /* do not include_new_align */ - Kappa_DistinctAlignmentsFree(&newAlign); - } /* end else do not include newAlign */ -} - - -/** - * The number of bits by which the score of a previously computed - * alignment must exceed the score of the HSP under consideration for - * a containment relationship to be reported by the isAlreadyContained - * routine. */ -#define KAPPA_BIT_TOL 2 - - -/** - * Return true if the HSP is already contained in a - * previously-computed alignment of sufficiently high score. - * - * @param hsp HSP to be tested - * @param alignments list of alignments - * @param lambda Karlin-Altschul statistical parameter - * @param localScalingFactor factor by which scores were scaled to - * obtain higher precision - */ - -static Boolean -isAlreadyContained( - BlastHSP * hsp, - Kappa_DistinctAlignment * alignments, - double lambda, - double localScalingFactor) -{ - Kappa_DistinctAlignment * align; /* represents the current alignment - in the main loop */ - double scoreTol; /* the amount by which the score of the current - alignment must exceed the score of the HSP for a - containment relationship to be reported. 
*/ - scoreTol = KAPPA_BIT_TOL * NCBIMATH_LN2/lambda; - - for( align = alignments; align != NULL; align = align->next ) { - /* for all elements of alignments */ - if(SIGN(hsp->query.frame) == SIGN(align->frame)) { - /* hsp1 and hsp2 are in the same query/subject frame */ - if(CONTAINED_IN_HSP - (align->queryStart, align->queryEnd, hsp->query.offset, - align->matchStart, align->matchEnd, hsp->subject.offset) && - CONTAINED_IN_HSP - (align->queryStart, align->queryEnd, hsp->query.end, - align->matchStart, align->matchEnd, hsp->subject.end) && - hsp->score * localScalingFactor + scoreTol <= align->score) { - return 1; - } - } /* hsp1 and hsp2 are in the same query/subject frame */ - } /* end for all items in alignments */ - - return 0; -} - - -/** - * The struct SWheapRecord data type is used below to define the - * internal structure of a SWheap (see below). A SWheapRecord - * represents all alignments of a query sequence to a particular - * matching sequence. - */ -typedef struct SWheapRecord { - double bestEvalue; /**< best (smallest) evalue of all alignments - in the record */ - Int4 bestScore; /**< best (largest) score; used to break - ties between records with the same - e-value */ - Int4 subject_index; /**< index of the subject sequence in - the database */ - BlastHSPList * theseAlignments; /**< a list of alignments */ -} SWheapRecord; - - -/** Compare two records in the heap. */ -static Boolean -SWheapRecordCompare(SWheapRecord * place1, - SWheapRecord * place2) -{ - int result; - if(0 == (result = BLAST_CMP(place1->bestEvalue, place2->bestEvalue)) && - 0 == (result = BLAST_CMP(place2->bestScore, place1->bestScore))) { - result = BLAST_CMP(place1->subject_index, place2->subject_index); - } - return result > 0; -} - - -/** swap two records in the heap*/ -static void -SWheapRecordSwap(SWheapRecord * record1, - SWheapRecord * record2) -{ - /* bestEvalue, bestScore, theseAlignments and subject_index are temporary - * variables used to perform the swap. 
*/ - double bestEvalue; - Int4 bestScore, subject_index; - BlastHSPList * theseAlignments; - - bestEvalue = record1->bestEvalue; - record1->bestEvalue = record2->bestEvalue; - record2->bestEvalue = bestEvalue; - - bestScore = record1->bestScore; - record1->bestScore = record2->bestScore; - record2->bestScore = bestScore; - - subject_index = record1->subject_index; - record1->subject_index = record2->subject_index; - record2->subject_index = subject_index; - - theseAlignments = record1->theseAlignments; - record1->theseAlignments = record2->theseAlignments; - record2->theseAlignments = theseAlignments; -} - - -/** - * Verifies that the array heapArray[i] .. heapArray[n] is ordered so - * as to be a valid heap. This routine checks every element in the array, - * an so is very time consuming. It is for debugging purposes only. - */ -static Boolean -SWheapIsValid(SWheapRecord * heapArray, - Int4 i, - Int4 n) -{ - /* indices of nodes to the left and right of node i */ - Int4 left = 2 * i, right = 2 * i + 1; - - if(right <= n) { - return !SWheapRecordCompare(&(heapArray[right]), &(heapArray[i])) && - SWheapIsValid(heapArray, right, n); - } - if(left <= n) { - return !SWheapRecordCompare(&(heapArray[left]), &(heapArray[i])) && - SWheapIsValid(heapArray, left, n); - } - return TRUE; -} - -/** convenience debugging macro for this module */ -#ifdef KAPPA_INTENSE_DEBUG -#define KAPPA_ASSERT(expr) ((expr) ? 0 : \ -(fprintf( stderr, "KAPPA_ASSERT failed line %d: %s", __LINE__, #expr ), \ -exit(1))) -#else -#define KAPPA_ASSERT(expr) (void)(0) -#endif - - -/** On entry, all but the first element of the array heapArray[i] to - * heapArray[n] are in valid heap order. This routine rearranges - * the elements so that on exit they all are in heap order. 
- * @param heapArray holds the heap [in][out] - * @param i element of heapArray that may be out of order [in] - * @param n size of heapArray [in] - */ -static void -SWheapifyDown(SWheapRecord * heapArray, - Int4 i, - Int4 n) -{ - Boolean moreswap = TRUE; /* is more swapping needed */ - Int4 left, right, largest; /* placeholders for indices in swapping */ - do { - left = 2 * i; - right = 2 * i + 1; - if((left <= n) && - (SWheapRecordCompare(&(heapArray[left]), &(heapArray[i])))) - largest = left; - else - largest = i; - if((right <= n) && - (SWheapRecordCompare(&(heapArray[right]), &(heapArray[largest])))) - largest = right; - if(largest != i) { - SWheapRecordSwap(&heapArray[i], &heapArray[largest]); - /* push largest up the heap */ - i = largest; /* check next level down */ - } else - moreswap = FALSE; - } while(moreswap); /* function builds the heap */ - KAPPA_ASSERT(SWheapIsValid(heapArray, i, n)); -} - - -/** On entry, all but the last element of the array heapArray[0] to - * heapArray[i] are in valid heap order. This routine rearranges - * the elements so that on exit they all are in heap order. - * - * @param heapArray holds the heap [in][out] - * @param i element in heap array that may be out of order [in] - * @param n size of heapArray - */ -static void -SWheapifyUp(SWheapRecord * heapArray, - Int4 i, - Int4 n) -{ - Int4 parent = i / 2; /* index to the node that is the - parent of node i */ - while(parent >= 1 && - SWheapRecordCompare(&(heapArray[i]), &(heapArray[parent]))){ - SWheapRecordSwap(&heapArray[i], &heapArray[parent]); - - i = parent; - parent /= 2; - } - KAPPA_ASSERT(SWheapIsValid(heapArray, 1, n)); -} - -/** - * A SWheap represents a collection of alignments between one query - * sequence and several matching subject sequences. - * - * Each matching sequence is allocated one record in a SWheap. The - * eValue of a query-subject pair is the best (smallest positive) - * evalue of all alignments between the two sequences. 
- * - * A match will be inserted in the the SWheap if: - * - there are fewer that SWheap::heapThreshold elements in the SWheap; - * - the eValue of the match is <= SWheap::ecutoff; or - * - the eValue of the match is less than the largest (worst) eValue - * already in the SWheap. - * - * If there are >= SWheap::heapThreshold matches already in the SWheap - * when a new match is to be inserted, then the match with the largest - * (worst) eValue is removed, unless the largest eValue <= - * SWheap::ecutoff. Matches with eValue <= SWheap::ecutoff are never - * removed by the insertion routine. As a consequence, the SWheap can - * hold an arbitrarily large number of matches, although it is - * atypical for the number of matches to be greater than - * SWheap::heapThreshold. - * - * Once all matches have been collected, the SWheapToFlatList routine - * may be invoked to return a list of all alignments. (see below). - * - * While the number of elements in a heap < SWheap::heapThreshold, the - * SWheap is implemented as an unordered array, rather than a - * heap-ordered array. The SWheap is converted to a heap-ordered - * array as soon as it becomes necessary to order the matches by - * evalue. The routines that operate on a SWheap should behave - * properly whichever state the SWheap is in. - */ -typedef struct SWheap { - Int4 n; /**< The current number of elements */ - Int4 capacity; /**< The maximum number of elements - that may be inserted before the - SWheap must be resized, this - number must be >= heapThreshold */ - Int4 heapThreshold; /**< see above */ - double ecutoff; /**< matches with evalue below ecutoff may - always be inserted in the SWheap */ - double worstEvalue; /**< the worst (biggest) evalue currently in - the heap */ - - SWheapRecord *array; /**< the SWheapRecord array if the SWheap is - being represented as an unordered array */ - SWheapRecord *heapArray; /**< the SWheapRecord array if the SWheap is - being represented as an heap-ordered - array. 
At least one of (array, heapArray) - is NULL */ - -} SWheap; - - -/** Convert a SWheap from a representation as an unordered array to - * a representation as a heap-ordered array. - * - * @param self the SWheap to convert - */ -static void -ConvertToHeap(SWheap * self) + BlastCompo_Alignment ** alignments, + int oid) { - if(NULL != self->array) { - Int4 i; /* heap node index */ - Int4 n; /* number of elements in the heap */ - /* We aren't already a heap */ - self->heapArray = self->array; - self->array = NULL; - - n = self->n; - for(i = n / 2; i >= 1; --i) { - SWheapifyDown(self->heapArray, i, n); + const int unknown_value = 0; + BlastHSPList * hsp_list = Blast_HSPListNew(0); + BlastCompo_Alignment * align; + + hsp_list->oid = oid; + + for (align = *alignments; NULL != align; align = align->next) { + BlastHSP * new_hsp = NULL; + GapEditScript * editScript = align->context; + align->context = NULL; + Blast_HSPInit(align->queryStart, align->queryEnd, + align->matchStart, align->matchEnd, + unknown_value, unknown_value, + 0, 0, align->frame, align->score, + &editScript, &new_hsp); + + /* At this point, the subject and possibly the query sequence have + * been filtered; since it is not clear that num_ident of the + * filtered sequences, rather than the original, is desired, + * explictly leave num_ident blank. */ + new_hsp->num_ident = 0; + + Blast_HSPListSaveHSP(hsp_list, new_hsp); } - } - KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n)); -} + BlastCompo_AlignmentsFree(alignments, s_FreeEditScript); + Blast_HSPListSortByScore(hsp_list); -/** When the heap is about to exceed its capacity, it will be grown by - * the minimum of a multiplicative factor of SWHEAP_RESIZE_FACTOR - * and an additive factor of SWHEAP_MIN_RESIZE. The heap never - * decreases in size */ -#define SWHEAP_RESIZE_FACTOR 1.5 -/** @sa SWHEAP_RESIZE_FACTOR */ -#define SWHEAP_MIN_RESIZE 100 - -/** Return true if self may insert a match that had the given eValue, - * score and subject_index. 
- * - * @param self a SWheap - * @param eValue the evalue to be tested. - * @param score the score to be tested - * @param subject_index the subject_index to be tested. - */ -static Boolean -SWheapWouldInsert(SWheap * self, - double eValue, - Int4 score, - Int4 subject_index) -{ - if(self->n < self->heapThreshold || - eValue <= self->ecutoff || - eValue < self->worstEvalue) { - return TRUE; - } else { - /* self is either currently a heap, or must be converted to one; - * use SWheapRecordCompare to compare against the worst element in - * the heap */ - SWheapRecord heapRecord; /* temporary record to compare against */ - - if(self->heapArray == NULL) ConvertToHeap(self); - - heapRecord.bestEvalue = eValue; - heapRecord.bestScore = score; - heapRecord.subject_index = subject_index; - heapRecord.theseAlignments = NULL; - - return SWheapRecordCompare(&self->heapArray[1], &heapRecord); - } + return hsp_list; } -/** - * Try to insert matchRecord into the SWheap. The list of SeqAligns - * passed to this routine is used directly, i.e. the list is not copied, - * but is rather stored in the SWheap or deleted. - * - * @param self the heap - * @param alignments a list of alignments - * @param eValue the best evalue among the alignments - * @param score the best score among the alignments - * @param subject_index the index of the subject sequence in the database - */ static void -SWheapInsert( - SWheap * self, - BlastHSPList * alignments, - double eValue, - Int4 score, - Int4 subject_index) +s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue, + BlastHSPList * hsp_list, + int subject_length, + EBlastProgramType program_number, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + const BlastHitSavingParameters* hitParams, + int do_link_hsps) { - if(self->array && self->n >= self->heapThreshold) { - ConvertToHeap(self); - } - if(self->array != NULL) { - /* "self" is currently a list. 
Add the new alignments to the end */ - SWheapRecord *heapRecord; /* destination for the new alignments */ - heapRecord = &self->array[++self->n]; - heapRecord->bestEvalue = eValue; - heapRecord->bestScore = score; - heapRecord->theseAlignments = alignments; - heapRecord->subject_index = subject_index; - if( self->worstEvalue < eValue ) { - self->worstEvalue = eValue; - } - } else { /* "self" is currently a heap */ - if(self->n < self->heapThreshold || - (eValue <= self->ecutoff && - self->worstEvalue <= self->ecutoff)) { - SWheapRecord *heapRecord; /* Destination for the new alignments */ - /* The new alignments must be inserted into the heap, and all old - * alignments retained */ - if(self->n >= self->capacity) { - /* The heap must be resized */ - Int4 newCapacity; /* capacity the heap will have after - * it is resized */ - newCapacity = MAX(SWHEAP_MIN_RESIZE + self->capacity, - (Int4) (SWHEAP_RESIZE_FACTOR * self->capacity)); - self->heapArray = (SWheapRecord *) - realloc(self->heapArray, (newCapacity + 1) * sizeof(SWheapRecord)); - self->capacity = newCapacity; - } - /* end if the heap must be resized */ - heapRecord = &self->heapArray[++self->n]; - heapRecord->bestEvalue = eValue; - heapRecord->bestScore = score; - heapRecord->theseAlignments = alignments; - heapRecord->subject_index = subject_index; - - SWheapifyUp(self->heapArray, self->n, self->n); + *pbestEvalue = DBL_MAX; + *pbestScore = 0; + if (do_link_hsps) { + BLAST_LinkHsps(program_number, hsp_list, + queryInfo, subject_length, + sbp, hitParams->link_hsp_params, TRUE); } else { - /* Some set of alignments must be discarded; discardedAlignments - * will hold a pointer to these alignments. 
*/ - BlastHSPList * discardedAlignments = NULL; - SWheapRecord heapRecord; /* Candidate record for insertion */ - - heapRecord.bestEvalue = eValue; - heapRecord.bestScore = score; - heapRecord.theseAlignments = alignments; - heapRecord.subject_index = subject_index; - - if(SWheapRecordCompare(&self->heapArray[1], &heapRecord)) { - /* The new record should be inserted, and the largest - * element currently in the heap may be disgarded */ - discardedAlignments = self->heapArray[1].theseAlignments; - memcpy(&self->heapArray[1], &heapRecord, sizeof(SWheapRecord)); - } else { - discardedAlignments = heapRecord.theseAlignments; - } - SWheapifyDown(self->heapArray, 1, self->n); - - if(discardedAlignments != NULL) { - Blast_HSPListFree(discardedAlignments); - } - /* end while there are discarded alignments that have not been freed */ + Blast_HSPListGetEvalues(queryInfo, hsp_list, TRUE, sbp, + 0.0, /* use a non-zero gap decay only when + linking hsps */ + 1.0); /* Use scaling factor equal to + 1, because both scores and + Lambda are scaled, so they + will cancel each other. */ } - /* end else some set of alignments must be discarded */ - - self->worstEvalue = self->heapArray[1].bestEvalue; - KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n)); - } - /* end else "self" is currently a heap. */ -} - - -/** - * Return true if only matches with evalue <= self->ecutoff may be - * inserted. 
- * - * @param self a SWheap - */ -static Boolean -SWheapWillAcceptOnlyBelowCutoff(SWheap * self) -{ - return self->n >= self->heapThreshold && self->worstEvalue <= self->ecutoff; -} - - -/** Initialize a new SWheap; parameters to this function correspond - * directly to fields in the SWheap */ -static void -SWheapInitialize(SWheap * self, - Int4 heapThreshold, - double ecutoff) -{ - self->n = 0; - self->heapThreshold = heapThreshold; - self->ecutoff = ecutoff; - self->heapArray = NULL; - self->capacity = heapThreshold; - self->worstEvalue = 0; - /* Begin life as a list */ - self->array = - (SWheapRecord *) malloc((self->capacity + 1) * sizeof(SWheapRecord)); -} - - -/** - * Release the storage associated with the fields of a SWheap. Don't - * delete the SWheap structure itself. - * - * @param self SWheap whose storage will be released - */ -static void -SWheapRelease(SWheap * self) -{ - if(self->heapArray) sfree(self->heapArray); - if(self->array) sfree(self->array); - - self->n = self->capacity = self->heapThreshold = 0; - self->heapArray = NULL; self->array = NULL; -} - - -/** - * Remove and return the element in the SWheap with largest (worst) evalue - * - * @param self a SWheap - */ -static BlastHSPList * -SWheapPop(SWheap * self) -{ - BlastHSPList * results = NULL; /* the list of HSPs to be returned */ - - ConvertToHeap(self); - if(self->n > 0) { /* The heap is not empty */ - SWheapRecord *first, *last; /* The first and last elements of the - * array that represents the heap. 
*/ - first = &self->heapArray[1]; - last = &self->heapArray[self->n]; - - results = first->theseAlignments; - if( --self->n > 0 ) { - /* The heap is still not empty */ - memcpy(first, last, sizeof(SWheapRecord)); - - SWheapifyDown(self->heapArray, 1, self->n); + Blast_HSPListReapByEvalue(hsp_list, hitParams->options); + if (hsp_list->hspcnt > 0) { + *pbestEvalue = hsp_list->best_evalue; + *pbestScore = hsp_list->hsp_array[0]->score; } - } - - KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n)); - - return results; -} - - -/** - * Convert a SWheap to a flat list of SeqAligns. Note that there may - * be more than one alignment per element in the heap. The new list - * preserves the order of the SeqAligns associated with each - * HeapRecord. (@todo this function is named as it is for compatibility with - * kappa.c, rename in the future) - * - * @param self a SWheap - * @param results BLAST core external results structure (pre-SeqAlign) - * [out] - * @param hitlist_size size of each list in the results structure above [in] - */ -static void -SWheapToFlatList(SWheap * self, BlastHSPResults * results, Int4 hitlist_size) -{ - BlastHSPList* hsp_list; - BlastHitList* hitlist = - results->hitlist_array[0] = Blast_HitListNew(hitlist_size); - - hsp_list = NULL; - while(NULL != (hsp_list = SWheapPop(self))) { - Blast_HitListUpdate(hitlist, hsp_list); - } } -/** keeps one row of the Smith-Waterman matrix - */ -typedef struct SWpairs { - Int4 noGap; /**< @todo document me */ - Int4 gapExists; /**< @todo document me */ -} SWpairs; - - -/** - * computes Smith-Waterman local alignment score and returns the - * evalue - * - * @param matchSeq is a database sequence matched by this query [in] - * @param matchSeqLength is the length of matchSeq in amino acids [in] - * @param query is the input query sequence [in] - * @param queryLength is the length of query [in] - * @param matrix is the position-specific matrix associated with - * query [in] - * @param gapOpen is the cost of 
opening a gap [in] - * @param gapExtend is the cost of extending an existing gap by 1 - * position [in] - * @param matchSeqEnd returns the final position in the matchSeq of an - * optimal local alignment [in] - * @param queryEnd returns the final position in query of an optimal - * local alignment [in]. matchSeqEnd and queryEnd can - * be used to run the local alignment - * in reverse to find optimal starting positions [in] - * @param score is used to pass back the optimal score [in] - * @param kbp holds the Karlin-Altschul parameters [in] - * @param effSearchSpace effective search space [in] - * @param positionSpecific determines whether matrix is position - * specific or not [in] - * @return the expect value of the alignment - */ static double -BLbasicSmithWatermanScoreOnly(Uint1 * matchSeq, - Int4 matchSeqLength, Uint1 *query, Int4 queryLength, - Int4 **matrix, - Int4 gapOpen, Int4 gapExtend, Int4 *matchSeqEnd, Int4 *queryEnd, - Int4 *score, - Blast_KarlinBlk* kbp, Int8 effSearchSpace, Boolean positionSpecific) +s_CalcLambda(double probs[], int min_score, int max_score, double lambda0) { - - Int4 bestScore; /*best score seen so far*/ - Int4 newScore; /* score of next entry*/ - Int4 bestMatchSeqPos, bestQueryPos; /*position ending best score in - matchSeq and query sequences*/ - SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix - overwrite old row with new row*/ - Int4 *matrixRow; /*one row of score matrix*/ - Int4 newGapCost; /*cost to have a gap of one character*/ - Int4 prevScoreNoGapMatchSeq; /*score one row and column up - with no gaps*/ - Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/ - Int4 continueGapScore; /*score for continuing a gap in matchSeq*/ - Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/ - double returnEvalue; /*e-value to return*/ - - - scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs)); - bestMatchSeqPos = 0; - bestQueryPos = 0; - bestScore = 0; - newGapCost = gapOpen 
+ gapExtend; - for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - scoreVector[matchSeqPos].noGap = 0; - scoreVector[matchSeqPos].gapExists = -(gapOpen); - } - for(queryPos = 0; queryPos < queryLength; queryPos++) { - if (positionSpecific) - matrixRow = matrix[queryPos]; - else - matrixRow = matrix[query[queryPos]]; - newScore = 0; - prevScoreNoGapMatchSeq = 0; - prevScoreGapMatchSeq = -(gapOpen); - for(matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - /*testing scores with a gap in matchSeq, either starting a new - gap or extending an existing gap*/ - if ((newScore = newScore - newGapCost) > - (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) - prevScoreGapMatchSeq = newScore; - /*testing scores with a gap in query, either starting a new - gap or extending an existing gap*/ - if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > - (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend)) - continueGapScore = newScore; - /*compute new score extending one position in matchSeq and query*/ - newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; - if (newScore < 0) - newScore = 0; /*Smith-Waterman locality condition*/ - /*test two alternatives*/ - if (newScore < prevScoreGapMatchSeq) - newScore = prevScoreGapMatchSeq; - if (newScore < continueGapScore) - newScore = continueGapScore; - prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; - scoreVector[matchSeqPos].noGap = newScore; - scoreVector[matchSeqPos].gapExists = continueGapScore; - if (newScore > bestScore) { - bestScore = newScore; - bestQueryPos = queryPos; - bestMatchSeqPos = matchSeqPos; - } - } - } - sfree(scoreVector); - if (bestScore < 0) - bestScore = 0; - *matchSeqEnd = bestMatchSeqPos; - *queryEnd = bestQueryPos; - *score = bestScore; - returnEvalue = BLAST_KarlinStoE_simple(bestScore,kbp, effSearchSpace); - return(returnEvalue); -} - - -/** - * computes where optimal Smith-Waterman local alignment starts given - * the 
ending positions and score matchSeqEnd and queryEnd can be used - * to run the local alignment in reverse to find optimal starting - * positions these are passed back in matchSeqStart and queryStart the - * optimal score is passed in to check when it has been reached going - * backwards the score is also returned - * @param matchSeq is a database sequence matched by this query [in] - * @param matchSeqLength is the length of matchSeq in amino acids [in] - * @param query is the input query sequence [in] - * @param matrix is the position-specific matrix associated with - * query or the standard matrix [in] - * @param gapOpen is the cost of opening a gap [in] - * @param gapExtend is the cost of extending an existing gap by 1 - * position [in] - * @param matchSeqEnd is the final position in the matchSeq of an optimal - * local alignment [in] - * @param queryEnd is the final position in query of an optimal - * local alignment [in] - * @param score optimal score to be obtained [in] - * @param matchSeqStart starting point of optimal alignment [out] - * @param queryStart starting point of optimal alignment [out] - * @param positionSpecific determines whether matrix is position specific - * or not - */ -static Int4 -BLSmithWatermanFindStart(Uint1 * matchSeq, - Int4 matchSeqLength, Uint1 *query, Int4 **matrix, - Int4 gapOpen, Int4 gapExtend, Int4 matchSeqEnd, Int4 queryEnd, Int4 score, - Int4 *matchSeqStart, Int4 *queryStart, Boolean positionSpecific) -{ - - Int4 bestScore; /*best score seen so far*/ - Int4 newScore; /* score of next entry*/ - Int4 bestMatchSeqPos, bestQueryPos; /*position starting best score in - matchSeq and database sequences*/ - SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix - overwrite old row with new row*/ - Int4 *matrixRow; /*one row of score matrix*/ - Int4 newGapCost; /*cost to have a gap of one character*/ - Int4 prevScoreNoGapMatchSeq; /*score one row and column up - with no gaps*/ - Int4 prevScoreGapMatchSeq; /*score if a gap 
already started in matchSeq*/ - Int4 continueGapScore; /*score for continuing a gap in query*/ - Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/ - - scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs)); - bestMatchSeqPos = 0; - bestQueryPos = 0; - bestScore = 0; - newGapCost = gapOpen + gapExtend; - for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - scoreVector[matchSeqPos].noGap = 0; - scoreVector[matchSeqPos].gapExists = -(gapOpen); - } - for(queryPos = queryEnd; queryPos >= 0; queryPos--) { - if (positionSpecific) - matrixRow = matrix[queryPos]; - else - matrixRow = matrix[query[queryPos]]; - newScore = 0; - prevScoreNoGapMatchSeq = 0; - prevScoreGapMatchSeq = -(gapOpen); - for(matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) { - /*testing scores with a gap in matchSeq, either starting a new - gap or extending an existing gap*/ - if ((newScore = newScore - newGapCost) > - (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) - prevScoreGapMatchSeq = newScore; - /*testing scores with a gap in query, either starting a new - gap or extending an existing gap*/ - if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > - (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend)) - continueGapScore = newScore; - /*compute new score extending one position in matchSeq and query*/ - newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; - if (newScore < 0) - newScore = 0; /*Smith-Waterman locality condition*/ - /*test two alternatives*/ - if (newScore < prevScoreGapMatchSeq) - newScore = prevScoreGapMatchSeq; - if (newScore < continueGapScore) - newScore = continueGapScore; - prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; - scoreVector[matchSeqPos].noGap = newScore; - scoreVector[matchSeqPos].gapExists = continueGapScore; - if (newScore > bestScore) { - bestScore = newScore; - bestQueryPos = queryPos; - bestMatchSeqPos = matchSeqPos; - } - if (bestScore >= score) - 
break; - } - if (bestScore >= score) - break; - } - sfree(scoreVector); - if (bestScore < 0) - bestScore = 0; - *matchSeqStart = bestMatchSeqPos; - *queryStart = bestQueryPos; - return(bestScore); -} - - -/** - * computes Smith-Waterman local alignment score and returns the - * evalue assuming some positions are forbidden matchSeqEnd and query - * can be used to run the local alignment in reverse to find optimal - * starting positions - * @param matchSeq is the matchSeq sequence [in] - * @param matchSeqLength is the length of matchSeq in amino acids [in] - * @param query is the input query sequence [in] - * @param queryLength is the length of query [in] - * @param matrix is either the position-specific matrix associated - * with query or the standard matrix [in] - * @param gapOpen is the cost of opening a gap [in] - * @param gapExtend is the cost of extending an existing gap by 1 - * position [in] - * @param matchSeqEnd returns the final position in the matchSeq of an - * optimal local alignment [in] - * @param queryEnd returns the final position in query of an optimal - * local alignment [in] - * @param score is used to pass back the optimal score [out] - * @param kbp holds the Karlin-Altschul parameters [in] - * @param effSearchSpace effective search space [in] - * @param numForbidden number of forbidden ranges [in] - * @param forbiddenRanges lists areas that should not be aligned [in] - * @param positionSpecific determines whether matrix is position specific - * or not [in] - */ -static double -BLspecialSmithWatermanScoreOnly(Uint1 * matchSeq, - Int4 matchSeqLength, Uint1 *query, Int4 queryLength, Int4 **matrix, - Int4 gapOpen, Int4 gapExtend, - Int4 *matchSeqEnd, Int4 *queryEnd, Int4 *score, - Blast_KarlinBlk* kbp, Int8 effSearchSpace, - Int4 *numForbidden, Int4 ** forbiddenRanges, Boolean positionSpecific) -{ - - Int4 bestScore; /*best score seen so far*/ - Int4 newScore; /* score of next entry*/ - Int4 bestMatchSeqPos, bestQueryPos; /*position ending best 
score in - matchSeq and database sequences*/ - SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix - overwrite old row with new row*/ - Int4 *matrixRow; /*one row of score matrix*/ - Int4 newGapCost; /*cost to have a gap of one character*/ - Int4 prevScoreNoGapMatchSeq; /*score one row and column up - with no gaps*/ - Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/ - Int4 continueGapScore; /*score for continuing a gap in query*/ - Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/ - double returnEvalue; /*e-value to return*/ - Boolean forbidden; /*is this position forbidden?*/ - Int4 f; /*index over forbidden positions*/ - - - scoreVector = (SWpairs *) calloc(1, matchSeqLength * sizeof(SWpairs)); - bestMatchSeqPos = 0; - bestQueryPos = 0; - bestScore = 0; - newGapCost = gapOpen + gapExtend; - for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - scoreVector[matchSeqPos].noGap = 0; - scoreVector[matchSeqPos].gapExists = -(gapOpen); - } - for(queryPos = 0; queryPos < queryLength; queryPos++) { - if (positionSpecific) - matrixRow = matrix[queryPos]; - else - matrixRow = matrix[query[queryPos]]; - newScore = 0; - prevScoreNoGapMatchSeq = 0; - prevScoreGapMatchSeq = -(gapOpen); - for(matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - /*testing scores with a gap in matchSeq, either starting a new - gap or extending an existing gap*/ - if ((newScore = newScore - newGapCost) > - (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) - prevScoreGapMatchSeq = newScore; - /*testing scores with a gap in query, either starting a new - gap or extending an existing gap*/ - if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > - (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend)) - continueGapScore = newScore; - /*compute new score extending one position in matchSeq and query*/ - forbidden = FALSE; - for(f = 0; f < numForbidden[queryPos]; f++) { - if ((matchSeqPos >= 
forbiddenRanges[queryPos][2 * f]) && - (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) { - forbidden = TRUE; - break; - } - } - if (forbidden) - newScore = BLAST_SCORE_MIN; - else - newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; - if (newScore < 0) - newScore = 0; /*Smith-Waterman locality condition*/ - /*test two alternatives*/ - if (newScore < prevScoreGapMatchSeq) - newScore = prevScoreGapMatchSeq; - if (newScore < continueGapScore) - newScore = continueGapScore; - prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; - scoreVector[matchSeqPos].noGap = newScore; - scoreVector[matchSeqPos].gapExists = continueGapScore; - if (newScore > bestScore) { - bestScore = newScore; - bestQueryPos = queryPos; - bestMatchSeqPos = matchSeqPos; - - } - } - } - sfree(scoreVector); - if (bestScore < 0) - bestScore = 0; - *matchSeqEnd = bestMatchSeqPos; - *queryEnd = bestQueryPos; - *score = bestScore; - returnEvalue = BLAST_KarlinStoE_simple(bestScore,kbp, effSearchSpace); - return(returnEvalue); -} - - -/** - * computes where optimal Smith-Waterman local alignment starts given - * the ending positions. 
matchSeqEnd and queryEnd can be used to run - * the local alignment in reverse to find optimal starting positions - * these are passed back in matchSeqStart and queryStart the optimal - * score is passed in to check when it has been reached going - * backwards the score is also returned - * @param matchSeq is the matchSeq sequence [in] - * @param matchSeqLength is the length of matchSeq in amino acids [in] - * @param query is the sequence corresponding to some matrix - * profile [in] - * @param matrix is the position-specific matrix associated with - * query [in] - * @param gapOpen is the cost of opening a gap [in] - * @param gapExtend is the cost of extending an existing gap by 1 - * position [in] - * @param matchSeqEnd is the final position in the matchSeq of an optimal - * local alignment [in] - * @param queryEnd is the final position in query of an optimal - * local alignment [in] - * @param score optimal score is passed in to check when it has - * been reached going backwards [in] - * @param matchSeqStart optimal starting point [in] - * @param queryStart optimal starting point [in] - * @param numForbidden array of regions not to be aligned. [in] - * @param numForbidden array of regions not to be aligned. [in] - * @param forbiddenRanges regions not to be aligned. 
[in] - * @param positionSpecific determines whether matrix is position specific - * or not - * @return the score found - */ -static Int4 BLspecialSmithWatermanFindStart(Uint1 * matchSeq, - Int4 matchSeqLength, Uint1 *query, Int4 **matrix, - Int4 gapOpen, Int4 gapExtend, Int4 matchSeqEnd, Int4 queryEnd, Int4 score, - Int4 *matchSeqStart, Int4 *queryStart, Int4 *numForbidden, - Int4 ** forbiddenRanges, Boolean positionSpecific) -{ - - Int4 bestScore; /*best score seen so far*/ - Int4 newScore; /* score of next entry*/ - Int4 bestMatchSeqPos, bestQueryPos; /*position starting best score in - matchSeq and database sequences*/ - SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix - overwrite old row with new row*/ - Int4 *matrixRow; /*one row of score matrix*/ - Int4 newGapCost; /*cost to have a gap of one character*/ - Int4 prevScoreNoGapMatchSeq; /*score one row and column up - with no gaps*/ - Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/ - Int4 continueGapScore; /*score for continuing a gap in query*/ - Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/ - Boolean forbidden; /*is this position forbidden?*/ - Int4 f; /*index over forbidden positions*/ - - scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs)); - bestMatchSeqPos = 0; - bestQueryPos = 0; - bestScore = 0; - newGapCost = gapOpen + gapExtend; - for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) { - scoreVector[matchSeqPos].noGap = 0; - scoreVector[matchSeqPos].gapExists = -(gapOpen); - } - for(queryPos = queryEnd; queryPos >= 0; queryPos--) { - if (positionSpecific) - matrixRow = matrix[queryPos]; - else - matrixRow = matrix[query[queryPos]]; - newScore = 0; - prevScoreNoGapMatchSeq = 0; - prevScoreGapMatchSeq = -(gapOpen); - for(matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) { - /*testing scores with a gap in matchSeq, either starting a new - gap or extending an existing gap*/ - if ((newScore = newScore - 
newGapCost) > - (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend)) - prevScoreGapMatchSeq = newScore; - /*testing scores with a gap in query, either starting a new - gap or extending an existing gap*/ - if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) > - (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend)) - continueGapScore = newScore; - /*compute new score extending one position in matchSeq and query*/ - forbidden = FALSE; - for(f = 0; f < numForbidden[queryPos]; f++) { - if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) && - (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) { - forbidden = TRUE; - break; - } - } - if (forbidden) - newScore = BLAST_SCORE_MIN; - else - newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]]; - if (newScore < 0) - newScore = 0; /*Smith-Waterman locality condition*/ - /*test two alternatives*/ - if (newScore < prevScoreGapMatchSeq) - newScore = prevScoreGapMatchSeq; - if (newScore < continueGapScore) - newScore = continueGapScore; - prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; - scoreVector[matchSeqPos].noGap = newScore; - scoreVector[matchSeqPos].gapExists = continueGapScore; - if (newScore > bestScore) { - bestScore = newScore; - bestQueryPos = queryPos; - bestMatchSeqPos = matchSeqPos; - } - if (bestScore >= score) - break; - } - if (bestScore >= score) - break; - } - sfree(scoreVector); - if (bestScore < 0) - bestScore = 0; - *matchSeqStart = bestMatchSeqPos; - *queryStart = bestQueryPos; - return(bestScore); -} - - -/** - * Kappa_SequenceData - represents a string of amino acids or nucleotides - */ -typedef struct Kappa_SequenceData { - Uint1 *data; /**< amino acid or nucleotide data */ - Int4 length; /**< the length of data. For amino acid data - &data[-1] is a valid address and - data[-1] == 0. */ - Uint1 *buffer; /**< if non-nil, points to memory that - must be freed when this instance of - Kappa_SequenceData is deleted. 
*/ -} Kappa_SequenceData; - - -/** Release the data associated with this object. */ -static void -Kappa_SequenceDataRelease(Kappa_SequenceData * self) -{ - if(self->buffer) sfree(self->buffer); - - self->data = NULL; - self->buffer = NULL; -} - - -/** - * An instance of Kappa_ForbiddenRanges is used by the Smith-Waterman - * algorithm to represent ranges in the database that are not to be - * aligned. - */ -typedef struct Kappa_ForbiddenRanges { - Boolean isEmpty; /**< True if there are no forbidden ranges */ - Int4 *numForbidden; /**< how many forbidden ranges at each db - position */ - Int4 **ranges; /**< forbidden ranges for each database - position */ - Int4 queryLength; /**< length of the query sequence */ -} Kappa_ForbiddenRanges; - - -/** - * Initialize a new, empty Kappa_ForbiddenRanges - * - * @param self object to be initialized - * @param queryLength the length of the query - */ -static void -Kappa_ForbiddenRangesInitialize( - Kappa_ForbiddenRanges * self, - Int4 queryLength) -{ - Int4 f; - self->queryLength = queryLength; - self->numForbidden = (Int4 *) malloc(queryLength * sizeof(Int4)); - self->ranges = (Int4 **) malloc(queryLength * sizeof(Int4 *)); - self->isEmpty = TRUE; - - for(f = 0; f < queryLength; f++) { - self->numForbidden[f] = 0; - self->ranges[f] = (Int4 *) malloc(2 * sizeof(Int4)); - self->ranges[f][0] = 0; - self->ranges[f][1] = 0; - } -} - - -/** Reset self to be empty */ -static void -Kappa_ForbiddenRangesClear(Kappa_ForbiddenRanges * self) -{ - Int4 f; - for(f = 0; f < self->queryLength; f++) { - self->numForbidden[f] = 0; - } - self->isEmpty = TRUE; -} - - -/** Add some ranges to self - * @param self an instance of Kappa_ForbiddenRanges [in][out] - * @param queryStart start of the alignment in the query sequence - * @param queryAlignmentExtent length of the alignment in the query sequence - * @param matchStart start of the alignment in the subject sequence - * @param matchAlignmentExtent length of the alignment in the - * subject 
sequence - */ -static void -Kappa_ForbiddenRangesPush( - Kappa_ForbiddenRanges * self, - Int4 queryStart, - Int4 queryAlignmentExtent, - Int4 matchStart, - Int4 matchAlignmentExtent) -{ - Int4 f; - for(f = queryStart; f < (queryStart + queryAlignmentExtent); f++) { - Int4 last = 2 * self->numForbidden[f]; - if(0 != last) { /* we must resize the array */ - self->ranges[f] = - (Int4 *) realloc(self->ranges[f], (last + 2) * sizeof(Int4)); + int i, n; + double avg; + Blast_ScoreFreq freq; + + n = max_score - min_score + 1; + avg = 0.0; + for (i = 0; i < n; i++) { + avg += (min_score + i) * probs[i]; } - self->ranges[f][last] = matchStart; - self->ranges[f][last + 1] = matchStart + matchAlignmentExtent; - - self->numForbidden[f]++; - } - self->isEmpty = FALSE; -} - - -/** - * Release the storage associated with the fields of self, but do not - * delete self - * - * @param self an instance of Kappa_ForbiddenRanges [in][out] - */ -static void -Kappa_ForbiddenRangesRelease(Kappa_ForbiddenRanges * self) -{ - Int4 f; - for(f = 0; f < self->queryLength; f++) sfree(self->ranges[f]); - - sfree(self->ranges); self->ranges = NULL; - sfree(self->numForbidden); self->numForbidden = NULL; + freq.score_min = min_score; + freq.score_max = max_score; + freq.obs_min = min_score; + freq.obs_max = max_score; + freq.sprob0 = probs; + freq.sprob = &probs[-min_score]; + freq.score_avg = avg; + + return Blast_KarlinLambdaNR(&freq, lambda0); } -/** - * Calls BLbasicSmithWatermanScoreOnly if forbiddenRanges is empty and - * calls BLspecialSmithWatermanScoreOnly otherwise. This routine has - * the same parameters and return value as - * BLspecialSmithWatermanScoreOnly. 
- */ -static double -SmithWatermanScoreOnly(Kappa_SequenceData * subject, - Kappa_SequenceData * query, - Int4 **matrix, - Int4 gapOpen, - Int4 gapExtend, - Int4 *matchSeqEnd, - Int4 *queryEnd, - Int4 *score, - Blast_KarlinBlk * kbp, - Int8 effSearchSpace, - Boolean positionSpecific, - Kappa_ForbiddenRanges * forbiddenRanges ) -{ - if( forbiddenRanges->isEmpty ) { - return - BLbasicSmithWatermanScoreOnly(subject->data, subject->length, - query ->data, query ->length, - matrix, gapOpen, gapExtend, matchSeqEnd, - queryEnd, score, kbp, effSearchSpace, - positionSpecific); - } else { - return - BLspecialSmithWatermanScoreOnly(subject->data, subject->length, - query ->data, query ->length, - matrix, gapOpen, gapExtend, matchSeqEnd, - queryEnd, score, kbp, effSearchSpace, - forbiddenRanges->numForbidden, - forbiddenRanges->ranges, - positionSpecific); - } -} - - -/** - * Calls BLSmithWatermanFindStart if forbiddenRanges is empty and - * calls BLspecialSmithWatermanFindStart otherwise. This routine has - * the same parameters and return value as - * BLspecialSmithWatermanFindStart. 
- */ -static Int4 -SmithWatermanFindStart(Kappa_SequenceData * subject, - Kappa_SequenceData * query, - Int4 **matrix, - Int4 gapOpen, - Int4 gapExtend, - Int4 matchSeqEnd, - Int4 queryEnd, - Int4 score, - Int4 *matchSeqStart, - Int4 *queryStart, - Boolean positionSpecific, - Kappa_ForbiddenRanges * forbiddenRanges) -{ - if( forbiddenRanges->isEmpty ) { - return - BLSmithWatermanFindStart(subject->data, subject->length, - query ->data, - matrix, gapOpen, gapExtend, - matchSeqEnd, queryEnd, score, - matchSeqStart, queryStart, - positionSpecific); - } else { - return - BLspecialSmithWatermanFindStart(subject->data, subject->length, - query ->data, - matrix, gapOpen, gapExtend, - matchSeqEnd, queryEnd, score, - matchSeqStart, queryStart, - forbiddenRanges->numForbidden, - forbiddenRanges->ranges, - positionSpecific); - } -} - - -/** - * @param matrix is a position-specific score matrix with matrixLength - * positions - * @param subjectProbArray is an array containing the probability of - * occurrence of each residue in the subject - * @param queryProbArray is an array containing the probability of - * occurrence of each residue in the query - * @param scoreArray is an array of probabilities for each score that is - * to be used as a field in return_sfp - * @param return_sfp is a the structure to be filled in and returned - * @param range is the size of scoreArray and is an upper bound on - * the difference between maximum score and minimum - * score in the matrix - * the routine posfillSfp computes the probability of each score - * weighted by the probability of each query residue and fills those - * probabilities into scoreArray and puts scoreArray as a field in - * that in the structure that is returned for indexing convenience the - * field storing scoreArray points to the entry for score 0, so that - * referring to the -k index corresponds to score -k - */ -static Blast_ScoreFreq* notposfillSfp(Int4 **matrix, double *subjectProbArray, double *queryProbArray, 
double *scoreArray, Blast_ScoreFreq* return_sfp, Int4 range) -{ - Int4 minScore, maxScore; /*observed minimum and maximum scores*/ - Int4 i,j,k; /* indices */ - - minScore = maxScore = 0; - - for(i = 0; i < BLASTAA_SIZE; i++) { - for(j = 0 ; j < PRO_TRUE_ALPHABET_SIZE; j++) { - k = trueCharPositions[j]; - if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore)) - minScore = matrix[i][k]; - if (matrix[i][k] > maxScore) - maxScore = matrix[i][k]; - } - } - return_sfp->obs_min = minScore; - return_sfp->obs_max = maxScore; - for (i = 0; i < range; i++) - scoreArray[i] = 0.0; - return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/ - for(i = 0; i < BLASTAA_SIZE; i++) { - for (j = 0; j < PRO_TRUE_ALPHABET_SIZE; j++) { - k = trueCharPositions[j]; - if(matrix[i][k] >= minScore) { - return_sfp->sprob[matrix[i][k]] += (queryProbArray[i] * subjectProbArray[k]); - } - } - } - return_sfp->score_avg = 0; - for(i = minScore; i <= maxScore; i++) - return_sfp->score_avg += i * return_sfp->sprob[i]; - return(return_sfp); -} - - -/** - * - * @param matrix is a position-specific score matrix with - * matrixLength positions - * @param matrixLength length of the position-specific matrix above - * @param subjectProbArray is an array containing the probability of - * occurrence of each residue in the matching - * sequence often called the subject - * @param scoreArray is an array of probabilities for each score - * that is to be used as a field in return_sfp - * @param return_sfp is a the structure to be filled in and returned - * range is the size of scoreArray and is an upper - * bound on the difference between maximum score - * and minimum score in the matrix - * @param range is the size of scoreArray and is an upper bound on - * the difference between maximum score and minimum - * score in the matrix - * the routine posfillSfp computes the probability of each score - * weighted by the probability of each query residue and fills those - * probabilities into 
scoreArray and puts scoreArray as a field in - * that in the structure that is returned for indexing convenience the - * field storing scoreArray points to the entry for score 0, so that - * referring to the -k index corresponds to score -k - */ -static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *subjectProbArray, double *scoreArray, Blast_ScoreFreq* return_sfp, Int4 range) -{ - Int4 minScore, maxScore; /*observed minimum and maximum scores*/ - Int4 i,j,k; /* indices */ - double onePosFrac; /*1/matrix length as a double*/ - - minScore = maxScore = 0; - - for(i = 0; i < matrixLength; i++) { - for(j = 0 ; j < PRO_TRUE_ALPHABET_SIZE; j++) { - k = trueCharPositions[j]; - if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore)) - minScore = matrix[i][k]; - if (matrix[i][k] > maxScore) - maxScore = matrix[i][k]; - } - } - return_sfp->obs_min = minScore; - return_sfp->obs_max = maxScore; - for (i = 0; i < range; i++) - scoreArray[i] = 0.0; - return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/ - onePosFrac = 1.0/ ((double) matrixLength); - for(i = 0; i < matrixLength; i++) { - for (j = 0; j < PRO_TRUE_ALPHABET_SIZE; j++) { - k = trueCharPositions[j]; - if(matrix[i][k] >= minScore) { - return_sfp->sprob[matrix[i][k]] += (onePosFrac * subjectProbArray[k]); - } - } - } - return_sfp->score_avg = 0; - for(i = minScore; i <= maxScore; i++) - return_sfp->score_avg += i * return_sfp->sprob[i]; - return(return_sfp); -} - /** Return the a matrix of the frequency ratios that underlie the * score matrix being used on this pass. The returned matrix * is position-specific, so if we are in the first pass, use @@ -1675,7 +276,7 @@ static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *sub * score matrix used. numPositions is the length of the query; * startNumerator is the matrix of frequency ratios as stored * in posit.h. 
It needs to be divided by the frequency of the - * second character to get the intended ratio + * second character to get the intended ratio * @param sbp statistical information for blast [in] * @param query the query sequence [in] * @param matrixName name of the underlying matrix [in] @@ -1684,82 +285,51 @@ static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *sub * second character to get the intended ratio [in] * @param numPositions length of the query [in] */ -static double **getStartFreqRatios(BlastScoreBlk* sbp, - Uint1* query, - const char *matrixName, - double **startNumerator, - Int4 numPositions) -{ - double** returnRatios; /*frequency ratios to start investigating each pair*/ - double *standardProb; /*probabilities of each letter*/ - Int4 i,j; /* Loop indices. */ - SFreqRatios* freqRatios=NULL; /* frequency ratio container for given matrix */ - const double kPosEpsilon = 0.0001; - - returnRatios = (double**) _PSIAllocateMatrix(numPositions, - BLASTAA_SIZE, - sizeof(double)); - - freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName); - if (freqRatios == NULL) - return NULL; - - for(i = 0; i < numPositions; i++) { - for(j = 0; j < BLASTAA_SIZE; j++) { - returnRatios[i][j] = freqRatios->data[query[i]][j]; - } - } - - freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios); - - standardProb = BLAST_GetStandardAaProbabilities(); - - /*reverse multiplication done in posit.c*/ - for(i = 0; i < numPositions; i++) - for(j = 0; j < BLASTAA_SIZE; j++) - if ((standardProb[query[i]] > kPosEpsilon) && (standardProb[j] > kPosEpsilon) && - (j != AMINOACID_TO_NCBISTDAA['X']) && (j != AMINOACID_TO_NCBISTDAA['*']) - && (startNumerator[i][j] > kPosEpsilon)) - returnRatios[i][j] = startNumerator[i][j]/standardProb[j]; - - sfree(standardProb); - - return(returnRatios); -} - - -/** - * take every entry of startFreqRatios that is not corresponding to a - * score of BLAST_SCORE_MIN and take its log, divide by Lambda and - * multiply by LambdaRatio then 
round to the nearest integer and put - * the result in the corresponding entry of matrix. startMatrix and - * matrix have dimensions numPositions X BLASTAA_SIZE - * - * @param matrix preallocated matrix to be filled in [out] - * @param startFreqRatios frequency ratios of starting matrix [in] - * @param numPositions length of query [in] - * @param Lambda A Karlin-Altschul parameter. [in] - * @param LambdaRatio ratio of correct Lambda to it's original value [in] -*/ -static void scaleMatrix(Int4 **matrix, - double **startFreqRatios, Int4 numPositions, - double Lambda, double LambdaRatio) +static void +s_GetStartFreqRatios(double ** returnRatios, + Uint1 * query, + const char *matrixName, + double **startNumerator, + Int4 numPositions, + Boolean positionSpecific) { - Int4 p, c; /*indices over positions and characters*/ - double temp; /*intermediate term in computation*/ - - for (p = 0; p < numPositions; p++) { - for (c = 0; c < BLASTAA_SIZE; c++) { - if (0.0 == startFreqRatios[p][c]) { - matrix[p][c] = BLAST_SCORE_MIN; - } else { - temp = log(startFreqRatios[p][c]); - temp = temp/Lambda; - temp = temp * LambdaRatio; - matrix[p][c] = BLAST_Nint(temp); - } - } - } + Int4 i,j; + SFreqRatios * stdFreqRatios = NULL; + const double kPosEpsilon = 0.0001; + + stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName); + if (positionSpecific) { + for (i = 0; i < numPositions; i++) { + for (j = 0; j < BLASTAA_SIZE; j++) { + returnRatios[i][j] = stdFreqRatios->data[query[i]][j]; + } + } + } else { + for (i = 0; i < BLASTAA_SIZE; i++) { + for (j = 0; j < BLASTAA_SIZE; j++) { + returnRatios[i][j] = stdFreqRatios->data[i][j]; + } + } + } + stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios); + + if (positionSpecific) { + double *standardProb; /*probabilities of each letter*/ + standardProb = BLAST_GetStandardAaProbabilities(); + + /*reverse multiplication done in posit.c*/ + for (i = 0; i < numPositions; i++) { + for (j = 0; j < BLASTAA_SIZE; j++) { + if 
((standardProb[query[i]] > kPosEpsilon) && + (standardProb[j] > kPosEpsilon) && + (j != eStopChar) && (j != eXchar) && + (startNumerator[i][j] > kPosEpsilon)) { + returnRatios[i][j] = startNumerator[i][j]/standardProb[j]; + } + } + } + sfree(standardProb); + } } @@ -1771,42 +341,6 @@ static void scaleMatrix(Int4 **matrix, /** - * Compute a scaled up version of the standard matrix encoded by - * matrix name. Standard matrices are in half-bit units. - * - * @param matrix preallocated matrix [in][out] - * @param matrixName name of matrix (e.g., BLOSUM62, PAM30). [in] - * @param Lambda A Karlin-Altschul parameter. [in] - */ -static void -computeScaledStandardMatrix( - Int4 **matrix, - char *matrixName, - double Lambda) -{ - int i,j; /*loop indices*/ - SFreqRatios* freqRatios=NULL; /* frequency ratios for the matrix */ - - freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName); - ASSERT(freqRatios); - if (freqRatios == NULL) - return; - - for(i = 0; i < BLASTAA_SIZE; i++) - for(j = 0; j < BLASTAA_SIZE; j++) { - if(0.0 == freqRatios->data[i][j]) - matrix[i][j] = BLAST_SCORE_MIN; - else { - double temp = log(freqRatios->data[i][j])/Lambda; - matrix[i][j] = BLAST_Nint(temp); - } - } - - freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios); -} - - -/** * produce a scaled-up version of the position-specific matrix * starting from posFreqs * @@ -1821,20 +355,20 @@ computeScaledStandardMatrix( * @param queryLength Length of the query sequence above [in] */ static int -scalePosMatrix(int **fillPosMatrix, - int **nonposMatrix, - const char *matrixName, - double **posFreqs, - Uint1 *query, - int queryLength, - BlastScoreBlk* sbp) +s_ScalePosMatrix(int **fillPosMatrix, + int **nonposMatrix, + const char *matrixName, + double **posFreqs, + Uint1 *query, + int queryLength, + BlastScoreBlk* sbp) { Kappa_posSearchItems *posSearch = NULL; Kappa_compactSearchItems *compactSearch = NULL; _PSIInternalPssmData* internal_pssm = NULL; int status = PSI_SUCCESS; - posSearch = 
Kappa_posSearchItemsNew(queryLength, matrixName, + posSearch = Kappa_posSearchItemsNew(queryLength, matrixName, fillPosMatrix, posFreqs); compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp); @@ -1842,11 +376,13 @@ scalePosMatrix(int **fillPosMatrix, internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE); _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix, internal_pssm->ncols, internal_pssm->nrows); - _PSICopyMatrix_int(internal_pssm->scaled_pssm, posSearch->posPrivateMatrix, + _PSICopyMatrix_int(internal_pssm->scaled_pssm, + posSearch->posPrivateMatrix, internal_pssm->ncols, internal_pssm->nrows); - _PSICopyMatrix_double(internal_pssm->freq_ratios, posSearch->posFreqs, - internal_pssm->ncols, internal_pssm->nrows); - status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp, + _PSICopyMatrix_double(internal_pssm->freq_ratios, + posSearch->posFreqs, internal_pssm->ncols, + internal_pssm->nrows); + status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp, compactSearch->standardProb); if (status != PSI_SUCCESS) { internal_pssm = _PSIInternalPssmDataFree(internal_pssm); @@ -1854,26 +390,23 @@ scalePosMatrix(int **fillPosMatrix, compactSearch = Kappa_compactSearchItemsFree(compactSearch); return status; } - /* Copy data from new structures to posSearchItems */ _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm, internal_pssm->ncols, internal_pssm->nrows); - _PSICopyMatrix_int(posSearch->posPrivateMatrix, internal_pssm->scaled_pssm, + _PSICopyMatrix_int(posSearch->posPrivateMatrix, + internal_pssm->scaled_pssm, internal_pssm->ncols, internal_pssm->nrows); - _PSICopyMatrix_double(posSearch->posFreqs, internal_pssm->freq_ratios, + _PSICopyMatrix_double(posSearch->posFreqs, + internal_pssm->freq_ratios, internal_pssm->ncols, internal_pssm->nrows); - status = Kappa_impalaScaling(posSearch, - compactSearch, - (double) SCALING_FACTOR, - FALSE, - sbp); + status = Kappa_impalaScaling(posSearch, compactSearch, (double) 
+ SCALING_FACTOR, FALSE, sbp); if (status != 0) { internal_pssm = _PSIInternalPssmDataFree(internal_pssm); posSearch = Kappa_posSearchItemsFree(posSearch); compactSearch = Kappa_compactSearchItemsFree(compactSearch); return status; } - internal_pssm = _PSIInternalPssmDataFree(internal_pssm); posSearch = Kappa_posSearchItemsFree(posSearch); compactSearch = Kappa_compactSearchItemsFree(compactSearch); @@ -1881,167 +414,49 @@ scalePosMatrix(int **fillPosMatrix, } -/** - * Kappa_WindowInfo - a struct whose instances represent a range - * of data in a sequence. */ -typedef struct Kappa_WindowInfo +static BlastCompo_Alignment * +s_ResultHspToDistinctAlign(BlastQueryInfo* queryInfo, + BlastHSP * hsp_array[], Int4 hspcnt, + double localScalingFactor) { - Int4 begin; /**< the starting index of the range */ - Int4 end; /**< one beyond the last item in the range */ - Int4 frame; /**< the translation frame of this window */ - Int4 hspcnt; /**< the number of HSPs aligned to a subset of the data - in this window's range. */ -} Kappa_WindowInfo; - - -/** - * A datatype used solely to enable a list of windows and of indices - * to be simultaneously sorted in the WindowsFromHSPs routine. - */ -typedef struct Kappa_WindowIndexPair { - Kappa_WindowInfo * window; /**< a window */ - Int4 index; /**< an index associated with - "window," typically the index of - the window in a list, before the - list is sorted. */ -} Kappa_WindowIndexPair; - -/** - * A comparison routine used to sort a list of Kappa_WindowIndexPair - * objects first by frame and then by location. 
- */ -static int -location_compare_windows(const void * vp1, const void *vp2) -{ - /* w1 and w2 are the windows being compared */ - Kappa_WindowInfo * w1 = ((Kappa_WindowIndexPair *) vp1)->window; - Kappa_WindowInfo * w2 = ((Kappa_WindowIndexPair *) vp2)->window; - - Int4 result; /* result of the comparison */ - if(0 == (result = BLAST_CMP(w1->frame, w2->frame)) && - 0 == (result = BLAST_CMP(w1->begin, w2->begin))) { - result = BLAST_CMP(w1->end, w2->end); - } - return (int) result; -} - - -/** - * Reads a array of HSPs and creates a new array of pointers to - * Kappa_WindowInfo so that each element in the array of HSPs is - * contained in exactly one window - * - * @param hsp_array hsp array to be read [in] - * @param hspcnt length of hsp_array [in] - * @param border Number of extra amino acids to include - * at the start and end of each HSP. - * @param sequence_length length of the sequence containing these - * HSPs, in nucleotide coordinates. - * @param pwindows a pointer to an array of windows; - * the array may be resized by this routine. [in][out] - * @param nWindows the number of windows in *pwindows [in][out] - * @param lWindows the allocated length of *pwindows [in][out] - * @param window_of_hsp HSP i is contained in the bounds of - * window_of_hsp[i] [in][out] - */ -static void -WindowsFromHSPs( - BlastHSP * hsp_array[], - Int4 hspcnt, - Int4 border, - Int4 sequence_length, - Kappa_WindowInfo ***pwindows, - Int4 * nWindows, - Int4 * lWindows, - Int4 * window_of_hsp) -{ - Int4 k, ell; - Kappa_WindowIndexPair * window_and_index; /* an array of windows - * paired with the index - * of the HSP that - * generated them */ - Kappa_WindowInfo ** windows; /* the output list of windows */ - Int4 start_cluster; /* start of a cluster of windows to be joined */ - Int4 length_joined; /* the current length of the list of joined windows */ - - windows = *pwindows; - /* Make the window list have exactly hspcnt windows. 
*/ - if( *lWindows < hspcnt ) { - *lWindows = 2 * hspcnt; - windows = *pwindows = - realloc(*pwindows, *lWindows * sizeof(Kappa_WindowInfo*)); - } - for( k = *nWindows; k < hspcnt; k++ ) { - windows[k] = malloc(sizeof(Kappa_WindowInfo)); - } - for( k = hspcnt; k < *nWindows; k++ ) { - sfree(windows[k]); - } - *nWindows = hspcnt; - - window_and_index = calloc(hspcnt, sizeof(Kappa_WindowIndexPair)); - - for( k = 0; k < hspcnt; k++ ) { /* for all HSPs */ - /* length of the translation of the nucleotide sequence in this frame */ - Int4 translated_length; - - windows[k]->frame = hsp_array[k]->subject.frame; - - if( windows[k]->frame > 0 ) { - translated_length = (sequence_length - windows[k]->frame + 1)/3; - } else { - translated_length = (sequence_length + windows[k]->frame - 1)/3; + BlastCompo_Alignment *aligns = NULL, *tail = NULL, *new_align = NULL; + int hsp_index; + for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) { + int queryIndex, queryEnd, matchEnd; + BlastHSP * hsp = hsp_array[hsp_index]; + queryEnd = hsp->query.end; + matchEnd = hsp->subject.end; + /* YIKES! 
how do we handle multiple queries */ + /* + if(search->mult_queries != NULL) { + queryIndex = + GetQueryNum(search->mult_queries, + hsp->query_offset, queryEnd - 1, 0); + } else { + queryIndex = 0; + } + */ + queryIndex = 0; + new_align = + BlastCompo_AlignmentNew(hsp->score * localScalingFactor, + eNoCompositionAdjustment, + hsp->query.offset, queryEnd, queryIndex, + hsp->subject.offset, matchEnd, + hsp->subject.frame, hsp); + if (new_align == NULL) /* out of memory */ + goto error_return; + if (tail == NULL) { + aligns = new_align; + } else { + tail->next = new_align; + } + tail = new_align; } - windows[k]->begin = MAX(0, hsp_array[k]->subject.offset - border); - windows[k]->end = MIN(translated_length, - hsp_array[k]->subject.end + border); - windows[k]->hspcnt = 1; - - window_and_index[k].index = k; - window_and_index[k].window = windows[k]; - } - qsort(window_and_index, hspcnt, sizeof(Kappa_WindowIndexPair), - location_compare_windows); - - /* Join windows that overlap or are too close together. */ - start_cluster = 0; - length_joined = 0; - for( k = 0; k < hspcnt; k++ ) { /* for all windows in the - original list */ - Kappa_WindowInfo * window; /* window at this value of k */ - Kappa_WindowInfo * nextWindow; /* window at the next value of k, or - NULL if no such window exists */ - window = window_and_index[k].window; - nextWindow = ( k + 1 < hspcnt ) ? window_and_index[k+1].window : NULL; - - if(nextWindow != NULL && /* there is a next window; and */ - window->frame == nextWindow->frame && /* it is in the same frame; and - it is very near this one */ - window->end >= nextWindow->begin) { - /* Join the current window with the next window. Do not add the - current window to the output list. 
*/ - nextWindow->begin = MIN(window->begin, nextWindow->begin); - nextWindow->end = MAX(window->end, nextWindow->end ); - - sfree(window); - window_and_index[k].window = NULL; /* Set the now dangling - pointer to NULL */ - } else { - /* Don't join the current window with the next window. Add the - current window to the output list instead */ - windows[length_joined] = window; - for( ell = start_cluster; ell <= k; ell++ ) { - window_of_hsp[window_and_index[ell].index] = length_joined; - } - length_joined++; - start_cluster = k + 1; - } /* end else don't join the current window with the next window */ - } /* end for all windows in the original list */ - *nWindows = length_joined; - for( k = length_joined; k < hspcnt; k++ ) { - windows[k] = NULL; - } - sfree(window_and_index); + goto normal_return; + error_return: + BlastCompo_AlignmentsFree(&aligns, NULL); + normal_return: + return aligns; } @@ -2075,45 +490,45 @@ WindowsFromHSPs( * algorithm */ static void -Kappa_SWFindFinalEndsUsingXdrop( - Kappa_SequenceData * query, - Int4 queryStart, - Int4 queryEnd, - Kappa_SequenceData * subject, - Int4 matchStart, - Int4 matchEnd, - BlastGapAlignStruct* gap_align, - const BlastScoringParameters* scoringParams, - Int4 score, - double localScalingFactor, - Int4 * queryAlignmentExtent, - Int4 * matchAlignmentExtent, - Int4 * newScore) +s_SWFindFinalEndsUsingXdrop( + BlastCompo_SequenceData * query, + Int4 queryStart, + Int4 queryEnd, + BlastCompo_SequenceData * subject, + Int4 matchStart, + Int4 matchEnd, + BlastGapAlignStruct* gap_align, + const BlastScoringParameters* scoringParams, + Int4 score, + double localScalingFactor, + Int4 * queryAlignmentExtent, + Int4 * matchAlignmentExtent, + Int4 * newScore) { - Int4 XdropAlignScore; /* alignment score obtained using X-dropoff - * method rather than Smith-Waterman */ - Int4 doublingCount = 0; /* number of times X-dropoff had to be - * doubled */ - - GapPrelimEditBlockReset(gap_align->rev_prelim_tback); - 
GapPrelimEditBlockReset(gap_align->fwd_prelim_tback); - do { - XdropAlignScore = - ALIGN_EX(&(query->data[queryStart]) - 1, - &(subject->data[matchStart]) - 1, - queryEnd - queryStart + 1, matchEnd - matchStart + 1, - queryAlignmentExtent, - matchAlignmentExtent, gap_align->fwd_prelim_tback, - gap_align, scoringParams, queryStart - 1, FALSE, FALSE); - - gap_align->gap_x_dropoff *= 2; - doublingCount++; - if((XdropAlignScore < score) && (doublingCount < 3)) { - GapPrelimEditBlockReset(gap_align->fwd_prelim_tback); - } - } while((XdropAlignScore < score) && (doublingCount < 3)); + Int4 XdropAlignScore; /* alignment score obtained using X-dropoff + * method rather than Smith-Waterman */ + Int4 doublingCount = 0; /* number of times X-dropoff had to be + * doubled */ + + GapPrelimEditBlockReset(gap_align->rev_prelim_tback); + GapPrelimEditBlockReset(gap_align->fwd_prelim_tback); + do { + XdropAlignScore = + ALIGN_EX(&(query->data[queryStart]) - 1, + &(subject->data[matchStart]) - 1, + queryEnd - queryStart + 1, matchEnd - matchStart + 1, + queryAlignmentExtent, + matchAlignmentExtent, gap_align->fwd_prelim_tback, + gap_align, scoringParams, queryStart - 1, FALSE, FALSE); + + gap_align->gap_x_dropoff *= 2; + doublingCount++; + if((XdropAlignScore < score) && (doublingCount < 3)) { + GapPrelimEditBlockReset(gap_align->fwd_prelim_tback); + } + } while((XdropAlignScore < score) && (doublingCount < 3)); - *newScore = XdropAlignScore; + *newScore = XdropAlignScore; } @@ -2127,24 +542,23 @@ Kappa_SWFindFinalEndsUsingXdrop( * We draw a distinction between a sequence itself, and strings of * data that may be obtained from the sequence. The amino * acid/nucleotide data is represented by an object of type - * Kappa_SequenceData. There may be more than one instance of - * Kappa_SequenceData per Kappa_MatchingSequence, each representing a + * BlastCompo_SequenceData. 
There may be more than one instance of + * BlastCompo_SequenceData per Kappa_MatchingSequence, each representing a * different range in the sequence, or a different translation frame. */ -typedef struct Kappa_MatchingSequence { - Int4 length; /**< length of this matching sequence */ - Int4 index; /**< index of this sequence in the database */ - EBlastProgramType prog_number; /**< identifies the type of blast search being - performed. The type of search determines - how sequence data should be obtained. */ - const Uint1* genetic_code; /**< genetic code for translated searches */ - const BlastSeqSrc* seq_src; /**< BLAST sequence data source */ - BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method of the - BlastSeqSrc (@todo this structure was - designed to be allocated on the stack, i.e.: - in Kappa_MatchingSequenceInitialize) - */ -} Kappa_MatchingSequence; +typedef struct Kappa_SequenceLocalData { + EBlastProgramType prog_number; /**< identifies the type of blast + search being performed. The type + of search determines how sequence + data should be obtained. */ + const Uint1* genetic_code; /**< genetic code for translated searches */ + const BlastSeqSrc* seq_src; /**< BLAST sequence data source */ + BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method + of the BlastSeqSrc (@todo this + structure was designed to be + allocated on the stack, i.e.: in + Kappa_MatchingSequenceInitialize) */ +} Kappa_SequenceLocalData; /** @@ -2155,47 +569,59 @@ typedef struct Kappa_MatchingSequence { * @param seqSrc A pointer to a source from which sequence data * may be obtained * @param program_number identifies the type of blast search being - performed. + * performed. 
* @param gen_code_string genetic code for translated queries * @param subject_index index of the matching sequence in the database */ static void -Kappa_MatchingSequenceInitialize( - Kappa_MatchingSequence * self, - EBlastProgramType program_number, - const BlastSeqSrc* seqSrc, - const Uint1* gen_code_string, - Int4 subject_index) +s_MatchingSequenceInitialize( + BlastCompo_MatchingSequence * self, + EBlastProgramType program_number, + const BlastSeqSrc* seqSrc, + const Uint1* gen_code_string, + Int4 subject_index) { - self->seq_src = seqSrc; - self->prog_number = program_number; - self->genetic_code = gen_code_string; - - memset((void*) &self->seq_arg, 0, sizeof(self->seq_arg)); - self->seq_arg.oid = self->index = subject_index; - - if( program_number == eBlastTypeTblastn ) { - self->seq_arg.encoding = eBlastEncodingNcbi4na; - } else { - self->seq_arg.encoding = eBlastEncodingProtein; - } - - if (BlastSeqSrcGetSequence(seqSrc, (void*) &self->seq_arg) < 0) - return; - self->length = BlastSeqSrcGetSeqLen(seqSrc, (void*) &self->seq_arg); + Kappa_SequenceLocalData * local_data = + malloc(sizeof(Kappa_SequenceLocalData)); + self->local_data = local_data; + + local_data->seq_src = seqSrc; + local_data->prog_number = program_number; + local_data->genetic_code = gen_code_string; + + memset((void*) &local_data->seq_arg, 0, sizeof(local_data ->seq_arg)); + local_data->seq_arg.oid = self->index = subject_index; + + if( program_number == eBlastTypeTblastn ) { + local_data->seq_arg.encoding = eBlastEncodingNcbi4na; + } else { + local_data->seq_arg.encoding = eBlastEncodingProtein; + } + if (BlastSeqSrcGetSequence(seqSrc, (void*) &local_data->seq_arg) >= 0) { + self->length = + BlastSeqSrcGetSeqLen(seqSrc, (void*) &local_data->seq_arg); + } else { + self->length = 0; + } } /** Release the resources associated with a matching sequence. 
*/ static void -Kappa_MatchingSequenceRelease(Kappa_MatchingSequence * self) +s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self) { - BlastSeqSrcReleaseSequence(self->seq_src, (void*)&self->seq_arg); - BlastSequenceBlkFree(self->seq_arg.seq); + if (self != NULL) { + Kappa_SequenceLocalData * local_data = self->local_data; + BlastSeqSrcReleaseSequence(local_data->seq_src, + (void*)&local_data->seq_arg); + BlastSequenceBlkFree(local_data->seq_arg.seq); + free(self->local_data); + self->local_data = NULL; + } } -/** NCBIstdaa encoding for 'X' character (@todo is this really needed?) */ +/** NCBIstdaa encoding for 'X' character */ #define BLASTP_MASK_RESIDUE 21 /** Default instructions and mask residue for SEG filtering */ #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1" @@ -2205,86 +631,87 @@ Kappa_MatchingSequenceRelease(Kappa_MatchingSequence * self) * Obtain a string of translated data * * @param self the sequence from which to obtain the data [in] - * @param window the range and tranlation frame to get [in] + * @param range the range and translation frame to get [in] * @param seqData the resulting data [out] */ static void -Kappa_SequenceGetTranslatedWindow(Kappa_MatchingSequence * self, - Kappa_WindowInfo * window, - Kappa_SequenceData * seqData ) +s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self, + const BlastCompo_SequenceRange * range, + BlastCompo_SequenceData * seqData ) { - ASSERT( 0 && "Not implemented" ); + ASSERT( 0 && "Not implemented" ); } /** - * Obtain the sequence data that lies within the given window. + * Obtain the sequence data that lies within the given range. 
* * @param self sequence information [in] - * @param window window specifying the range of data [in] + * @param range range specifying the range of data [in] * @param seqData the sequence data obtained [out] */ -static void -Kappa_SequenceGetWindow( - Kappa_MatchingSequence * self, - Kappa_WindowInfo * window, - Kappa_SequenceData * seqData ) +static int +s_SequenceGetRange( + const BlastCompo_MatchingSequence * self, + const BlastCompo_SequenceRange * range, + BlastCompo_SequenceData * seqData ) { - if(self->prog_number == eBlastTypeTblastn) { - /* The sequence must be translated. */ - Kappa_SequenceGetTranslatedWindow(self, window, seqData); - } else { - /* The sequence does not need to be translated. */ - Int4 idx; - Uint1 *origData; /* the unfiltered data for the sequence */ - - /* Copy the entire sequence (necessary for SEG filtering.) */ - seqData->buffer = calloc((self->length + 2), sizeof(Uint1)); - /* First and last characters of the buffer MUST be '\0', which is - * true here because the buffer was allocated using calloc. */ - seqData->data = seqData->buffer + 1; - seqData->length = self->length; - - origData = self->seq_arg.seq->sequence; - for( idx = 0; idx < seqData->length; idx++ ) { - /* Copy the sequence data, replacing occurrences of amino acid - * number 24 (Selenocysteine) with number 21 (Undetermined or - * atypical). */ - if(origData[idx] != 24) { - seqData->data[idx] = origData[idx]; - } else { - seqData->data[idx] = 21; - fprintf(stderr, "Selenocysteine (U) at position %ld" - " replaced by X\n", - (long) idx + 1); - } - } + Kappa_SequenceLocalData * local_data = self->local_data; + if (local_data->prog_number == eBlastTypeTblastn) { + /* The sequence must be translated. */ + s_SequenceGetTranslatedRange(self, range, seqData); + } else { + /* The sequence does not need to be translated. */ + Int4 idx; + Uint1 *origData; /* the unfiltered data for the sequence */ + + /* Copy the entire sequence (necessary for SEG filtering.) 
*/ + seqData->buffer = calloc((self->length + 2), sizeof(Uint1)); + /* First and last characters of the buffer MUST be '\0', which is + * true here because the buffer was allocated using calloc. */ + seqData->data = seqData->buffer + 1; + seqData->length = self->length; + + origData = local_data->seq_arg.seq->sequence; + for (idx = 0; idx < seqData->length; idx++) { + /* Copy the sequence data, replacing occurrences of amino acid + * number 24 (Selenocysteine) with number 21 (Undetermined or + * atypical). */ + if (origData[idx] != 24) { + seqData->data[idx] = origData[idx]; + } else { + seqData->data[idx] = 21; + fprintf(stderr, "Selenocysteine (U) at position %ld" + " replaced by X\n", + (long) idx + 1); + } + } #ifndef KAPPA_NO_SEG_SEQUENCE - /*take as input an amino acid string and its length; compute a filtered - amino acid string and return the filtered string*/ - {{ - BlastSeqLoc* mask_seqloc; - const EBlastProgramType k_program_name = eBlastTypeBlastp; - SBlastFilterOptions* filter_options; - - BlastFilteringOptionsFromString(k_program_name, BLASTP_MASK_INSTRUCTIONS, &filter_options, NULL); - - BlastSetUp_Filter(k_program_name, seqData->data, seqData->length, - 0, filter_options, &mask_seqloc, NULL); - - filter_options = SBlastFilterOptionsFree(filter_options); - - Blast_MaskTheResidues(seqData->data, seqData->length, - FALSE, mask_seqloc, FALSE, 0); - - mask_seqloc = BlastSeqLocFree(mask_seqloc); - }} + /* take as input an amino acid string and its length; compute + * a filtered amino acid string and return the filtered string */ + {{ + BlastSeqLoc* mask_seqloc; + const EBlastProgramType k_program_name = eBlastTypeBlastp; + SBlastFilterOptions* filter_options; + + BlastFilteringOptionsFromString(k_program_name, + BLASTP_MASK_INSTRUCTIONS, + &filter_options, NULL); + BlastSetUp_Filter(k_program_name, seqData->data, seqData->length, + 0, filter_options, &mask_seqloc, NULL); + filter_options = SBlastFilterOptionsFree(filter_options); + + 
Blast_MaskTheResidues(seqData->data, seqData->length, + FALSE, mask_seqloc, FALSE, 0); + mask_seqloc = BlastSeqLocFree(mask_seqloc); + }} #endif - /* Fit the data to the window. */ - seqData ->data = &seqData->data[window->begin - 1]; - *seqData->data++ = '\0'; - seqData ->length = window->end - window->begin; - } /* end else the sequence does not need to be translated */ + /* Fit the data to the range. */ + seqData ->data = &seqData->data[range->begin - 1]; + *seqData->data++ = '\0'; + seqData ->length = range->end - range->begin; + } /* end else the sequence does not need to be translated */ + return 0; } @@ -2303,49 +730,95 @@ Kappa_SequenceGetWindow( * @param query the query data [in] * @param subject the subject data [in] */ +/* WHY */ static void -StartingPointForHit( - Int4 * q_start, - Int4 * s_start, - const BlastScoreBlk* sbp, - Boolean positionBased, - BlastHSP * hsp, - Kappa_WindowInfo * window, - Kappa_SequenceData * query, - Kappa_SequenceData * subject) +s_StartingPointForHit(Int4 * q_start, + Int4 * s_start, + const BlastScoreBlk* sbp, + Boolean positionBased, + BlastHSP * hsp, + BlastCompo_SequenceRange * range, + BlastCompo_SequenceData * query, + BlastCompo_SequenceData * subject) +{ + hsp->subject.offset -= range->begin; + hsp->subject.gapped_start -= range->begin; + + if(BLAST_CheckStartForGappedAlignment(hsp, query->data, + subject->data, sbp)) { + /* We may use the starting point supplied by the HSP. 
*/ + *q_start = hsp->query.gapped_start; + *s_start = hsp->subject.gapped_start; + } else { + /* We must recompute the start for the gapped alignment, as the + one in the HSP was unacceptable.*/ + *q_start = + BlastGetStartForGappedAlignment(query->data, + subject->data, sbp, + hsp->query.offset, + hsp->query.end - + hsp->query.offset, + hsp->subject.offset, + hsp->subject.end - + hsp->subject.offset); + *s_start = + (hsp->subject.offset - hsp->query.offset) + *q_start; + } +} + + +struct Blast_GappingParamsContext { + BlastGapAlignStruct * gap_align; + const BlastScoringParameters* scoringParams; + BlastScoreBlk* sbp; + double localScalingFactor; + Int4 prog_number; +}; +typedef struct Blast_GappingParamsContext Blast_GappingParamsContext; + + +/** + * Reads a GapAlignBlk that has been used to compute a traceback, and + * return a BlastCompo_Alignment representing the alignment. + * + * @param gap_align the GapAlignBlk + * @param window the window used to compute the traceback + */ +static BlastCompo_Alignment * +s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align, + BlastCompo_SequenceRange * query_range, + BlastCompo_SequenceRange * subject_range, + int whichMode) { - hsp->subject.offset -= window->begin; - hsp->subject.gapped_start -= window->begin; - - if(BLAST_CheckStartForGappedAlignment(hsp, query->data, - subject->data, sbp)) { - /* We may use the starting point supplied by the HSP. 
*/ - *q_start = hsp->query.gapped_start; - *s_start = hsp->subject.gapped_start; - } else { - /* We must recompute the start for the gapped alignment, as the - one in the HSP was unacceptable.*/ - *q_start = - BlastGetStartForGappedAlignment(query->data, subject->data, sbp, - hsp->query.offset, hsp->query.end - hsp->query.offset, - hsp->subject.offset, hsp->subject.end - hsp->subject.offset); - - *s_start = - (hsp->subject.offset - hsp->query.offset) + *q_start; - } + int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame; + BlastCompo_Alignment * obj; /* the new alignment */ + + queryStart = gap_align->query_start + query_range->begin; + queryEnd = gap_align->query_stop + query_range->begin; + queryIndex = query_range->context; + matchStart = gap_align->subject_start + subject_range->begin; + matchEnd = gap_align->subject_stop + subject_range->begin; + frame = subject_range->context; + + obj = BlastCompo_AlignmentNew(gap_align->score, whichMode, + queryStart, queryEnd, queryIndex, + matchStart, matchEnd, frame, + gap_align->edit_script); + gap_align->edit_script = NULL; + return obj; } /** - * Create a new Kappa_DistinctAlignment and append the list of + * Create a new BlastCompo_Alignment and append the list of * alignments represented by "next." 
* * @param query query sequence data * @param queryStart the start of the alignment in the query * @param queryEnd the end of the alignment in the query * @param subject subject sequence data - * @param matchStart the start of the alignment in the subject window - * @param matchEnd the end of the alignment in the subject window + * @param matchStart the start of the alignment in the subject range + * @param matchEnd the end of the alignment in the subject range * @param score the score of this alignment * @param window the subject window of this alignment * @param gap_align alignment info for gapped alignments @@ -2356,210 +829,163 @@ StartingPointForHit( * @param prog_number the type of alignment being performed * @param next preexisting list of alignments [out] */ -static Kappa_DistinctAlignment * -NewAlignmentUsingXdrop( - Kappa_SequenceData * query, - Int4 queryStart, - Int4 queryEnd, - Kappa_SequenceData * subject, - Int4 matchStart, - Int4 matchEnd, - Int4 score, - Kappa_WindowInfo * window, - BlastGapAlignStruct * gap_align, - const BlastScoringParameters* scoringParams, - double localScalingFactor, - Int4 prog_number, - Kappa_DistinctAlignment * next) +static int +s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign, + Int4 * pqueryEnd, Int4 *pmatchEnd, + Int4 queryStart, Int4 matchStart, Int4 score, + BlastCompo_SequenceData * query, + BlastCompo_SequenceRange * query_range, + Int4 queryLength, + BlastCompo_SequenceData * subject, + BlastCompo_SequenceRange * subject_range, + Int4 subjectLength, + BlastCompo_GappingParams * gapping_params, + ECompoAdjustModes whichMode) { - Int4 newScore; - /* Extent of the alignment as computed by an x-drop alignment - * (usually the same as (queryEnd - queryStart) and (matchEnd - - * matchStart)) */ - Int4 queryExtent, matchExtent; - Kappa_DistinctAlignment * obj; /* the new object */ - - Kappa_SWFindFinalEndsUsingXdrop(query, queryStart, queryEnd, - subject, matchStart, matchEnd, - gap_align, scoringParams, - 
score, localScalingFactor, - &queryExtent, &matchExtent, - &newScore); - obj = malloc(sizeof(Kappa_DistinctAlignment)); - obj->editScript = - Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback, - gap_align->fwd_prelim_tback); - - obj->score = newScore; - obj->queryStart = queryStart; - obj->queryEnd = obj->queryStart + queryExtent; - obj->matchStart = matchStart + window->begin; - obj->matchEnd = obj->matchStart + matchExtent; - obj->frame = window->frame; - - obj->next = next; - - return obj; + Int4 newScore; + /* Extent of the alignment as computed by an x-drop alignment + * (usually the same as (queryEnd - queryStart) and (matchEnd - + * matchStart)) */ + Int4 queryExtent, matchExtent; + BlastCompo_Alignment * obj; /* the new object */ + Blast_GappingParamsContext * context = gapping_params->context; + BlastGapAlignStruct * gap_align = context->gap_align; + const BlastScoringParameters* scoringParams = context->scoringParams; + double localScalingFactor = context->localScalingFactor; + GapEditScript* editScript; + + s_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd, + subject, matchStart, *pmatchEnd, + gap_align, scoringParams, + score, localScalingFactor, + &queryExtent, &matchExtent, + &newScore); + *pqueryEnd = queryStart + queryExtent; + *pmatchEnd = matchStart + matchExtent; + + editScript = + Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback, + gap_align->fwd_prelim_tback); + obj = BlastCompo_AlignmentNew(newScore, whichMode, + queryStart, *pqueryEnd, + query_range->context, + matchStart, *pmatchEnd, + subject_range->context, editScript); + *pnewAlign = obj; + + return 0; } -/** - * Reads a GapAlignBlk that has been used to compute a traceback, and - * return a Kappa_DistinctAlignment representing the alignment. 
- * - * @param gap_align the GapAlignBlk - * @param window the window used to compute the traceback - */ -static Kappa_DistinctAlignment * -NewAlignmentFromGapAlign( - BlastGapAlignStruct * gap_align, - Kappa_WindowInfo * window) +static BlastCompo_Alignment * +s_RedoOneAlignment(BlastCompo_Alignment * in_align, + ECompoAdjustModes whichMode, + BlastCompo_SequenceData * query_data, + BlastCompo_SequenceRange * query_range, + int ccat_query_length, + BlastCompo_SequenceData * subject_data, + BlastCompo_SequenceRange * subject_range, + int full_subject_length, + BlastCompo_GappingParams * gapping_params) { - Kappa_DistinctAlignment * obj; /* the new alignment */ - obj = malloc(sizeof(Kappa_DistinctAlignment)); - - obj->score = gap_align->score; - obj->queryStart = gap_align->query_start; - obj->queryEnd = gap_align->query_stop; - obj->matchStart = gap_align->subject_start + window->begin; - obj->matchEnd = gap_align->subject_stop + window->begin; - obj->frame = window->frame; - - obj->editScript = gap_align->edit_script; - gap_align->edit_script = NULL; /* set to NULL to avoid aliasing */ - obj->next = NULL; - - return obj; + Int4 q_start, s_start; + Blast_GappingParamsContext * context = gapping_params->context; + BlastScoreBlk* sbp = context->sbp; + BlastGapAlignStruct* gapAlign = context->gap_align; + Boolean positionBased = (sbp->psi_matrix ? 
TRUE : FALSE); + BlastHSP * hsp = in_align->context; + + s_StartingPointForHit(&q_start, &s_start, sbp, positionBased, + hsp, subject_range, query_data, subject_data); + gapAlign->gap_x_dropoff = gapping_params->x_dropoff; + + BLAST_GappedAlignmentWithTraceback(context->prog_number, + query_data->data, + subject_data->data, gapAlign, + context->scoringParams, + q_start, s_start, + query_data->length, + subject_data->length); + return s_NewAlignmentFromGapAlign(gapAlign, query_range, subject_range, + whichMode); } /** - * A Kappa_SearchParameters represents the data needed by + * A s_SearchParameters represents the data needed by * RedoAlignmentCore to adjust the parameters of a search, including * the original value of these parameters */ -typedef struct Kappa_SearchParameters { - Int4 gapOpen; /**< a penalty for the existence of a gap */ - Int4 gapExtend; /**< a penalty for each residue (or - nucleotide) in the gap */ - Int4 gapDecline; /**< a penalty for declining to align a pair - of residues */ - Int4 mRows; /**< the number of rows in a scoring matrix. */ - Int4 nCols; /**< the number of columns in a scoring - matrix */ - - double scaledUngappedLambda; /**< The value of Karlin-Altschul - parameter lambda, rescaled - to allow scores to have - greater precision */ - Int4 **origMatrix; /**< The original matrix values */ - Int4 **startMatrix; /**< Rescaled values of the original matrix */ - - double **startFreqRatios; /**< frequency ratios to start - investigating each pair */ - double *scoreArray; /**< array of score probabilities */ - double *resProb; /**< array of probabilities for each residue - in a matching sequence */ - double *queryProb; /**< array of probabilities for each residue - in the query */ - Boolean adjustParameters; /**< Use composition-based statistics - if true. 
*/ - - Blast_ScoreFreq* return_sfp; /**< score frequency pointers to - compute lambda */ - Blast_KarlinBlk *kbp_gap_orig; /**< copy of the original gapped - Karlin-Altschul block corresponding to - the first context */ - Blast_KarlinBlk **orig_kbp_gap_array; /**< pointer to the array of gapped - Karlin-Altschul block for all - contexts (@todo is this really - needed?) */ - double scale_factor; /**< The original scale factor (to be restored). */ -} Kappa_SearchParameters; +typedef struct s_SearchParameters { + Int4 gap_open; /**< a penalty for the existence of a gap */ + Int4 gapExtend; /**< a penalty for each residue in the + gap */ + Int4 gapDecline; /**< a penalty for declining to align a pair + of residues */ + double scale_factor; /**< the original scale factor */ + Int4 **origMatrix; /**< The original matrix values */ + double original_expect_value; /**< expect value on entry */ + /** copy of the original gapped Karlin-Altschul block + * corresponding to the first context */ + Blast_KarlinBlk* kbp_gap_orig; + /** pointer to the array of gapped Karlin-Altschul block for all + * contexts; needed to restore the search to its original + * configuration. 
*/ + Blast_KarlinBlk** orig_kbp_gap_array; +} s_SearchParameters; /** - * Release the data associated with a Kappa_SearchParameters and + * Release the data associated with a s_SearchParameters and * delete the object * @param searchParams the object to be deleted [in][out] */ static void -Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams) +s_SearchParametersFree(s_SearchParameters ** searchParams) { - /* for convenience, remove one level of indirection from searchParams */ - Kappa_SearchParameters *sp = *searchParams; + /* for convenience, remove one level of indirection from searchParams */ + s_SearchParameters *sp = *searchParams; - if(sp->kbp_gap_orig) Blast_KarlinBlkFree(sp->kbp_gap_orig); + if(sp->kbp_gap_orig) Blast_KarlinBlkFree(sp->kbp_gap_orig); - if(sp->startMatrix) - _PSIDeallocateMatrix((void**) sp->startMatrix, sp->mRows); - if(sp->origMatrix) - _PSIDeallocateMatrix((void**) sp->origMatrix, sp->mRows); - if(sp->startFreqRatios) - _PSIDeallocateMatrix((void**) sp->startFreqRatios, sp->mRows); + Nlm_Int4MatrixFree(&sp->origMatrix); - if(sp->return_sfp) sfree(sp->return_sfp); - if(sp->scoreArray) sfree(sp->scoreArray); - if(sp->resProb) sfree(sp->resProb); - if(sp->queryProb) sfree(sp->queryProb); - - sfree(*searchParams); - *searchParams = NULL; + sfree(*searchParams); + *searchParams = NULL; } /** - * Create a new instance of Kappa_SearchParameters + * Create a new instance of s_SearchParameters * * @param rows number of rows in the scoring matrix - * @param adjustParameters if true, use composition-based statistics + * @param adjustParameters if >0, use composition-based statistics + * @param numQueries the number of queries in the concatenated + * query * @param positionBased if true, the search is position-based */ -static Kappa_SearchParameters * -Kappa_SearchParametersNew( - Int4 rows, - Boolean adjustParameters, - Boolean positionBased) +static s_SearchParameters * +s_SearchParametersNew( + Int4 rows, + Int4 adjustParameters, + 
Boolean positionBased) { - Kappa_SearchParameters *sp; /* the new object */ - sp = malloc(sizeof(Kappa_SearchParameters)); - - sp->orig_kbp_gap_array = NULL; - - sp->mRows = positionBased ? rows : BLASTAA_SIZE; - sp->nCols = BLASTAA_SIZE; - - sp->kbp_gap_orig = NULL; - sp->startMatrix = NULL; - sp->origMatrix = NULL; - sp->startFreqRatios = NULL; - sp->return_sfp = NULL; - sp->scoreArray = NULL; - sp->resProb = NULL; - sp->queryProb = NULL; - sp->adjustParameters = adjustParameters; - - if(adjustParameters) { + s_SearchParameters *sp; /* the new object */ + sp = malloc(sizeof(s_SearchParameters)); + + sp->orig_kbp_gap_array = NULL; + sp->kbp_gap_orig = NULL; + sp->origMatrix = NULL; + sp->kbp_gap_orig = Blast_KarlinBlkNew(); - sp->startMatrix = (Int4**) _PSIAllocateMatrix(sp->mRows, sp->nCols, - sizeof(Int4)); - sp->origMatrix = (Int4**) _PSIAllocateMatrix(sp->mRows, sp->nCols, - sizeof(Int4)); - sp->resProb = - (double *) calloc(BLASTAA_SIZE, sizeof(double)); - sp->scoreArray = - (double *) calloc(kScoreMatrixScoreRange, sizeof(double)); - sp->return_sfp = - (Blast_ScoreFreq*) calloc(1, sizeof(Blast_ScoreFreq)); - - if(!positionBased) { - sp->queryProb = - (double *) calloc(BLASTAA_SIZE, sizeof(double)); + if (adjustParameters) { + if (positionBased) { + sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE); + } else { + sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE); + } } - } - /* end if(adjustParameters) */ - - return sp; + return sp; } @@ -2567,677 +993,629 @@ Kappa_SearchParametersNew( * Record the initial value of the search parameters that are to be * adjusted. * - * @param searchParams the object to be filled in [in|out] - * @param queryBlk query sequence [in] - * @param queryInfo query sequence information [in] - * @param sbp Scoring Blk (contains Karlin-Altschul parameters) [in] - * @param scoring gap-open/extend/decline_align information [in] - * @param positionBased is this search position-specific? 
[in] - * @todo instead of hard coding 0 for context we should use queryInfo + * @param searchParams holds the recorded values [out] + * @param search the search parameters [in] + * @param query a list of query data [in] + * @param numQueries the length of the array query [in] */ static void -Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams, - BLAST_SequenceBlk * queryBlk, - BlastQueryInfo* queryInfo, - BlastScoreBlk* sbp, - const BlastScoringParameters* scoring, - Boolean positionBased) +s_RecordInitialSearch(s_SearchParameters * searchParams, + BLAST_SequenceBlk * queryBlk, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + const BlastScoringParameters* scoring, + int query_length, + Boolean adjustParameters, + Boolean positionBased) { - Uint1* query; /* the query sequence */ - Int4 queryLength; /* the length of the query sequence */ - const Int4 kContextOffset = queryInfo->contexts[0].query_offset; /* offset in buffer of start of query. */ - - query = &queryBlk->sequence[kContextOffset]; - queryLength = queryInfo->contexts[0].query_length; - ASSERT((0 == queryInfo->first_context) && - (queryInfo->first_context == queryInfo->last_context)); - - if(searchParams->adjustParameters) { - Int4 i, j; Blast_KarlinBlk* kbp; /* statistical parameters used to evaluate a - * query-subject pair */ - Int4 **matrix; /* matrix used to score a local - query-subject alignment */ - - if(positionBased) { - matrix = sbp->psi_matrix->pssm->data; - ASSERT(queryLength == searchParams->mRows); - ASSERT(queryLength == (Int4)sbp->psi_matrix->pssm->ncols); - } else { - matrix = sbp->matrix->data; - Blast_FillResidueProbability(query, queryLength, searchParams->queryProb); - } - kbp = sbp->kbp_gap[0]; - searchParams->gapOpen = scoring->gap_open; + * query-subject pair */ + /* YIKES! How do I get these! 
*/ + /* + searchParams->original_expect_value = search->pbp->cutoff_e; + */ + searchParams->gap_open = scoring->gap_open; searchParams->gapExtend = scoring->gap_extend; searchParams->gapDecline = scoring->decline_align; - searchParams->scale_factor = scoring->scale_factor; + searchParams->scale_factor = scoring->scale_factor; searchParams->orig_kbp_gap_array = sbp->kbp_gap; - + kbp = sbp->kbp_gap[0]; Blast_KarlinBlkCopy(searchParams->kbp_gap_orig, kbp); - for(i = 0; i < searchParams->mRows; i++) { - for(j = 0; j < BLASTAA_SIZE; j++) { - searchParams->origMatrix[i][j] = matrix[i][j]; - } + if (adjustParameters) { + Int4 **matrix; + Int4 i, j; /* iteration indices */ + int rows; + if (positionBased) { + matrix = sbp->psi_matrix->pssm->data; + rows = query_length; + } else { + matrix = sbp->matrix->data; + rows = BLASTAA_SIZE; + } + for (i = 0; i < rows; i++) { + for (j = 0; j < BLASTAA_SIZE; j++) { + searchParams->origMatrix[i][j] = matrix[i][j]; + } + } } - } } /** * Rescale the search parameters in the search object and options * object to obtain more precision. - * - * @param sp record of parameters used and frequencies [in|out] - * @param queryBlk query sequence [in] - * @param queryInfo query sequence information [in] - * @param sbp Scoring Blk (contains Karlin-Altschul parameters) [in] - * @param scoringParams gap-open/extend/decline_align information [in] - * @param positionBased is this search position-specific? [in] - * @return scaling-factor to be used. 
*/ -static double -Kappa_RescaleSearch(Kappa_SearchParameters * sp, - BLAST_SequenceBlk* queryBlk, - BlastQueryInfo* queryInfo, - BlastScoreBlk* sbp, - BlastScoringParameters* scoringParams, - Boolean positionBased) +static void +s_RescaleSearch(s_SearchParameters * sp, + BLAST_SequenceBlk* queryBlk, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + BlastScoringParameters* scoringParams, + double localScalingFactor, + Boolean positionBased) { - double localScalingFactor; /* the factor by which to - * scale the scoring system in - * order to obtain greater - * precision */ - - if(!sp->adjustParameters) { - localScalingFactor = 1.0; - } else { - double initialUngappedLambda; /* initial value of the - * statistical parameter - * lambda used to evaluate - * ungapped alignments */ - Blast_KarlinBlk* kbp; /* the statistical parameters used to - * evaluate alignments of a - * query-subject pair */ - Uint1* query; /* the query sequence */ - Int4 queryLength; /* the length of the query sequence */ - - if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) { - localScalingFactor = SCALING_FACTOR / 10; - } else { - localScalingFactor = SCALING_FACTOR; - } + Blast_KarlinBlk* kbp; /* the statistical parameters used to + * evaluate alignments of a + * query-subject pair */ + kbp = sbp->kbp_gap[0]; + kbp->Lambda /= localScalingFactor; + kbp->logK = log(kbp->K); + /* YIKES! 
and what about the cutoff_e */ + /* + search->pbp->cutoff_e = options->kappa_expect_value; + */ + scoringParams->gap_open = BLAST_Nint(sp->gap_open * localScalingFactor); + scoringParams->gap_extend = BLAST_Nint(sp->gapExtend * localScalingFactor); scoringParams->scale_factor = localScalingFactor; + if (sp->gapDecline != INT2_MAX) { + scoringParams->decline_align = + BLAST_Nint(sp->gapDecline * localScalingFactor); + } +} - scoringParams->gap_open = BLAST_Nint(sp->gapOpen * localScalingFactor); - scoringParams->gap_extend = BLAST_Nint(sp->gapExtend * localScalingFactor); - if(sp->gapDecline != INT2_MAX) { - scoringParams->decline_align = - BLAST_Nint(sp->gapDecline * localScalingFactor); + +/** + * Restore the parameters that were adjusted to their original values + * @param searchParams a record of the original values [in] + * @param search the search to be restored [out] + * @param options the option block to be restored [out] + * @param matrix the scoring matrix to be restored [out] + * @param SmithWaterman if true, we have performed a Smith-Waterman + * alignment with these search parameters [in] + */ +static void +s_RestoreSearch(s_SearchParameters * searchParams, + BlastScoreBlk* sbp, + Int4 ** matrix, + int query_length, + BlastScoringParameters* scoring, + Boolean positionBased, + Boolean adjustParameters) +{ + Blast_KarlinBlk* kbp; /* statistical parameters used to + evaluate the significance of + alignment of a query-subject + pair */ + Int4 i, j; + /* YIKES! 
More stuff I don't know how to deal with */ + /* + search->pbp->gap_x_dropoff_final = searchParams->gap_x_dropoff_final; + search->pbp->cutoff_e = searchParams->original_expect_value; + search->pbp->gap_open = searchParams->gap_open; + search->pbp->gap_extend = searchParams->gapExtend; + search->pbp->decline_align = searchParams->gapDecline; + GapAlignBlkDelete(search->gap_align); + search->gap_align = searchParams->orig_gap_align; + search->sbp->kbp_gap = searchParams->orig_kbp_gap_array; + */ + kbp = sbp->kbp_gap[0]; + Blast_KarlinBlkCopy(kbp, searchParams->kbp_gap_orig); + + if(adjustParameters) { + int rows; + if (positionBased) { + rows = query_length; + } else { + rows = BLASTAA_SIZE; + } + for(i = 0; i < rows; i++) { + for(j = 0; j < BLASTAA_SIZE; j++) { + matrix[i][j] = searchParams->origMatrix[i][j]; + } + } } +} + +static void +s_MatrixInfoInit(Blast_MatrixInfo * self, + double localScalingFactor, + BLAST_SequenceBlk* queryBlk, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + BlastScoringParameters* scoringParams, + Boolean positionBased, + const char * matrixName) +{ + Uint1 * query; /* the query sequence */ + int queryLength; + /* Int4 queryLength; */ /* the length of the query sequence */ + double initialUngappedLambda; + + /* YIKES! 
*/ + /* + query = search->context[0].query->sequence; + queryLength = search->context[0].query->length; + */ query = &queryBlk->sequence[0]; queryLength = queryInfo->contexts[0].query_length; - if(positionBased) { - int status = 0; - ASSERT(queryLength == sp->mRows); - ASSERT(queryLength == (Int4)sbp->psi_matrix->pssm->ncols); - sp->startFreqRatios = - getStartFreqRatios(sbp, query, scoringParams->options->matrix, - sbp->psi_matrix->freq_ratios, queryLength); - status = scalePosMatrix(sp->startMatrix, sbp->matrix->data, - scoringParams->options->matrix, - sbp->psi_matrix->freq_ratios, query, - queryLength, sbp); - if (status) { - return 0.0; /* return incorrect value for scalingFactor */ - } - initialUngappedLambda = sbp->kbp_psi[0]->Lambda; + if (self->positionBased) { + /* YIKES! + if(sbp->posFreqs == NULL) { + sbp->posFreqs = + allocatePosFreqs(queryLength, BLASTAA_SIZE); + } + */ + s_GetStartFreqRatios(self->startFreqRatios, query, matrixName, + sbp->psi_matrix->freq_ratios, queryLength, + TRUE); + s_ScalePosMatrix(self->startMatrix, sbp->matrix->data, + matrixName,sbp->psi_matrix->freq_ratios, query, + queryInfo->max_length, sbp); + initialUngappedLambda = sbp->kbp_psi[0]->Lambda; } else { - SFreqRatios* freqRatios = - _PSIMatrixFrequencyRatiosNew(scoringParams->options->matrix); - sp->startFreqRatios = (double**) _PSIAllocateMatrix(sp->mRows, - sp->nCols, - sizeof(double)); - ASSERT(sp->startFreqRatios); - _PSICopyMatrix_double(sp->startFreqRatios, freqRatios->data, - sp->mRows, sp->nCols); - freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios); - initialUngappedLambda = sbp->kbp_ideal->Lambda; + s_GetStartFreqRatios(self->startFreqRatios, query, matrixName, + NULL, BLASTAA_SIZE, FALSE); + initialUngappedLambda = sbp->kbp_ideal->Lambda; } - sp->scaledUngappedLambda = initialUngappedLambda / localScalingFactor; - if(!positionBased) { - computeScaledStandardMatrix(sp->startMatrix, - scoringParams->options->matrix, - sp->scaledUngappedLambda); + 
self->ungappedLambda = initialUngappedLambda / localScalingFactor; + if ( !positionBased ) { + SFreqRatios * freqRatios; /* frequency ratios for the matrix */ + + freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName); + /* + if (freqRatios == NULL) { + ErrPostEx(SEV_FATAL, 1, 0, "blastpgp: Cannot adjust parameters " + "for matrix %s\n", matrixName); + } + */ + Blast_Int4MatrixFromFreq(self->startMatrix, BLASTAA_SIZE, + freqRatios->data, self->ungappedLambda); + freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios); } - kbp = sbp->kbp_gap[0]; - kbp->Lambda /= localScalingFactor; - kbp->logK = log(kbp->K); - } - - return localScalingFactor; + self->matrixName = strdup(matrixName); } -/** LambdaRatioLowerBound is used when the expected score is too large - * causing impalaKarlinLambdaNR to give a Lambda estimate that - * is too small, or to fail entirely returning -1 */ -#define LambdaRatioLowerBound 0.5 -/** - * Adjust the search parameters - * - * @param sp a record of the initial search parameters [in|out] - * @param queryLength length of query sequence [in] - * @param subject data from the subject sequence [in] - * @param matrix a scoring matrix to be adjusted [out] - * @param positionBased is this search position-specific? [in] - * @return scaling-factor to be used. 
- */ -static Int4 -Kappa_AdjustSearch( - Kappa_SearchParameters * sp, - Int4 queryLength, - Kappa_SequenceData * subject, - Int4 ** matrix, - Boolean positionBased) +static void +s_GetQueryInfo(BlastCompo_QueryInfo **pquery, int * pnumQueries, + Uint1 * ccat_query, BlastQueryInfo* queryInfo) { - double LambdaRatio; /* the ratio of the corrected lambda to the - * original lambda */ - if(!sp->adjustParameters) { - LambdaRatio = 1.0; - } else { - /* do adjust the parameters */ - Blast_ScoreFreq* this_sfp; - double correctUngappedLambda; /* new value of ungapped lambda */ - - /* compute and plug in new matrix here */ - Blast_FillResidueProbability(subject->data, subject->length, sp->resProb); - - if(positionBased) { - ASSERT(queryLength == sp->mRows); - this_sfp = - posfillSfp(sp->startMatrix, queryLength, sp->resProb, sp->scoreArray, - sp->return_sfp, kScoreMatrixScoreRange); - } else { - this_sfp = - notposfillSfp(sp->startMatrix, sp->resProb, sp->queryProb, - sp->scoreArray, sp->return_sfp, kScoreMatrixScoreRange); + int query_index; + int numQueries = queryInfo->num_queries; + BlastCompo_QueryInfo * query = calloc(numQueries, + sizeof(BlastCompo_QueryInfo)); + *pnumQueries = numQueries; + *pquery = query; + for (query_index = 0; query_index < numQueries; query_index++) { + query[query_index].eff_search_space = + queryInfo->contexts[query_index].eff_searchsp; } - correctUngappedLambda = - Blast_KarlinLambdaNR(this_sfp, sp->scaledUngappedLambda); - - /* impalaKarlinLambdaNR will return -1 in the case where the - * expected score is >=0; however, because of the MAX statement 3 - * lines below, LambdaRatio should always be > 0; the succeeding - * test is retained as a vestige, in case one wishes to remove the - * MAX statement and allow LambdaRatio to take on the error value - * -1 */ - - LambdaRatio = correctUngappedLambda / sp->scaledUngappedLambda; - LambdaRatio = MIN(1, LambdaRatio); - LambdaRatio = MAX(LambdaRatio, LambdaRatioLowerBound); - - if(LambdaRatio > 0) { 
- scaleMatrix(matrix, sp->startFreqRatios, sp->mRows, - sp->scaledUngappedLambda, LambdaRatio); + for (query_index = 0; query_index < numQueries; query_index++) { + query[query_index].origin = + queryInfo->contexts[query_index].query_offset; + query[query_index].seq.data = &ccat_query[query[query_index].origin]; + query[query_index].seq.length = + queryInfo->contexts[query_index].query_length; + } + for (query_index = 0; query_index < numQueries; query_index++) { + Blast_ReadAaComposition(&query[query_index].composition, + query[query_index].seq.data, + query[query_index].seq.length); } - } - /* end else do adjust the parameters */ - - return LambdaRatio > 0 ? 0 : 1; } -/** - * Restore the parameters that were adjusted to their original values - * @param searchParams a record of the original values [in] - * @param sbp Karlin-Altschul parameters to be restored. [out] - * @param matrix the scoring matrix to be restored [out] - * @param scoring the scoring parameters to be restored [out] - * @param positionBased is this search position-specific? [in] - */ static void -Kappa_RestoreSearch( - Kappa_SearchParameters * searchParams, - BlastScoreBlk* sbp, - Int4 ** matrix, - BlastScoringParameters* scoring, - Boolean positionBased) +s_GappingParamsInit(Blast_GappingParamsContext * context, + BlastCompo_GappingParams * gapping_params, + BlastGapAlignStruct * gap_align, + const BlastScoringParameters* scoring, + BlastScoreBlk* sbp, + double localScalingFactor, + Int4 program_number, + double Lambda) { - if(searchParams->adjustParameters) { - Blast_KarlinBlk* kbp; /* statistical parameters used to - evaluate the significance of - alignment of a query-subject - pair */ - Int4 i, j; /* loop variables. 
*/ - - scoring->gap_open = searchParams->gapOpen; - scoring->gap_extend = searchParams->gapExtend; - scoring->decline_align = searchParams->gapDecline; - scoring->scale_factor = searchParams->scale_factor; + context->gap_align = gap_align; + context->scoringParams = scoring; + context->sbp = sbp; + context->localScalingFactor = localScalingFactor; + context->prog_number = program_number; + + gapping_params->gap_open = scoring->gap_open; + gapping_params->gap_extend = scoring->gap_extend; + gapping_params->decline_align = scoring->decline_align; + /* YIKES! different x-dropoff due to different pass through the + blast code */ + gapping_params->x_dropoff = gap_align->gap_x_dropoff; + gapping_params->context = context; +} - sbp->kbp_gap = searchParams->orig_kbp_gap_array; +static const Blast_RedoAlignCallbacks +redo_align_callbacks = { + s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment, + s_NewAlignmentUsingXdrop +}; + + +static Blast_RedoAlignParams * +s_GetAlignParams(Blast_GappingParamsContext * context, + EBlastProgramType program_number, + BlastGapAlignStruct * gap_align, + BLAST_SequenceBlk * queryBlk, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + BlastScoringParameters* scoringParams, + const BlastExtensionParameters* extendParams, + const BlastHitSavingParameters* hitParams, + const PSIBlastOptions* psiOptions, + const char * matrixName, + double localScalingFactor, + int adjustParameters) +{ + int rows; + int cutoff_s; + double cutoff_e; + BlastCompo_GappingParams * gapping_params = NULL; + Blast_MatrixInfo * scaledMatrixInfo; + Blast_KarlinBlk* kbp; + int subject_is_translated = program_number == eBlastTypeTblastn; + /* YIKES! wrong test for do_link_hsps */ + int do_link_hsps = program_number == eBlastTypeTblastn; + Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE); + + if (do_link_hsps) { + ASSERT( 0 && "Which cutoff needed here?" 
); + /* cutoff_s = search->pbp->cutoff_s2 * localScalingFactor; */ + } else { + /* There is no cutoff score; we consider e-values instead */ + cutoff_s = 0; + } + cutoff_e = hitParams->options->expect_value; + rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE; + scaledMatrixInfo = Blast_MatrixInfoNew(rows, positionBased); + s_MatrixInfoInit(scaledMatrixInfo, localScalingFactor, + queryBlk, queryInfo, sbp, scoringParams, + positionBased, matrixName); kbp = sbp->kbp_gap[0]; - Blast_KarlinBlkCopy(kbp, searchParams->kbp_gap_orig); + gapping_params = malloc(sizeof(BlastCompo_GappingParams)); + s_GappingParamsInit(context, gapping_params, gap_align, scoringParams, + sbp, localScalingFactor, program_number, + kbp->Lambda); + return + Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params, + adjustParameters, positionBased, + subject_is_translated, + queryInfo->max_length, cutoff_s, cutoff_e, + do_link_hsps, kbp->Lambda, kbp->logK, + &redo_align_callbacks); +} + - for(i = 0; i < searchParams->mRows; i++) { - for(j = 0; j < BLASTAA_SIZE; j++) { - matrix[i][j] = searchParams->origMatrix[i][j]; - } +/** + * Convert a BlastCompo_Heap to a flat list of SeqAligns. Note that + * there may be more than one alignment per element in the heap. The + * new list preserves the order of the SeqAligns associated with each + * HeapRecord. 
(@todo this function is named as it is for + * compatibility with kappa.c, rename in the future) + * + * @param self a BlastCompo_Heap + * @param results BLAST core external results structure (pre-SeqAlign) + * [out] + * @param hitlist_size size of each list in the results structure above [in] + */ +static void +s_HeapToFlatList(BlastCompo_Heap * self, BlastHSPResults * results, + Int4 hitlist_size) +{ + BlastHSPList* hsp_list; + BlastHitList* hitlist = + results->hitlist_array[0] = Blast_HitListNew(hitlist_size); + + hsp_list = NULL; + while (NULL != (hsp_list = BlastCompo_HeapPop(self))) { + Blast_HitListUpdate(hitlist, hsp_list); } - } } + +/** + * Top level routine to recompute alignments for each + * match found by the gapped BLAST algorithm + * + * @param search is the structure with all the information about + * the search + * @param options is used to pass certain command line options + * taken in by BLAST + * @param hitlist_count is the number of old matches + * @param adjustParameters determines whether we are to adjust the + * Karlin-Altschul parameters and score matrix + * @param SmithWaterman determines whether the new local alignments + * should be computed by the optimal Smith-Waterman + * algorithm; SmithWaterman false means that + * alignments will be recomputed by the current + * X-drop algorithm as implemented in the procedure + * ALIGN. + * @return a array of lists of SeqAlign; each element + * in the array is a list of SeqAligns for + * one query in the concatenated query. + * It is assumed that at least one of adjustParameters and + * SmithWaterman is >0 or true when this procedure is called A linked list + * of alignments is returned; the alignments are sorted according to + * the lowest E-value of the best alignment for each matching + * sequence; alignments for the same matching sequence are in the + * list consecutively regardless of the E-value of the secondary + * alignments. 
Ties in sorted order are much rarer than for the + * standard BLAST method, but are broken deterministically based on + * the index of the matching sequences in the database. + */ Int2 -Kappa_RedoAlignmentCore(EBlastProgramType program_number, - BLAST_SequenceBlk * queryBlk, - BlastQueryInfo* queryInfo, - BlastScoreBlk* sbp, - BlastHSPStream* hsp_stream, - const BlastSeqSrc* seqSrc, - const Uint1* gen_code_string, - BlastScoringParameters* scoringParams, - const BlastExtensionParameters* extendParams, - const BlastHitSavingParameters* hitParams, - const PSIBlastOptions* psiOptions, - BlastHSPResults* results) +Blast_RedoAlignmentCore(EBlastProgramType program_number, + BLAST_SequenceBlk * queryBlk, + BlastQueryInfo* queryInfo, + BlastScoreBlk* sbp, + BlastHSPStream* hsp_stream, + const BlastSeqSrc* seqSrc, + const Uint1* gen_code_string, + BlastScoringParameters* scoringParams, + const BlastExtensionParameters* extendParams, + const BlastHitSavingParameters* hitParams, + const PSIBlastOptions* psiOptions, + BlastHSPResults* results) { - Int4 cutoff_s = 0; /* minimum score that must be achieved - by a newly-computed alignment */ - Boolean do_link_hsps; /* if true, use BlastLinkHsps to - compute e-values */ - Kappa_SequenceData query; /* data for the query sequence */ - double localScalingFactor; /* the factor by which to - * scale the scoring system in - * order to obtain greater - * precision */ - - Int4** matrix = NULL; /* score matrix */ - Blast_KarlinBlk* kbp; /* stores Karlin-Altschul parameters */ - Kappa_SearchParameters *searchParams; /* the values of the search - * parameters that will be - * recorded, altered in the - * search structure in this - * routine, and then restored - * before the routine - * exits. 
*/ - Kappa_ForbiddenRanges forbidden; /* forbidden ranges for each - * database position (used in - * Smith-Waterman alignments) */ - SWheap significantMatches; /* a collection of alignments of the - * query sequence with sequences from - * the database */ - Kappa_WindowInfo ** windows; /* windows containing HSPs for - * a single query-subject pair */ - Int4 nWindows; /* number of windows in the array - * "windows" */ - Int4 lWindows; /* allocated size of "windows" */ - Int4 window_index; /* window index for use in loops */ - int status_code; /* status code of any routine that - returns one */ - - BlastGapAlignStruct* gapAlign; /* keeps track of gapped - alignment params */ - Boolean SmithWaterman; /* Perform Smith-Waterman alignments? */ - /* is this search position-specific? */ - Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE); - Boolean adjustParameters; /* Use composition based statistics? */ - BlastHSPList* thisMatch = NULL; /* alignment data for the - * current query-subject - * match */ - - double inclusion_ethresh; /* All alignments above this value will be - reported, no matter how many. */ - - if (program_number != eBlastTypeBlastp && - program_number != eBlastTypePsiBlast && - program_number != eBlastTypePhiBlastp) { /* tblastn ported but not fully - implemented */ - return BLASTERR_REDOALIGNMENTCORE_NOTSUPPORTED; - } - - inclusion_ethresh = - (psiOptions != NULL) ? 
psiOptions->inclusion_ethresh : 0; - - adjustParameters = extendParams->options->compositionBasedStats; - - if (extendParams->options->eTbackExt == eSmithWatermanTbck) - SmithWaterman = TRUE; - else - SmithWaterman = FALSE; - - if ((status_code=BLAST_GapAlignStructNew(scoringParams, extendParams, - BlastSeqSrcGetMaxSeqLen(seqSrc), sbp, &gapAlign)) != 0) - return status_code; - - /* Initialize the window list to have a single window -- the most - common case */ - lWindows = 1; nWindows = 1; - windows = calloc(lWindows, sizeof(Kappa_WindowInfo *)); - windows[0] = malloc(sizeof(Kappa_WindowInfo)); - - SWheapInitialize(&significantMatches, hitParams->options->hitlist_size, - inclusion_ethresh); - - /**** Validate parameters *************/ - if(0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") && - !adjustParameters) { - return 0; /* BLOSUM62_20 only makes sense if - * adjustParameters is on */ - } - /*****************/ - query.data = &queryBlk->sequence[0]; - query.length = queryInfo->contexts[0].query_length; - - if(SmithWaterman) { - Kappa_ForbiddenRangesInitialize(&forbidden, query.length); - } - - if(positionBased) { - ASSERT(program_number == eBlastTypePsiBlast); - matrix = sbp->psi_matrix->pssm->data; - ASSERT( matrix != NULL ); - - if(sbp->psi_matrix->freq_ratios == NULL) { - sbp->psi_matrix->freq_ratios = - (double**) _PSIAllocateMatrix(query.length, BLASTAA_SIZE, - sizeof(double)); + double localScalingFactor; /* the factor by which to + * scale the scoring system in + * order to obtain greater + * precision */ + Int4 **matrix; /* score matrix */ + s_SearchParameters *searchParams; /* the values of the search + * parameters that will be + * recorded, altered in the + * search structure in this + * routine, and then restored + * before the routine + * exits. 
*/ + Blast_ForbiddenRanges forbidden; /* forbidden ranges for each + * database position (used + * in Smith-Waterman + * alignments) + */ + BlastCompo_Heap * redoneMatches; /* a collection of alignments + * for each query sequence with + * sequences from the + * database */ + Blast_CompositionWorkspace + *NRrecord = NULL; /* stores all fields needed for + * computing a compositionally adjusted + * score matrix using Newton's method */ + Int4 query_index; /* loop index */ + Int4 numQueries; /* number of queries in the + concatenated query */ + BlastGapAlignStruct* gapAlign; /* keeps track of gapped + alignment params */ + double inclusion_ethresh; /* All alignments above this value will be + reported, no matter how many. */ + BlastCompo_QueryInfo * query_info = NULL; + Blast_RedoAlignParams * redo_align_params; + Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE); + Boolean adjustParameters = extendParams->options->compositionBasedStats; + Boolean SmithWaterman; + int status_code; + BlastHSPList* thisMatch = NULL; /* alignment data for the + * current query-subject + * match */ + BlastCompo_Alignment * incoming_aligns; /* existing algnments + for a match */ + Blast_GappingParamsContext gapping_params_context; + int do_link_hsps; + + /**** Validate parameters *************/ + if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") && + !adjustParameters) { + return 0; /* BLOSUM62_20 only makes sense if + * adjustParameters is on */ } - } else { - matrix = sbp->matrix->data; - } - kbp = sbp->kbp_gap[0]; - - /* Initialize searchParams */ - searchParams = - Kappa_SearchParametersNew(query.length, adjustParameters, positionBased); - Kappa_RecordInitialSearch(searchParams, queryBlk, queryInfo, sbp, - scoringParams, positionBased); - localScalingFactor = Kappa_RescaleSearch(searchParams, queryBlk, queryInfo, - sbp, scoringParams, positionBased); - ASSERT(localScalingFactor != 0.0); - - - do_link_hsps = program_number == eBlastTypeTblastn; - if(do_link_hsps) { 
- ASSERT( 0 && "Which cutoff needed here?" ); - /* cutoff_s = search->pbp->cutoff_s2 * localScalingFactor; */ - } else { - /* There is no cutoff score; we consider e-values instead */ - cutoff_s = 0; - } - while (BlastHSPStreamRead(hsp_stream, &thisMatch) != kBlastHSPStream_Eof) { - /* for all matching sequences */ - Kappa_MatchingSequence matchingSeq; /* the data for a matching - * database sequence */ - Int4 * window_of_hsp; /* index of each HSP in the - * array "windows" */ - Kappa_WindowInfo * window; /* current window in the - * subject sequence */ - Kappa_DistinctAlignment * alignments; /* list of alignments for this - * query-subject pair */ - alignments = NULL; - - if(thisMatch->hsp_array == NULL) { - continue; + if (positionBased) { + adjustParameters = adjustParameters ? 1 : 0; } - - if(SWheapWillAcceptOnlyBelowCutoff(&significantMatches)) { - /* Only matches with evalue <= options->ethresh will be saved */ - - /* e-value for a sequence is the smallest e-value among the HSPs - * matching a region of the sequence to the query */ - double minEvalue = thisMatch->best_evalue; - if(minEvalue > (EVALUE_STRETCH * inclusion_ethresh)) { - /* This match is likely to have an evalue > options->ethresh - * and therefore, we assume that all other matches with higher - * input e-values are also unlikely to get sufficient - * improvement in a redone alignment */ - break; - } + if (extendParams->options->eTbackExt == eSmithWatermanTbck) { + SmithWaterman = TRUE; + } else { + SmithWaterman = FALSE; } - /* Get the sequence for this match */ - Kappa_MatchingSequenceInitialize(&matchingSeq, program_number, - seqSrc, gen_code_string, thisMatch->oid); - - window_of_hsp = calloc(thisMatch->hspcnt, sizeof(Int4)); - if(program_number == eBlastTypeTblastn) { - /* Find the multiple translation windows used by tblastn queries. 
*/ - WindowsFromHSPs(thisMatch->hsp_array, thisMatch->hspcnt, - KAPPA_WINDOW_BORDER, matchingSeq.length, - &windows, &nWindows, &lWindows, window_of_hsp); - } else { /* the program is not tblastn, i.e. it is blastp */ - /* Initialize the single window used by blastp queries. */ - windows[0]->frame = 0; - windows[0]->hspcnt = thisMatch->hspcnt; - windows[0]->begin = 0; - windows[0]->end = matchingSeq.length; - } /* else the program is blastp */ - if(SmithWaterman) { - /* We are performing a Smith-Waterman alignment */ - for(window_index = 0; window_index < nWindows; window_index++) { - /* for all window */ - Kappa_SequenceData subject; /* sequence data for this window */ - - window = windows[window_index]; - Kappa_SequenceGetWindow( &matchingSeq, window, &subject ); - - if(0 == - Kappa_AdjustSearch(searchParams, query.length, &subject, matrix, - positionBased)) { - /* Kappa_AdjustSearch ran without error; compute the new - alignments. */ - Int4 aSwScore; /* score computed by the - * Smith-Waterman algorithm. */ - Boolean alignment_is_significant; /* True if the score/evalue of - * the Smith-Waterman alignment - * is significant. */ - Kappa_ForbiddenRangesClear(&forbidden); - do { - double newSwEvalue; /* evalue as computed by the - * Smith-Waterman algorithm */ - Int4 matchEnd, queryEnd; /* end points of the alignments - * computed by the Smith-Waterman - * algorithm. 
*/ - newSwEvalue = - SmithWatermanScoreOnly(&subject, &query, matrix, - scoringParams->gap_open, - scoringParams->gap_extend, - &matchEnd, &queryEnd, &aSwScore, kbp, - queryInfo->contexts[0].eff_searchsp, - positionBased, - &forbidden); - if( do_link_hsps ) { - alignment_is_significant = aSwScore >= cutoff_s; - } else { - alignment_is_significant = - newSwEvalue < hitParams->options->expect_value; - if( alignments == NULL ) { - /* this is the most significant alignment; if it will not - * be accepted, no alignments from this match will */ - alignment_is_significant = - alignment_is_significant && - SWheapWouldInsert(&significantMatches, newSwEvalue, - aSwScore, thisMatch->oid); - } - } - - if(alignment_is_significant) { - Int4 matchStart, queryStart; /* the start of the - * alignment in the - * match/query sequence */ - - SmithWatermanFindStart(&subject, &query, matrix, - scoringParams->gap_open, - scoringParams->gap_extend, - matchEnd, queryEnd, aSwScore, - &matchStart, &queryStart, - positionBased, &forbidden); - - gapAlign->gap_x_dropoff = - (Int4) (extendParams->gap_x_dropoff_final * - NCBIMATH_LN2 / kbp->Lambda); - - alignments = - NewAlignmentUsingXdrop(&query, queryStart, queryEnd, - &subject, matchStart, matchEnd, - aSwScore, window, - gapAlign, scoringParams, - localScalingFactor, - program_number, alignments); - - Kappa_ForbiddenRangesPush(&forbidden, - queryStart, - alignments->queryEnd - queryStart, - matchStart, - alignments->matchEnd - matchStart); - } - /* end if the next local alignment is significant */ - } while(alignment_is_significant && window->hspcnt > 1); - /* end do..while the next local alignment is significant, and - * the original blast search found more than one alignment. */ - } /* end if Kappa_AdjustSearch ran without error. */ - Kappa_SequenceDataRelease(&subject); - } /* end for all windows */ + inclusion_ethresh = + (psiOptions != NULL) ? 
psiOptions->inclusion_ethresh : 0; + + /*****************/ + /* Initialize searchParams */ + searchParams = + s_SearchParametersNew(queryInfo->max_length, adjustParameters, + positionBased); + s_RecordInitialSearch(searchParams, queryBlk, queryInfo, sbp, + scoringParams, queryInfo->max_length, + adjustParameters, positionBased); + if (adjustParameters) { + if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) { + localScalingFactor = SCALING_FACTOR / 10; + } else { + localScalingFactor = SCALING_FACTOR; + } } else { - /* else we are not performing a Smith-Waterman alignment */ - Int4 hsp_index; - /* data for the current window */ - Kappa_SequenceData subject = {NULL,0,NULL}; - window_index = -1; /* -1 indicates that sequence data has - * not been obtained for any window in - * the list. */ - window = NULL; - - for(hsp_index = 0; hsp_index < thisMatch->hspcnt; hsp_index++) { - /* for all HSPs in thisMatch */ - if(!isAlreadyContained(thisMatch->hsp_array[hsp_index], alignments, - kbp->Lambda, localScalingFactor)) { - Kappa_DistinctAlignment * newAlign; /* the new alignment */ - Boolean adjust_search_failed = FALSE; /* if true, AdjustSearch was - * called and failed. */ - if( window_index != window_of_hsp[hsp_index] ) { - /* The current window doesn't contain this HSP. 
*/ - Kappa_SequenceDataRelease(&subject); - - window_index = window_of_hsp[hsp_index]; - window = windows[window_index]; - Kappa_SequenceGetWindow(&matchingSeq, window, &subject); - - adjust_search_failed = - Kappa_AdjustSearch(searchParams, query.length, &subject, matrix, - positionBased); - } /* end if the current window doesn't contain this HSP */ - if(!adjust_search_failed) { - Int4 q_start, s_start; - - StartingPointForHit(&q_start, &s_start, sbp, positionBased, - thisMatch->hsp_array[hsp_index], - window, &query, &subject); - - if (positionBased) { - /* We don't use the scaled Lambda because we loose precision */ - gapAlign->gap_x_dropoff = - (Int4) (extendParams->options->gap_x_dropoff_final * - NCBIMATH_LN2 / - searchParams->kbp_gap_orig->Lambda*localScalingFactor); - } else { - /* Lambda is already scaled */ - gapAlign->gap_x_dropoff = - (Int4) (extendParams->options->gap_x_dropoff_final * - NCBIMATH_LN2 / kbp->Lambda); - } - BLAST_GappedAlignmentWithTraceback(program_number, - query.data, subject.data, - gapAlign, scoringParams, - q_start, s_start, - query.length, subject.length); - - newAlign = NewAlignmentFromGapAlign(gapAlign, window); - withDistinctEnds(&newAlign, &alignments); - } /* end if adjust search failed */ - } /* end if not isAlreadyContained */ - } /* for all HSPs in thisMatch */ - Kappa_SequenceDataRelease(&subject); - } /* end else we are not performing a Smith-Waterman alignment */ - sfree(window_of_hsp); - - if( alignments != NULL) { /* alignments were found */ - BlastHSPList * hsp_list; /* a hitlist containing the newly-computed - * alignments */ - double bestEvalue; /* best evalue among alignments in the hitlist */ - Int4 bestScore; /* best score among alignments in the hitlist */ - - hsp_list = s_HSPListFromDistinctAlignments(&alignments, - matchingSeq.index); - - if(hsp_list->hspcnt > 1) { /* if there is more than one HSP, */ - /* then eliminate HSPs that are contained in a higher-scoring HSP. 
*/ - if(!SmithWaterman || nWindows > 1) { - /* For SmithWaterman alignments in a single window, the - * forbidden ranges rule does not allow one alignment to be - * contained in another, so the call to HitlistReapContained - * is not needed. */ - qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP *), - ScoreCompareHSPs); - HitlistReapContained(hsp_list->hsp_array, &hsp_list->hspcnt); + localScalingFactor = 1.0; + } + s_RescaleSearch(searchParams, queryBlk, queryInfo, sbp, scoringParams, + localScalingFactor, positionBased); + /********/ + if (positionBased) { + matrix = sbp->psi_matrix->pssm->data; + if ( !matrix ) { + /* YIKES! error return + Char* msg = + "Cannot perform position-specific search without a PSSM"; + BlastConstructErrorMessage("RedoAlignmentCore", msg, 3, + &(search->error_return)); + return NULL; + */ } - } - - if(do_link_hsps) { - BLAST_LinkHsps(program_number, hsp_list, - queryInfo, matchingSeq.length, - sbp, hitParams->link_hsp_params, TRUE); - } else { - Blast_HSPListGetEvalues(queryInfo, hsp_list, TRUE, sbp, - 0.0, /* use a non-zero gap decay only when - linking hsps */ - 1.0); /* Use scaling factor equal to 1, because - both scores and Lambda are scaled, so - they will cancel each other. 
*/ - } - bestEvalue = hsp_list->best_evalue; - bestScore = hsp_list->hsp_array[0]->score; - - if(bestEvalue <= hitParams->options->expect_value && - SWheapWouldInsert(&significantMatches, bestEvalue, - bestScore, thisMatch->oid)) { - /* If the best alignment is significant, then save the current list */ - - Blast_HSPListReapByEvalue(hsp_list, hitParams->options); - - s_HSPListRescaleScores(hsp_list, kbp->Lambda, kbp->logK, - localScalingFactor); - - SWheapInsert(&significantMatches, hsp_list, bestEvalue, bestScore, - thisMatch->oid); - } else { /* the best alignment is not significant */ - Blast_HSPListFree(hsp_list); - } /* end else the best alignment is not significant */ - } /* end if any alignments were found */ - - Kappa_MatchingSequenceRelease(&matchingSeq); - thisMatch = Blast_HSPListFree(thisMatch); - } - /* end for all matching sequences */ - SWheapToFlatList( &significantMatches, results, - hitParams->options->hitlist_size ); - /* Clean up */ - for( window_index = 0; window_index < nWindows; window_index++ ) { - sfree(windows[window_index]); - } - sfree(windows); - SWheapRelease(&significantMatches); - if(SmithWaterman) Kappa_ForbiddenRangesRelease(&forbidden); - gapAlign = BLAST_GapAlignStructFree(gapAlign); - - Kappa_RestoreSearch(searchParams, sbp, matrix, scoringParams, positionBased); - Kappa_SearchParametersFree(&searchParams); - - return 0; + } else { + matrix = sbp->matrix->data; + } + if ((status_code=BLAST_GapAlignStructNew(scoringParams, + extendParams, + BlastSeqSrcGetMaxSeqLen(seqSrc), + sbp, &gapAlign)) != 0) { + return status_code; + } + gapAlign->gap_x_dropoff = + extendParams->gap_x_dropoff_final * localScalingFactor; + redo_align_params = + s_GetAlignParams(&gapping_params_context, program_number, + gapAlign, queryBlk, queryInfo, + sbp, scoringParams, extendParams, hitParams, + psiOptions, scoringParams->options->matrix, + localScalingFactor, adjustParameters); + do_link_hsps = redo_align_params->do_link_hsps; + + 
s_GetQueryInfo(&query_info, &numQueries, queryBlk->sequence, queryInfo); + if(SmithWaterman) { + Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length); + } + redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap)); + for (query_index = 0; query_index < numQueries; query_index++) { + BlastCompo_HeapInitialize(&redoneMatches[query_index], + hitParams->options->hitlist_size, + inclusion_ethresh); + } + if( adjustParameters > 1 && !positionBased ) { + NRrecord = Blast_CompositionWorkspaceNew(); + Blast_CompositionWorkspaceInit(NRrecord, + scoringParams->options->matrix); + } + while (BlastHSPStreamRead(hsp_stream, &thisMatch) != kBlastHSPStream_Eof) { + /* for all matching sequences */ + BlastCompo_MatchingSequence matchingSeq; /* the data for a matching + * database sequence */ + BlastCompo_Alignment ** alignments; /* array of lists of + * alignments for each + * query to this subject */ + alignments = calloc(numQueries, sizeof(BlastCompo_Alignment *)); + + if(thisMatch->hsp_array == NULL) { + continue; + } + if (BlastCompo_EarlyTermination(thisMatch->best_evalue, + redoneMatches, numQueries)) { + break; + } + /* Get the sequence for this match */ + s_MatchingSequenceInitialize(&matchingSeq, program_number, + seqSrc, gen_code_string, thisMatch->oid); + incoming_aligns = + s_ResultHspToDistinctAlign(queryInfo, thisMatch->hsp_array, + thisMatch->hspcnt, localScalingFactor); + if (SmithWaterman) { + Blast_RedoOneMatchSmithWaterman(alignments, + redo_align_params, + incoming_aligns, + thisMatch->hspcnt, + &matchingSeq, query_info, + numQueries, matrix, + NRrecord, &forbidden, + redoneMatches); + } else { + Blast_RedoOneMatch(alignments, redo_align_params, + incoming_aligns, thisMatch->hspcnt, + &matchingSeq, queryInfo->max_length, + query_info, numQueries, matrix, + NRrecord); + } + for (query_index = 0; query_index < numQueries; query_index++) { + /* Loop over queries */ + if( alignments[query_index] != NULL) { /* alignments were found */ + double 
bestEvalue; /* best evalue among alignments in the + hitlist */ + Int4 bestScore; /* best score among alignments in + the hitlist */ + BlastHSPList * hsp_list; /* a hitlist containing the + * newly-computed alignments */ + void * discardedAligns; + hsp_list = + s_HSPListFromDistinctAlignments(&alignments[query_index], + matchingSeq.index); + if (hsp_list->hspcnt > 1) { + s_HitlistReapContained(hsp_list->hsp_array, + &hsp_list->hspcnt); + } + s_HitlistEvaluateAndPurge(&bestScore, &bestEvalue, + hsp_list, + matchingSeq.length, + program_number, queryInfo, + sbp, hitParams, + do_link_hsps); + if (bestEvalue <= hitParams->options->expect_value && + BlastCompo_HeapWouldInsert(&redoneMatches[query_index], + bestEvalue, bestScore, + thisMatch->oid)) { + s_HSPListRescaleScores(hsp_list, redo_align_params->Lambda, + redo_align_params->logK, + localScalingFactor); + + BlastCompo_HeapInsert(&redoneMatches[query_index], + hsp_list, bestEvalue, + bestScore, thisMatch->oid, + &discardedAligns); + if (discardedAligns != NULL) { + Blast_HSPListFree(discardedAligns); + } + } else { /* the best alignment is not significant */ + Blast_HSPListFree(hsp_list); + } /* end if the best alignment is significant */ + } /* end if any alignments were found */ + } /* end loop over queries */ + s_MatchingSequenceRelease(&matchingSeq); + thisMatch = Blast_HSPListFree(thisMatch); + sfree(alignments); + BlastCompo_AlignmentsFree(&incoming_aligns, NULL); + } + /* end for all matching sequences */ + /* YIKES! 
handle multiple queries + for (query_index = 0; query_index < numQueries; query_index++) { + results[query_index] = + BlastCompo_HeapToFlatList(&redoneMatches[query_index]); + } + */ + s_HeapToFlatList(&redoneMatches[0], results, + hitParams->options->hitlist_size); + /* Clean up */ + free(query_info); + Blast_RedoAlignParamsFree(&redo_align_params); + for (query_index = 0; query_index < numQueries; query_index++) { + BlastCompo_HeapRelease(&redoneMatches[query_index]); + } + sfree(redoneMatches); redoneMatches = NULL; + if(SmithWaterman) { + Blast_ForbiddenRangesRelease(&forbidden); + } + gapAlign = BLAST_GapAlignStructFree(gapAlign); + s_RestoreSearch(searchParams, sbp, matrix, queryInfo->max_length, + scoringParams, positionBased, adjustParameters); + s_SearchParametersFree(&searchParams); + if (NULL != NRrecord) { + Blast_CompositionWorkspaceFree(&NRrecord); + } + return 0; } diff --git a/algo/blast/core/blast_kappa.h b/algo/blast/core/blast_kappa.h index d60f509e..3b6a79c1 100644 --- a/algo/blast/core/blast_kappa.h +++ b/algo/blast/core/blast_kappa.h @@ -1,4 +1,4 @@ -/* $Id: blast_kappa.h,v 1.9 2004/11/23 21:46:03 camacho Exp $ +/* $Id: blast_kappa.h,v 1.10 2005/12/01 14:47:40 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -68,7 +68,7 @@ extern "C" { */ Int2 -Kappa_RedoAlignmentCore(EBlastProgramType program_number, +Blast_RedoAlignmentCore(EBlastProgramType program_number, BLAST_SequenceBlk * queryBlk, BlastQueryInfo* query_info, BlastScoreBlk* sbp, @@ -90,6 +90,9 @@ Kappa_RedoAlignmentCore(EBlastProgramType program_number, * =========================================================================== * * $Log: blast_kappa.h,v $ + * Revision 1.10 2005/12/01 14:47:40 madden + * Renamed Kappa_RedoAlignmentCore as Blast_RedoAlignmentCore + * * Revision 1.9 2004/11/23 21:46:03 camacho * Brought up to date with current version of kappa.c [by Mike Gertz] * diff --git 
a/algo/blast/core/blast_lookup.c b/algo/blast/core/blast_lookup.c index 61a24cf7..6a71a036 100644 --- a/algo/blast/core/blast_lookup.c +++ b/algo/blast/core/blast_lookup.c @@ -1,4 +1,4 @@ -/* $Id: blast_lookup.c,v 1.43 2005/08/02 21:20:26 coulouri Exp $ +/* $Id: blast_lookup.c,v 1.44 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -43,7 +43,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_lookup.c,v 1.43 2005/08/02 21:20:26 coulouri Exp $"; + "$Id: blast_lookup.c,v 1.44 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /** Structure containing information needed for adding neighboring words. diff --git a/algo/blast/core/blast_lookup.h b/algo/blast/core/blast_lookup.h index 28d71edd..a1c283ca 100644 --- a/algo/blast/core/blast_lookup.h +++ b/algo/blast/core/blast_lookup.h @@ -1,4 +1,4 @@ -/* $Id: blast_lookup.h,v 1.25 2005/07/27 19:11:33 camacho Exp $ +/* $Id: blast_lookup.h,v 1.26 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_message.c b/algo/blast/core/blast_message.c index e43dfeba..92e10de6 100644 --- a/algo/blast/core/blast_message.c +++ b/algo/blast/core/blast_message.c @@ -1,4 +1,4 @@ -/* $Id: blast_message.c,v 1.18 2005/06/20 13:09:36 madden Exp $ +/* $Id: blast_message.c,v 1.19 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -31,7 +31,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_message.c,v 1.18 2005/06/20 13:09:36 madden Exp $"; + "$Id: blast_message.c,v 1.19 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_message.h> @@ -113,6 +113,9 @@ Blast_Perror(Int2 error_code) * =========================================================================== * * $Log: blast_message.c,v $ + * Revision 1.19 2005/11/16 14:27:03 madden + * Fix spelling in CRN + * * Revision 1.18 2005/06/20 13:09:36 madden * Rename BlastSeverity enums in line with C++ tookit convention * diff --git a/algo/blast/core/blast_options.c b/algo/blast/core/blast_options.c index 66ef1897..2bb5d938 100644 --- a/algo/blast/core/blast_options.c +++ b/algo/blast/core/blast_options.c @@ -1,4 +1,4 @@ -/* $Id: blast_options.c,v 1.171 2005/06/24 12:15:40 madden Exp $ +/* $Id: blast_options.c,v 1.175 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_options.c,v 1.171 2005/06/24 12:15:40 madden Exp $"; + "$Id: blast_options.c,v 1.175 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_options.h> @@ -552,9 +552,9 @@ BLAST_FillScoringOptions(BlastScoringOptions* options, options->gap_extend = BLAST_GAP_EXTN_NUCL; } } - if (gap_open) + if (gap_open >= 0) options->gap_open = gap_open; - if (gap_extend) + if (gap_extend >= 0) options->gap_extend = gap_extend; return 0; @@ -589,7 +589,7 @@ BlastScoringOptionsValidate(EBlastProgramType program_number, "BLASTN penalty must be negative"); return (Int2) code; } - if (options->gap_open > 0 && options->gap_extend == 0) + if (options->gapped_calculation && options->gap_open > 0 && options->gap_extend == 0) { Int4 code=2; Int4 subcode=1; @@ -601,12 +601,13 @@ BlastScoringOptionsValidate(EBlastProgramType program_number, } else { - Int2 status=0; - - if ((status=Blast_KarlinBlkGappedLoadFromTables(NULL, options->gap_open, - options->gap_extend, options->decline_align, + if (options->gapped_calculation && !Blast_ProgramIsRpsBlast(program_number)) + { + Int2 status=0; + if ((status=Blast_KarlinBlkGappedLoadFromTables(NULL, options->gap_open, + options->gap_extend, options->decline_align, options->matrix)) != 0) - { + { if (status == 1) { char* buffer; @@ -634,8 +635,8 @@ BlastScoringOptionsValidate(EBlastProgramType program_number, sfree(buffer); return (Int2) code; } - } - + } + } } if (program_number != eBlastTypeBlastx && @@ -832,6 +833,66 @@ BLAST_FillLookupTableOptions(LookupTableOptions* options, return 0; } +Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number, const char* matrixName, Int4* threshold) +{ + + const Int4 kB62_threshold = 11; + + if (program_number == eBlastTypeBlastn) + return 0; + + if (matrixName == NULL) + return -1; + + if(strcasecmp(matrixName, "BLOSUM62") == 0) + *threshold = kB62_threshold; + 
else if(strcasecmp(matrixName, "BLOSUM45") == 0) + *threshold = 14; + else if(strcasecmp(matrixName, "BLOSUM62_20") == 0) + *threshold = 100; + else if(strcasecmp(matrixName, "BLOSUM80") == 0) + *threshold = 12; + else if(strcasecmp(matrixName, "PAM30") == 0) + *threshold = 16; + else if(strcasecmp(matrixName, "PAM70") == 0) + *threshold = 14; + else + *threshold = kB62_threshold; + + if (Blast_SubjectIsTranslated(program_number) == TRUE) + *threshold += 2; /* Covers tblastn, tblastx, psi-tblastn rpstblastn. */ + else if (Blast_QueryIsTranslated(program_number) == TRUE) + *threshold += 1; + + return 0; +} + +Int2 BLAST_GetSuggestedWindowSize(EBlastProgramType program_number, const char* matrixName, Int4* window_size) +{ + const Int4 kB62_windowsize = 40; + + if (program_number == eBlastTypeBlastn) + return 0; + + if (matrixName == NULL) + return -1; + + if(strcasecmp(matrixName, "BLOSUM62") == 0) + *window_size = kB62_windowsize; + else if(strcasecmp(matrixName, "BLOSUM45") == 0) + *window_size = 60; + else if(strcasecmp(matrixName, "BLOSUM80") == 0) + *window_size = 25; + else if(strcasecmp(matrixName, "PAM30") == 0) + *window_size = 15; + else if(strcasecmp(matrixName, "PAM70") == 0) + *window_size = 20; + else + *window_size = kB62_windowsize; + + return 0; +} + /** Validate options for the discontiguous word megablast * Word size must be 11 or 12; template length 16, 18 or 21; * template type 0, 1 or 2. 
@@ -1204,6 +1265,18 @@ Int2 BLAST_ValidateOptions(EBlastProgramType program_number, * =========================================================================== * * $Log: blast_options.c,v $ + * Revision 1.175 2005/11/16 14:27:03 madden + * Fix spelling in CRN + * + * Revision 1.174 2005/10/18 15:19:04 madden + * Exclude rpsblast from validation of gap parameters + * + * Revision 1.173 2005/10/17 14:03:34 madden + * Change convention for unset gap parameters from zero to negative number + * + * Revision 1.172 2005/08/29 13:51:44 madden + * Add functions BLAST_GetSuggestedThreshold and BLAST_GetSuggestedWindowSize + * * Revision 1.171 2005/06/24 12:15:40 madden * Add protection against NULL pointers in options free functons * diff --git a/algo/blast/core/blast_options.h b/algo/blast/core/blast_options.h index a68b58e4..96abb605 100644 --- a/algo/blast/core/blast_options.h +++ b/algo/blast/core/blast_options.h @@ -1,4 +1,4 @@ -/* $Id: blast_options.h,v 1.121 2005/06/02 16:18:05 camacho Exp $ +/* $Id: blast_options.h,v 1.125 2005/11/29 17:27:40 camacho Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -316,11 +316,6 @@ typedef struct BlastHitSavingOptions { Int4 culling_limit; /**< If the query range of an HSP is contained in at least this many higher-scoring HSPs, throw away the HSP as redundant (turned off if zero) */ - /* PSI-BLAST Hit saving options */ - Int4 required_start; /**< Start of the region required to be part of the - alignment */ - Int4 required_end; /**< End of the region required to be part of the - alignment */ /********************************************************************/ /* Merge all these in a structure for clarity? */ @@ -870,6 +865,30 @@ Int2 BLAST_ValidateOptions(EBlastProgramType program_number, Blast_Message* *blast_msg); + +/** Get thresholds for word-finding suggested by Stephen Altschul. + * + * @param program_number Type of blast program: blastn, blastp, blastx, + * tblastn, tblastx) [in] + * @param matrixName matrix, e.g., BLOSUM62 [in] + * @param threshold returns suggested value [in|out] + * @return zero on success + */ +Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number, + const char* matrixName, + Int4* threshold); + +/** Get window sizes for two hit algorithm suggested by Stephen Altschul. 
+ * + * @param program_number Type of blast program: blastn, blastp, blastx, + * tblastn, tblastx) [in] + * @param matrixName matrix, e.g., BLOSUM62 [in] + * @param window_size returns suggested value [in|out] + * @return zero on success + */ +Int2 BLAST_GetSuggestedWindowSize(EBlastProgramType program_number, + const char* matrixName, + Int4* window_size); #ifdef __cplusplus } #endif diff --git a/algo/blast/core/blast_parameters.c b/algo/blast/core/blast_parameters.c index 8fd4279e..33ebe347 100644 --- a/algo/blast/core/blast_parameters.c +++ b/algo/blast/core/blast_parameters.c @@ -1,4 +1,4 @@ -/* $Id: blast_parameters.c,v 1.10 2005/06/08 17:27:53 madden Exp $ +/* $Id: blast_parameters.c,v 1.12 2005/11/16 14:27:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -30,7 +30,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_parameters.c,v 1.10 2005/06/08 17:27:53 madden Exp $"; + "$Id: blast_parameters.c,v 1.12 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_parameters.h> @@ -83,23 +83,27 @@ s_BlastFindValidKarlinBlk(Blast_KarlinBlk** kbp_in, const BlastQueryInfo* query_ * @param kbp_in array of Karlin blocks to be searched [in] * @param query_info information on number of queries (specifies number of * elements in above array) [in] + * @param kbp_out Karlin blocks with smallest lambda [out] * @return The smallest lambda value */ static double s_BlastFindSmallestLambda(Blast_KarlinBlk** kbp_in, - const BlastQueryInfo* query_info) + const BlastQueryInfo* query_info, + Blast_KarlinBlk** kbp_out) { Int4 i; - double min_lambda = 0.0; + double min_lambda = (double) INT4_MAX; ASSERT(kbp_in && query_info); for (i=query_info->first_context; i<=query_info->last_context; i++) { if (s_BlastKarlinBlkIsValid(kbp_in[i])) { - if (min_lambda == 0.0) + if (min_lambda > kbp_in[i]->Lambda) + { min_lambda = kbp_in[i]->Lambda; - else - min_lambda = MIN(min_lambda, kbp_in[i]->Lambda); + if (kbp_out) + *kbp_out = kbp_in[i]; + } } } @@ -218,7 +222,7 @@ BlastInitialWordParametersNew(EBlastProgramType program_number, (*parameters)->x_dropoff_init = (Int4) ceil(sbp->scale_factor * word_options->x_dropoff * NCBIMATH_LN2/ - s_BlastFindSmallestLambda(sbp->kbp_std, query_info)); + s_BlastFindSmallestLambda(sbp->kbp_std, query_info, NULL)); if (program_number == eBlastTypeBlastn && (query_info->contexts[query_info->last_context].query_offset + @@ -376,7 +380,7 @@ Int2 BlastExtensionParametersNew(EBlastProgramType program_number, /* Set gapped X-dropoffs only if it is a gapped search. 
*/ if (sbp->kbp_gap) { - double min_lambda = s_BlastFindSmallestLambda(sbp->kbp_gap, query_info); + double min_lambda = s_BlastFindSmallestLambda(sbp->kbp_gap, query_info, NULL); params->gap_x_dropoff = (Int4) (options->gap_x_dropoff*NCBIMATH_LN2 / min_lambda); /* Note that this conversion from bits to raw score is done prematurely @@ -732,9 +736,6 @@ BlastHitSavingParametersUpdate(EBlastProgramType program_number, return 0; } -/** machine epsilon assumed by CalculateLinkHSPCutoffs */ -#define MY_EPS 1.0e-9 - /* FIXME, move to blast_engine.c and make private? */ void CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info, @@ -742,34 +743,34 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info, const BlastInitialWordParameters* word_params, Int8 db_length, Int4 subject_length) { - double gap_prob, gap_decay_rate, x_variable, y_variable; Blast_KarlinBlk* kbp; + double gap_prob, gap_decay_rate, x_variable, y_variable; Int4 expected_length, window_size, query_length; Int8 search_sp; - Int4 concat_qlen; + const double kEpsilon = 1.0e-9; if (!link_hsp_params) return; - /* Do this for the first context, should this be changed?? 
*/ - kbp = sbp->kbp[query_info->first_context]; + /* Get KarlinBlk for context with smallest lambda (still greater than zero) */ + s_BlastFindSmallestLambda(sbp->kbp, query_info, &kbp); window_size = link_hsp_params->gap_size + link_hsp_params->overlap_size + 1; gap_prob = link_hsp_params->gap_prob = BLAST_GAP_PROB; gap_decay_rate = link_hsp_params->gap_decay_rate; /* Use average query length */ - concat_qlen = - query_info->contexts[query_info->last_context].query_offset + - query_info->contexts[query_info->last_context].query_length - 1; - - query_length = concat_qlen / (query_info->last_context + 1); + query_length = + (query_info->contexts[query_info->last_context].query_offset + + query_info->contexts[query_info->last_context].query_length - 1) + / (query_info->last_context + 1); if (Blast_SubjectIsTranslated(program) || program == eBlastTypeRpsTblastn) { /* Lengths in subsequent calculations should be on the protein scale */ subject_length /= CODON_LENGTH; db_length /= CODON_LENGTH; } + /* Subtract off the expected score. */ expected_length = BLAST_Nint(log(kbp->K*((double) query_length)* @@ -789,6 +790,7 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info, y_variable = log((double) (subject_length + expected_length)/ (double) subject_length)*(kbp->K)/(gap_decay_rate); } + search_sp = ((Int8) query_length)* ((Int8) subject_length); x_variable = 0.25*y_variable*((double) search_sp); @@ -798,11 +800,11 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info, are being checked for. 
*/ if (search_sp > 8*window_size*window_size) { - x_variable /= (1.0 - gap_prob + MY_EPS); + x_variable /= (1.0 - gap_prob + kEpsilon); link_hsp_params->cutoff_big_gap = (Int4) floor((log(x_variable)/kbp->Lambda)) + 1; x_variable = y_variable*(window_size*window_size); - x_variable /= (gap_prob + MY_EPS); + x_variable /= (gap_prob + kEpsilon); link_hsp_params->cutoff_small_gap = MAX(word_params->cutoff_score, (Int4) floor((log(x_variable)/kbp->Lambda)) + 1); @@ -824,6 +826,12 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info, * =========================================================================== * * $Log: blast_parameters.c,v $ + * Revision 1.12 2005/11/16 14:27:03 madden + * Fix spelling in CRN + * + * Revision 1.11 2005/11/04 13:26:20 madden + * Fixes to CalculateLinkHSPCutoffs so that invalid KarlinBlk is not used + * * Revision 1.10 2005/06/08 17:27:53 madden * Use functions from blast_program.c * diff --git a/algo/blast/core/blast_parameters.h b/algo/blast/core/blast_parameters.h index df09213f..3c683ce4 100644 --- a/algo/blast/core/blast_parameters.h +++ b/algo/blast/core/blast_parameters.h @@ -1,4 +1,4 @@ -/* $Id: blast_parameters.h,v 1.5 2005/02/08 14:45:55 madden Exp $ +/* $Id: blast_parameters.h,v 1.6 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
diff --git a/algo/blast/core/blast_program.c b/algo/blast/core/blast_program.c index c9edd3ad..009a5fb2 100644 --- a/algo/blast/core/blast_program.c +++ b/algo/blast/core/blast_program.c @@ -1,6 +1,6 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_program.c,v 1.2 2005/06/08 19:30:51 camacho Exp $"; + "$Id: blast_program.c,v 1.3 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* =========================================================================== * @@ -9,7 +9,7 @@ static char const rcsid[] = * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_program.h b/algo/blast/core/blast_program.h index cd747bcc..6223cfca 100644 --- a/algo/blast/core/blast_program.h +++ b/algo/blast/core/blast_program.h @@ -1,7 +1,7 @@ #ifndef ALGO_BLAST_CORE___BLAST_PROGRAM__H #define ALGO_BLAST_CORE___BLAST_PROGRAM__H -/* $Id: blast_program.h,v 1.1 2005/06/08 17:25:37 madden Exp $ +/* $Id: blast_program.h,v 1.2 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -9,7 +9,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. 
The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_psi.c b/algo/blast/core/blast_psi.c index d89fae97..88b18416 100644 --- a/algo/blast/core/blast_psi.c +++ b/algo/blast/core/blast_psi.c @@ -1,6 +1,6 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_psi.c,v 1.29 2005/05/23 15:32:56 camacho Exp $"; + "$Id: blast_psi.c,v 1.30 2005/11/16 14:27:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* =========================================================================== * @@ -9,7 +9,7 @@ static char const rcsid[] = * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_psi.h b/algo/blast/core/blast_psi.h index 40e59b32..e5227c51 100644 --- a/algo/blast/core/blast_psi.h +++ b/algo/blast/core/blast_psi.h @@ -1,7 +1,7 @@ #ifndef ALGO_BLAST_CORE___BLAST_PSI__H #define ALGO_BLAST_CORE___BLAST_PSI__H -/* $Id: blast_psi.h,v 1.15 2005/05/20 18:18:31 camacho Exp $ +/* $Id: blast_psi.h,v 1.16 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -9,7 +9,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. 
This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_psi_priv.c b/algo/blast/core/blast_psi_priv.c index 92dfb1a7..1062ac8a 100644 --- a/algo/blast/core/blast_psi_priv.c +++ b/algo/blast/core/blast_psi_priv.c @@ -1,6 +1,6 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_psi_priv.c,v 1.53 2005/04/21 20:26:57 camacho Exp $"; + "$Id: blast_psi_priv.c,v 1.57 2005/11/18 20:09:45 camacho Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* =========================================================================== * @@ -454,7 +454,7 @@ _PSISequenceWeightsFree(_PSISequenceWeights* seq_weights) } #ifdef _DEBUG -static char getRes(char input) +char GetResidue(char input) { switch (input) { case 0: return ('-'); @@ -502,7 +502,7 @@ __printMsa(const char* filename, const _PSIMsa* msa) /*fprintf(fp, "%3d\t", i);*/ for (j = 0; j < msa->dimensions->query_length; j++) { if (msa->cell[i][j].is_aligned) { - fprintf(fp, "%c", getRes(msa->cell[i][j].letter)); + fprintf(fp, "%c", GetResidue(msa->cell[i][j].letter)); } else { fprintf(fp, "."); } @@ -1637,6 +1637,15 @@ _PSISpreadGapWeights(const _PSIMsa* msa, } } +/** The following define enables/disables the _PSICheckSequenceWeights + * function's abort statement in the case when the sequence weights are not + * being checked. When this is enabled, abort() will be invoked if none of the + * sequence weights are checked to be in the proper range. The C toolkit code + * silently ignores this situation, so it's implemented that way here for + * backwards compatibility. + */ +#define SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 0 + /* Verifies that each column of the match_weights field in the seq_weights * structure adds up to 1. 
*/ static int @@ -1646,9 +1655,12 @@ _PSICheckSequenceWeights(const _PSIMsa* msa, { const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X']; Uint4 pos = 0; /* residue position (ie: column number) */ - Boolean check_performed = FALSE; /* were there any sequences checked? */ const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1; +#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE + Boolean check_performed = FALSE; /* were there any sequences checked? */ +#endif + ASSERT(msa); ASSERT(seq_weights); @@ -1659,6 +1671,9 @@ _PSICheckSequenceWeights(const _PSIMsa* msa, if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs || msa->cell[kQueryIndex][pos].letter == kXResidue) { + /* N.B.: the following statement allows for the sequence weights to + * go unchecked. To allow more strict checking, enable the + * SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE #define above */ continue; } @@ -1669,15 +1684,19 @@ _PSICheckSequenceWeights(const _PSIMsa* msa, if (running_total < 0.99 || running_total > 1.01) { return PSIERR_BADSEQWEIGHTS; } +#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE check_performed = TRUE; +#endif } +#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE /* This condition should never happen because it means that no sequences * were selected to calculate the sequence weights! */ if ( !check_performed && !nsg_compatibility_mode ) { /* old code didn't check for this... 
*/ assert(!"Did not perform sequence weights check"); } +#endif return PSI_SUCCESS; } @@ -1923,7 +1942,7 @@ _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData* internal_pssm, internal_pssm->pssm[i][j] = sbp->matrix->data[kResidue][j]; - if (sbp->matrix->data[kResidue][j] != BLAST_SCORE_MIN) { + if (freq_ratios->data[kResidue][j] != 0.0) { double tmp = kPSIScaleFactor * freq_ratios->bit_scale_factor * log(freq_ratios->data[kResidue][j])/NCBIMATH_LN2; @@ -1959,8 +1978,8 @@ _PSIScaleMatrix(const Uint1* query, int** scaled_pssm = NULL; int** pssm = NULL; double factor; - double factor_low = 0.0; - double factor_high = 0.0; + double factor_low = 1.0; + double factor_high = 1.0; double ideal_lambda = 0.0; /* ideal value of ungapped lambda for underlying scoring matrix */ double new_lambda = 0.0; /* Karlin-Altschul parameter calculated @@ -2004,6 +2023,7 @@ _PSIScaleMatrix(const Uint1* query, if (first_time) { factor_high = 1.0 + kPositScalingPercent; factor = factor_high; + factor_low = 1.0; too_high = TRUE; first_time = FALSE; } else { @@ -2188,7 +2208,7 @@ _PSIComputeScoreProbabilities(const int** pssm, /* [in] */ } ASSERT(score_freqs->score_avg == 0.0); - for (s = min_score; s < max_score; s++) { + for (s = min_score; s <= max_score; s++) { score_freqs->score_avg += (s * score_freqs->sprob[s]); } @@ -2358,6 +2378,19 @@ _PSISaveDiagnostics(const _PSIMsa* msa, /* * =========================================================================== * $Log: blast_psi_priv.c,v $ + * Revision 1.57 2005/11/18 20:09:45 camacho + * Fixes for backwards compatibility with C toolkit PSSM engine for certain corner + * cases. 
+ * + * Revision 1.56 2005/10/17 18:34:54 camacho + * Remove abort() call when sequence weights are not checked + * + * Revision 1.55 2005/10/05 14:09:30 camacho + * Port change in revision 6.76 of posit.c + * + * Revision 1.54 2005/10/03 20:42:41 camacho + * Minor + * * Revision 1.53 2005/04/21 20:26:57 camacho * Relax validation in s_PSIValidateAlignedColumns so that query sequence can be * the only aligned sequence for a given column of the multiple sequence diff --git a/algo/blast/core/blast_rps.h b/algo/blast/core/blast_rps.h index 39e8d5e9..9731679b 100644 --- a/algo/blast/core/blast_rps.h +++ b/algo/blast/core/blast_rps.h @@ -1,4 +1,4 @@ -/* $Id: blast_rps.h,v 1.8 2004/11/04 15:52:14 papadopo Exp $ +/* $Id: blast_rps.h,v 1.9 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_seg.h b/algo/blast/core/blast_seg.h index 152d50e2..7f29a78f 100644 --- a/algo/blast/core/blast_seg.h +++ b/algo/blast/core/blast_seg.h @@ -1,4 +1,4 @@ -/* $Id: blast_seg.h,v 1.15 2004/11/29 19:54:00 dondosha Exp $ +/* $Id: blast_seg.h,v 1.16 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/blast_setup.c b/algo/blast/core/blast_setup.c index 969385f9..ac4da0e2 100644 --- a/algo/blast/core/blast_setup.c +++ b/algo/blast/core/blast_setup.c @@ -1,4 +1,4 @@ -/* $Id: blast_setup.c,v 1.123 2005/08/15 16:11:43 dondosha Exp $ +/* $Id: blast_setup.c,v 1.127 2005/10/03 12:57:03 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_setup.c,v 1.123 2005/08/15 16:11:43 dondosha Exp $"; + "$Id: blast_setup.c,v 1.127 2005/10/03 12:57:03 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_setup.h> @@ -69,7 +69,7 @@ Blast_ScoreBlkKbpGappedCalc(BlastScoreBlk * sbp, Blast_KarlinBlkNuclGappedCalc(sbp->kbp_gap_std[index], scoring_options->gap_open, scoring_options->gap_extend, scoring_options->reward, scoring_options->penalty, - sbp->kbp_std[index], error_return); + sbp->kbp_std[index], &(sbp->round_down), error_return); } else { retval = Blast_KarlinBlkGappedCalc(sbp->kbp_gap_std[index], @@ -385,57 +385,62 @@ BlastSetup_ScoreBlkInit(BLAST_SequenceBlk* query_blk, Int2 BLAST_MainSetUp(EBlastProgramType program_number, const QuerySetUpOptions *qsup_options, const BlastScoringOptions *scoring_options, - const BlastHitSavingOptions *hit_options, BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, double scale_factor, BlastSeqLoc **lookup_segments, - BlastMaskInformation* maskInfo, + BlastMaskLoc **mask, BlastScoreBlk **sbpp, Blast_Message **blast_message) { Boolean 
mask_at_hash = FALSE; /* mask only for making lookup table? */ Int2 status = 0; /* return value */ BlastMaskLoc *filter_maskloc = NULL; /* Local variable for mask locs. */ - SBlastFilterOptions* filter_options = NULL; + SBlastFilterOptions* filter_options = qsup_options->filtering_options; + Boolean filter_options_allocated = FALSE; - if (maskInfo) - { - maskInfo->filter_slp = NULL; - maskInfo->mask_at_hash = FALSE; - } - if (qsup_options->filtering_options == NULL && qsup_options->filter_string) + if (mask) + *mask = NULL; + + if (filter_options == NULL && qsup_options->filter_string) { - status = BlastFilteringOptionsFromString(program_number, qsup_options->filter_string, &filter_options, blast_message); - if (status) + status = BlastFilteringOptionsFromString(program_number, + qsup_options->filter_string, + &filter_options, + blast_message); + if (status) { + filter_options = SBlastFilterOptionsFree(filter_options); return status; + } + filter_options_allocated = TRUE; } + ASSERT(filter_options); status = BlastSetUp_GetFilteringLocations(query_blk, query_info, program_number, - filter_options ? filter_options : qsup_options->filtering_options, + filter_options, & filter_maskloc, blast_message); - if (status) { + if (filter_options_allocated) + filter_options = SBlastFilterOptionsFree(filter_options); return status; } - mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options ? 
filter_options : qsup_options->filtering_options); + mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options); - filter_options = SBlastFilterOptionsFree(filter_options); + if (filter_options_allocated) { + filter_options = SBlastFilterOptionsFree(filter_options); + } - if (!mask_at_hash) - { - status = BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc, - program_number); - if (status != 0) { - return status; - } + + if (!mask_at_hash) { + BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc, + program_number); } if (program_number == eBlastTypeBlastx && scoring_options->is_ooframe) { @@ -451,17 +456,14 @@ Int2 BLAST_MainSetUp(EBlastProgramType program_number, filter_maskloc, lookup_segments); } - if (maskInfo) + if (mask) { - if (program_number == eBlastTypeBlastx || - program_number == eBlastTypeTblastx || - program_number == eBlastTypeRpsTblastn) { + if (Blast_QueryIsTranslated(program_number)) { /* Filter locations so far are in protein coordinates; convert them back to nucleotide here. */ BlastMaskLocProteinToDNA(filter_maskloc, query_info); } - maskInfo->filter_slp = filter_maskloc; - maskInfo->mask_at_hash = mask_at_hash; + *mask = filter_maskloc; filter_maskloc = NULL; } else @@ -470,11 +472,8 @@ Int2 BLAST_MainSetUp(EBlastProgramType program_number, status = BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_options, program_number, sbpp, scale_factor, blast_message); - if (status > 0) { - return status; - } - return 0; + return status; } @@ -688,8 +687,7 @@ BlastSeqLoc_RestrictToInterval(BlastSeqLoc* *mask, Int4 from, Int4 to) /* Shift the pointer to the next link in chain and free this link. */ if (last_loc) last_loc->next = seqloc->next; - sfree(seqloc->ssr); - sfree(seqloc); + seqloc = BlastSeqLocNodeFree(seqloc); } else if (!head_loc) { /* First time a mask was found within the range. 
*/ head_loc = last_loc = seqloc; diff --git a/algo/blast/core/blast_setup.h b/algo/blast/core/blast_setup.h index 622e3dc3..3614d9b4 100644 --- a/algo/blast/core/blast_setup.h +++ b/algo/blast/core/blast_setup.h @@ -1,4 +1,4 @@ -/* $Id: blast_setup.h,v 1.54 2005/08/15 16:10:21 dondosha Exp $ +/* $Id: blast_setup.h,v 1.55 2005/08/29 14:32:36 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -51,13 +51,12 @@ extern "C" { * @param program_number Type of BLAST program (0=blastn, ...). [in] * @param qsup_options options for query setup. [in] * @param scoring_options options for scoring. [in] - * @param hit_options options for saving hits. [in] * @param query_blk BLAST_SequenceBlk* for the query. [in] * @param query_info The query information block [in] * @param scale_factor Multiplier for cutoff and dropoff scores [in] * @param lookup_segments Start/stop locations for non-masked query * segments [out] - * @param maskInfo masking location information, mask_at_hash value. [out] + * @param mask masking locations. [out] * @param sbpp Contains scoring information. 
[out] * @param blast_message error or warning [out] */ @@ -65,12 +64,11 @@ NCBI_XBLAST_EXPORT Int2 BLAST_MainSetUp(EBlastProgramType program_number, const QuerySetUpOptions* qsup_options, const BlastScoringOptions* scoring_options, - const BlastHitSavingOptions* hit_options, BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, double scale_factor, BlastSeqLoc* *lookup_segments, - BlastMaskInformation* maskInfo, + BlastMaskLoc* *mask, BlastScoreBlk* *sbpp, Blast_Message* *blast_message); @@ -242,6 +240,11 @@ PHIPatternSpaceCalc(BlastQueryInfo* query_info, /* * * $Log: blast_setup.h,v $ +* Revision 1.55 2005/08/29 14:32:36 dondosha +* From Ilya Dondoshansky: +* Removed BlastMaskInformation wrapper structure, because mask_at_hash can +* be retrieved from options +* * Revision 1.54 2005/08/15 16:10:21 dondosha * Added error return argument to Blast_ScoreBlkKbpGappedCalc * diff --git a/algo/blast/core/blast_stat.c b/algo/blast/core/blast_stat.c index e5c57887..a82f2201 100644 --- a/algo/blast/core/blast_stat.c +++ b/algo/blast/core/blast_stat.c @@ -1,4 +1,4 @@ -/* $Id: blast_stat.c,v 1.123 2005/08/19 17:56:18 dondosha Exp $ +/* $Id: blast_stat.c,v 1.136 2005/11/14 15:55:42 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -50,7 +50,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_stat.c,v 1.123 2005/08/19 17:56:18 dondosha Exp $"; + "$Id: blast_stat.c,v 1.136 2005/11/14 15:55:42 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_stat.h> @@ -295,6 +295,7 @@ static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = { BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, + BLAST_MATRIX_NOMINAL, BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL }; /**< Quality values for BLOSUM80 matrix, each element corresponds to same element number in array blosum80_values */ @@ -351,8 +352,8 @@ BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, 
BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, -BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL, +BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, @@ -562,6 +563,12 @@ BLAST_MATRIX_NOMINAL * 8. Theta */ +/** Karlin-Altschul parameter values for substitution scores 1 and -5. */ +static const array_of_8 blastn_values_1_5[] = { + { 0, 0, 1.39, 0.747, 1.38, 1.00, 0, 100 }, + { 3, 3, 1.39, 0.747, 1.38, 1.00, 0, 100 } +}; + /** Karlin-Altschul parameter values for substitution scores 1 and -4. */ static const array_of_8 blastn_values_1_4[] = { { 0, 0, 1.383, 0.738, 1.36, 1.02, 0, 100 }, @@ -632,6 +639,16 @@ static const array_of_8 blastn_values_2_3[] = { { 2, 2, 0.515, 0.14, 0.33, 1.55, -9, 81 } }; +/** Karlin-Altschul parameter values for substitution scores 3 and -4. */ +static const array_of_8 blastn_values_3_4[] = { + { 6, 3, 0.389, 0.25, 0.56, 0.7, -5, 95}, + { 5, 3, 0.375, 0.21, 0.47, 0.8, -6, 92}, + { 4, 3, 0.351, 0.14, 0.35, 1.0, -9, 86}, + { 6, 2, 0.362, 0.16, 0.45, 0.8, -4, 88}, + { 5, 2, 0.330, 0.092, 0.28, 1.2, -13, 81}, + { 4, 2, 0.281, 0.046, 0.16, 1.8, -23, 69} +}; + /** Karlin-Altschul parameter values for substitution scores 4 and -5. */ static const array_of_8 blastn_values_4_5[] = { { 0, 0, 0.22, 0.061, 0.22, 1.0, -15, 74 }, @@ -652,6 +669,11 @@ static const array_of_8 blastn_values_1_1[] = { { 2, 1, 0.99, 0.17, 0.30, 3.3, -10, 90 } }; +/** Karlin-Altschul parameter values for substitution scores 3 and -2. */ +static const array_of_8 blastn_values_3_2[] = { + { 5, 5, 0.208, 0.030, 0.072, 2.9, -47, 77} +}; + /** Karlin-Altschul parameter values for substitution scores 5 and -4. 
*/ static const array_of_8 blastn_values_5_4[] = { { 10, 6, 0.163, 0.068, 0.16, 1.0, -19, 85 }, @@ -2697,6 +2719,306 @@ void BLAST_GetAlphaBeta(const char* matrixName, double *alpha, sfree(beta_arr); } +static Int2 +s_SplitArrayOf8(const array_of_8* input, const array_of_8** normal, const array_of_8** non_affine, Boolean *split) +{ + + if (input == NULL || normal == NULL || non_affine == NULL) + return -1; + + *normal = NULL; + *non_affine = NULL; + + if (input[0][0] == 0 && input[0][1] == 0) + { + *normal = input+1; + *non_affine = input; + *split = TRUE; + } + else + { + *normal = input; + *split = FALSE; + } + return 0; + +} + +/** Adjust Lambda and H if reward and penalty have a non-1 gcd. + * the two arrays (normal and linear) should be filled in with values already. + * @param normal the values for normal (e.g, "affine") gap costs [in|out] + * @param linear specialized values used for megablast [in|out] + * @param size Number of supported combinations for this match/mismatch pair [out] + * @param gap_existence_max start of infinite regime for gap existence [in|out] + * @param gap_extend_max start of infinite regime for gap extension [in|out] + * @param divisor divisor for gap costs [out] +*/ +static Int2 +s_AdjustGapParametersByGcd(array_of_8* normal, array_of_8* linear, int size, Int4* gap_existence_max, Int4* gap_extend_max, int divisor) +{ + if (divisor == 1) + return 0; + + if (size <=0) + return 1; + + (*gap_existence_max) *= divisor; + (*gap_extend_max) *= divisor; + + if (normal) + { + int i; + + for (i=0; i<size; i++) + { /* divide lambda and alpha by divisor. */ + /* multiply gap existence and extension by divisor. */ + normal[i][0] *= divisor; + normal[i][1] *= divisor; + normal[i][2] /= divisor; + normal[i][5] /= divisor; + } + } + if (linear) + { /* divide lambda and alpha by divisor. 
*/ + linear[0][0] *= divisor; + linear[0][1] *= divisor; + linear[0][2] /= divisor; + linear[0][5] /= divisor; + } + + return 0; +} + +/** Returns the array of values corresponding to the given match/mismatch + * scores, the number of supported gap cost combinations and thresholds for + * the gap costs, beyond which the ungapped statistics can be applied. + * @param reward Match reward score [in] + * @param penalty Mismatch penalty score [in] + * @param array_size Number of supported combinations for this match/mismatch + * pair [out] + * @param normal the values for normal (e.g, "affine") gap costs [out] + * @param non_affine specialized values used for megablast [out] + * @param gap_open_max Gap opening cost threshold for infinite gap costs [out] + * @param gap_extend_max Gap extension cost threshold for infinite gap costs [out] + * @param round_down if set to TRUE only even scores should be used for calculation + * of expect value or bit scores [out] + * @param error_return Pointer to error message [out] + * @return zero on success, other values if error + */ +static Int2 +s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size, + array_of_8** normal, array_of_8** non_affine, + Int4* gap_open_max, Int4* gap_extend_max, Boolean* round_down, + Blast_Message** error_return) +{ + Int2 status = 0; + const array_of_8 * kValues = NULL; + const array_of_8 * kValues_non_affine = NULL; + Boolean split = FALSE; + int divisor = BLAST_Gcd(reward, penalty); + + *round_down = FALSE; + + *array_size = 0; + *normal = NULL; + *non_affine = NULL; + + if (divisor != 1) + { + reward /= divisor; + penalty /= divisor; + } + + if (reward == 1 && penalty == -5) { + if ((status=s_SplitArrayOf8(blastn_values_1_5, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_1_5)/sizeof(array_of_8); + *gap_open_max = 3; + *gap_extend_max = 3; + } else if (reward == 1 && penalty == -4) { + if ((status=s_SplitArrayOf8(blastn_values_1_4, &kValues, 
&kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8); + *gap_open_max = 2; + *gap_extend_max = 2; + } else if (reward == 2 && penalty == -7) { + if ((status=s_SplitArrayOf8(blastn_values_2_7, &kValues, &kValues_non_affine, &split))) + return status; + + *round_down = TRUE; + *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8); + *gap_open_max = 4; + *gap_extend_max = 4; + } else if (reward == 1 && penalty == -3) { + if ((status=s_SplitArrayOf8(blastn_values_1_3, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8); + *gap_open_max = 2; + *gap_extend_max = 2; + } else if (reward == 2 && penalty == -5) { + if ((status=s_SplitArrayOf8(blastn_values_2_5, &kValues, &kValues_non_affine, &split))) + return status; + + *round_down = TRUE; + *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8); + *gap_open_max = 4; + *gap_extend_max = 4; + } else if (reward == 1 && penalty == -2) { + if ((status=s_SplitArrayOf8(blastn_values_1_2, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8); + *gap_open_max = 2; + *gap_extend_max = 2; + } else if (reward == 2 && penalty == -3) { + if ((status=s_SplitArrayOf8(blastn_values_2_3, &kValues, &kValues_non_affine, &split))) + return status; + + *round_down = TRUE; + *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8); + *gap_open_max = 6; + *gap_extend_max = 4; + } else if (reward == 3 && penalty == -4) { + if ((status=s_SplitArrayOf8(blastn_values_3_4, &kValues, &kValues_non_affine, &split))) + return status; + + *round_down = TRUE; + *array_size = sizeof(blastn_values_3_4)/sizeof(array_of_8); + *gap_open_max = 6; + *gap_extend_max = 3; + } else if (reward == 1 && penalty == -1) { + if ((status=s_SplitArrayOf8(blastn_values_1_1, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = 
sizeof(blastn_values_1_1)/sizeof(array_of_8); + *gap_open_max = 4; + *gap_extend_max = 2; + } else if (reward == 3 && penalty == -2) { + if ((status=s_SplitArrayOf8(blastn_values_3_2, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_3_2)/sizeof(array_of_8); + *gap_open_max = 5; + *gap_extend_max = 5; + } else if (reward == 4 && penalty == -5) { + if ((status=s_SplitArrayOf8(blastn_values_4_5, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8); + *gap_open_max = 12; + *gap_extend_max = 8; + } else if (reward == 5 && penalty == -4) { + if ((status=s_SplitArrayOf8(blastn_values_5_4, &kValues, &kValues_non_affine, &split))) + return status; + + *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8); + *gap_open_max = 25; + *gap_extend_max = 10; + } else { /* Unsupported reward-penalty */ + status = -1; + if (error_return) { + char buffer[256]; + sprintf(buffer, "Substitution scores %d and %d are not supported", + reward, penalty); + Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer); + } + } + if (split) + (*array_size)--; + + if (status == 0) + { + if (*array_size > 0) + *normal = BlastMemDup(kValues, (*array_size)*sizeof(array_of_8)); + if (kValues_non_affine) + *non_affine = BlastMemDup(kValues_non_affine, sizeof(array_of_8)); + + status = s_AdjustGapParametersByGcd(*normal, *non_affine, *array_size, gap_open_max, gap_extend_max, divisor); + } + + return status; +} + +Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName, + Int4* gap_existence, + Int4* gap_extension) +{ + Int4* gapOpen_arr,* gapExtend_arr,* pref_flags; + Int4 i; /*loop index*/ + Int2 num_values = Blast_GetMatrixValues(matrixName, &gapOpen_arr, + &gapExtend_arr, NULL, NULL, NULL, NULL, NULL, NULL, &pref_flags); + + if (num_values <= 0) + return -1; + + for(i = 1; i < num_values; i++) { + if(pref_flags[i]==BLAST_MATRIX_BEST) { + (*gap_existence) = 
gapOpen_arr[i]; + (*gap_extension) = gapExtend_arr[i]; + break; + } + } + + sfree(gapOpen_arr); + sfree(gapExtend_arr); + sfree(pref_flags); + + return 0; +} + + +Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, + Int4 penalty, + Int4* gap_existence, + Int4* gap_extension) +{ + int array_size = 0; /* dummy parameter. */ + array_of_8* normal=NULL; /* dummy parameter */ + array_of_8* non_affine=NULL; /* dummy parameter */ + Boolean round_down = FALSE; + int gap_existence_max=0; + int gap_extension_max=0; + Int2 status = s_GetNuclValuesArray(reward, penalty, &array_size, &normal, &non_affine, + &gap_existence_max, &gap_extension_max, &round_down, NULL); + + if (status) + { + sfree(normal); + sfree(non_affine); + return status; + } + + if (*gap_existence == 0 && *gap_extension == 0 && non_affine) + status = 0; /* these values are supported. */ + else + { + int index=0; + Boolean found=FALSE; + while (index < array_size) + { + if (*gap_existence == normal[index][0] && *gap_extension == normal[index][1]) + { + found = TRUE; + break; /* these values are supported. */ + } + index++; + } + + if (!found) + { + *gap_existence = gap_existence_max; + *gap_extension = gap_extension_max; + } + status = 0; + } + sfree(normal); + sfree(non_affine); + return status; +} + /** Fills in error_return with strings describing the allowed values. * @param matrix_name name of the matrix [in] * @param error_return object to be filled in [in|out] @@ -2956,85 +3278,11 @@ BLAST_PrintAllowedValues(const char *matrix_name, Int4 gap_open, Int4 gap_extend return buffer; } -/** Returns the array of values corresponding to the given match/mismatch - * scores, the number of supported gap cost combinations and thresholds for - * the gap costs, beyond which the ungapped statistics can be applied. 
- * @param reward Match reward score [in] - * @param penalty Mismatch penalty score [in] - * @param array_size Number of supported combinations for this match/mismatch - * pair [out] - * @param gap_open_max Gap opening cost threshold for infinite gap costs [in] - * @param gap_extend_max Gap extension cost threshold for infinite gap costs [in] - * @param error_return Pointer to error message [in] [out] - * @return Corresponding array of values. - */ -static const array_of_8 * -s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size, - Int4* gap_open_max, Int4* gap_extend_max, - Blast_Message** error_return) -{ - const array_of_8 * kValues = NULL; - - if (reward == 1 && penalty == -4) { - kValues = blastn_values_1_4; - *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8); - *gap_open_max = 2; - *gap_extend_max = 2; - } else if (reward == 2 && penalty == -7) { - kValues = blastn_values_2_7; - *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8); - *gap_open_max = 4; - *gap_extend_max = 4; - } else if (reward == 1 && penalty == -3) { - kValues = blastn_values_1_3; - *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8); - *gap_open_max = 2; - *gap_extend_max = 2; - } else if (reward == 2 && penalty == -5) { - kValues = blastn_values_2_5; - *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8); - *gap_open_max = 4; - *gap_extend_max = 4; - } else if (reward == 1 && penalty == -2) { - kValues = blastn_values_1_2; - *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8); - *gap_open_max = 2; - *gap_extend_max = 2; - } else if (reward == 2 && penalty == -3) { - kValues = blastn_values_2_3; - *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8); - *gap_open_max = 6; - *gap_extend_max = 4; - } else if (reward == 1 && penalty == -1) { - kValues = blastn_values_1_1; - *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8); - *gap_open_max = 4; - *gap_extend_max = 2; - } else if (reward == 4 && penalty == -5) { - kValues = 
blastn_values_4_5; - *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8); - *gap_open_max = 12; - *gap_extend_max = 8; - } else if (reward == 5 && penalty == -4) { - kValues = blastn_values_5_4; - *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8); - *gap_open_max = 25; - *gap_extend_max = 10; - } else if (error_return) { - char buffer[256]; - /* Unsupported reward-penalty */ - sprintf(buffer, "Substitution scores %d and %d are not supported", - reward, penalty); - Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer); - } - - return kValues; -} - Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk* kbp_ungap, + Boolean* round_down, Blast_Message** error_return) { const int kGapOpenIndex = 0; @@ -3043,53 +3291,88 @@ Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, const int kKIndex = 3; const int kHIndex = 4; int num_combinations = 0; - int index; int gap_open_max, gap_extend_max; - - const array_of_8 *kValues = - s_GetNuclValuesArray(reward, penalty, &num_combinations, - &gap_open_max, &gap_extend_max, error_return); + array_of_8* normal=NULL; + array_of_8* linear=NULL; + Int2 status = s_GetNuclValuesArray(reward, + penalty, + &num_combinations, + &normal, + &linear, + &gap_open_max, + &gap_extend_max, + round_down, + error_return); + + if (status) + { + sfree(normal); + sfree(linear); + return status; + } ASSERT(kbp && kbp_ungap); -#ifndef NEW_BLASTN_STAT - Blast_KarlinBlkCopy(kbp, kbp_ungap); - return 0; -#endif - - if (!kValues) - return 1; /* Try to find the table entry corresponding to input gap costs. 
*/ - for (index = 0; index < num_combinations; ++index) { - if (kValues[index][kGapOpenIndex] == gap_open && - kValues[index][kGapExtIndex] == gap_extend) { - kbp->Lambda = kValues[index][kLambdaIndex]; - kbp->K = kValues[index][kKIndex]; - kbp->logK = log(kbp->K); - kbp->H = kValues[index][kHIndex]; - break; - } + if (gap_open == 0 && gap_extend == 0 && linear) + { + kbp->Lambda = linear[0][kLambdaIndex]; + kbp->K = linear[0][kKIndex]; + kbp->logK = log(kbp->K); + kbp->H = linear[0][kHIndex]; } + else + { + int index=0; + for (index = 0; index < num_combinations; ++index) { + if (normal[index][kGapOpenIndex] == gap_open && + normal[index][kGapExtIndex] == gap_extend) { + kbp->Lambda = normal[index][kLambdaIndex]; + kbp->K = normal[index][kKIndex]; + kbp->logK = log(kbp->K); + kbp->H = normal[index][kHIndex]; + break; + } + } - /* If gap costs are not found in the table, check if they belong to the - infinite domain, where ungapped values of the parameters can be used. */ - if (index == num_combinations) { + /* If gap costs are not found in the table, check if they belong to the + infinite domain, where ungapped values of the parameters can be used. */ + if (index == num_combinations) { /* If gap costs are larger than maximal provided in tables, copy the values from the ungapped Karlin block. */ - if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) { - Blast_KarlinBlkCopy(kbp, kbp_ungap); - } else if (error_return) { - char buffer[256]; - /* Unsupported gap costs combination. */ - sprintf(buffer, "Gap existence and extension values %d and %d " - "are not supported for substitution scores %d and %d", - gap_open, gap_extend, reward, penalty); - Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer); - return 1; + if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) { + Blast_KarlinBlkCopy(kbp, kbp_ungap); + } else if (error_return) { + char buffer[8192]; + int i=0; + int len=0; + /* Unsupported gap costs combination. 
*/ + sprintf(buffer, "Gap existence and extension values %ld and %ld " + "are not supported for substitution scores %ld and %ld\n", + (long) gap_open, (long) gap_extend, (long) reward, (long) penalty); + for (i = 0; i < num_combinations; ++i) + { + len = strlen(buffer); + sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n", + (long) normal[i][kGapOpenIndex], (long) normal[i][kGapExtIndex]); + } + len = strlen(buffer); + sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n", + (long) gap_open_max, (long) gap_extend_max); + len = strlen(buffer); + sprintf(buffer+len, "Any values more stringent than %ld and %ld are supported\n", + (long) gap_open_max, (long) gap_extend_max); + Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer); + sfree(normal); + sfree(linear); + return 1; + } } } + sfree(normal); + sfree(linear); return 0; } @@ -3121,35 +3404,59 @@ Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, Int4 num_combinations = 0; Int4 gap_open_max = 0, gap_extend_max = 0; Int4 index = 0; - const array_of_8 *kValues = - s_GetNuclValuesArray(reward, penalty, &num_combinations, - &gap_open_max, &gap_extend_max, NULL);; + array_of_8* normal=NULL; + array_of_8* linear=NULL; + Boolean round_down = FALSE; + Boolean found = FALSE; + Int2 status = s_GetNuclValuesArray(reward, + penalty, + &num_combinations, + &normal, + &linear, + &gap_open_max, + &gap_extend_max, + &round_down, + NULL); - ASSERT(alpha && beta && kbp); + if (status) + return status; -#ifndef NEW_BLASTN_STAT - *alpha = kbp->Lambda/kbp->H; - *beta = 0; - return 0; -#endif + ASSERT(alpha && beta && kbp); /* For ungapped search return ungapped values of alpha and beta. 
*/ - if (gapped_calculation && kValues) { - for (index = 0; index < num_combinations; ++index) { - if (kValues[index][kGapOpenIndex] == gap_open && - kValues[index][kGapExtIndex] == gap_extend) { - *alpha = kValues[index][kAlphaIndex]; - *beta = kValues[index][kBetaIndex]; - return 0; + if (gapped_calculation && normal) { + if (gap_open == 0 && gap_extend == 0 && linear) + { + *alpha = linear[0][kAlphaIndex]; + *beta = linear[0][kBetaIndex]; + found = TRUE; + } + else + { + + for (index = 0; index < num_combinations; ++index) { + if (normal[index][kGapOpenIndex] == gap_open && + normal[index][kGapExtIndex] == gap_extend) { + *alpha = normal[index][kAlphaIndex]; + *beta = normal[index][kBetaIndex]; + found = TRUE; + break; + } } } + } /* If input values not found in tables, or if this is an ungapped search, return the ungapped values of alpha and beta. */ - *alpha = kbp->Lambda/kbp->H; - *beta = s_GetUngappedBeta(reward, penalty); + if (!found) + { + *alpha = kbp->Lambda/kbp->H; + *beta = s_GetUngappedBeta(reward, penalty); + } + sfree(linear); + sfree(normal); return 0; } @@ -3997,6 +4304,51 @@ BLAST_ComputeLengthAdjustment(double K, * =========================================================================== * * $Log: blast_stat.c,v $ + * Revision 1.136 2005/11/14 15:55:42 madden + * Correct comment + * + * Revision 1.135 2005/11/04 13:48:09 madden + * Doxygen fixes + * + * Revision 1.134 2005/11/01 18:49:01 madden + * Changes to s_GetNuclValuesArray and calling functions to support (for blastn) reward and penalty values that are multiples of already supported values + * + * Revision 1.133 2005/10/31 14:05:24 madden + * 1.) add support for blastn reward/penalty values of 1/-5, 3/-4, and 3/-2. + * 2.) BLAST_GetNucleotideGapExistenceExtendParams now validates value as well as suggesting a + * reasonable value. 
+ * + * Revision 1.132 2005/10/14 17:29:22 madden + * Add preliminary support for vecscreen parameters + * + * Revision 1.131 2005/10/12 19:15:47 madden + * Fix bug in s_GetNuclValuesArray + * + * Revision 1.130 2005/09/27 14:43:56 madden + * Centralize round_down decision in s_GetNuclValuesArray + * + * Revision 1.129 2005/09/16 14:01:45 madden + * 1.) BLAST_GetGapExistenceExtendParams renamed to BLAST_GetProteinGapExistenceExtendParams + * 2.) Added BLAST_GetNucleotideGapExistenceExtendParams + * 3.) Added informative error message to s_GetNuclValuesArray + * + * Revision 1.128 2005/09/12 19:16:38 coulouri + * Enable precomputed statistical parameters for blastn + * + * Revision 1.127 2005/09/08 14:48:11 ucko + * Tweak Blast_KarlinBlkNuclGappedCalc and Blast_GetNuclAlphaBeta to + * declare kValues unconditionally, to fix compilation errors when + * NEW_BLASTN_STAT is undefined. + * + * Revision 1.126 2005/09/08 13:40:34 coulouri + * Call s_GetNuclValuesArray iff NEW_BLASTN_STAT + * + * Revision 1.125 2005/08/30 15:42:58 madden + * BLAST_GetGapExistenceExtendParams now takes program_number as an argument so it can properly identify blastn queries + * + * Revision 1.124 2005/08/29 13:52:05 madden + * Add BLAST_GetGapExistenceExtendParams + * * Revision 1.123 2005/08/19 17:56:18 dondosha * Removed unnecessary redefinition of HUGE_VAL * diff --git a/algo/blast/core/blast_stat.h b/algo/blast/core/blast_stat.h index 2ab38c76..e8295c92 100644 --- a/algo/blast/core/blast_stat.h +++ b/algo/blast/core/blast_stat.h @@ -1,4 +1,4 @@ -/* $Id: blast_stat.h,v 1.70 2005/08/15 16:10:41 dondosha Exp $ +/* $Id: blast_stat.h,v 1.74 2005/09/27 14:43:16 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -160,6 +160,7 @@ protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */ Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/ Int2 ambig_size, /**< size of array above. 
FIXME: not needed here? */ ambig_occupy; /**< How many occupied? */ + Boolean round_down; /**< Score must be rounded down to nearest even score if odd. */ } BlastScoreBlk; /** @@ -264,12 +265,15 @@ Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open, * @param reward Match reward score [in] * @param penalty Mismatch penalty score [in] * @param kbp_ungap Karlin block with ungapped Karlin-Altschul parameters [in] + * @param round_down specifies that the score should be rounded down to nearest even + * score in some cases [in|out] * @param error_return Pointer to error message. [in] [out] */ Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk* kbp_ungap, + Boolean* round_down, Blast_Message** error_return); @@ -418,6 +422,30 @@ double BLAST_LargeGapSumE (Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor ); +/** Extract the recommended gap existence and extension values. + * Only to be used with protein matrices. + * @param matrixName name of the matrix [in] + * @param gap_existence returns recommended existence cost [in|out] + * @param gap_extension returns recommended extension cost [in|out] + * @return zero on success + */ +Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName, + Int4* gap_existence, + Int4* gap_extension); + +/** Extract the recommended gap existence and extension values. + * Only to be used with blastn searches. 
+ * @param reward match score [in] + * @param penalty mismatch score [in] + * @param gap_existence returns recommended existence cost [in|out] + * @param gap_extension returns recommended extension cost [in|out] + * @return zero on success + */ +Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, + Int4 penalty, + Int4* gap_existence, + Int4* gap_extension); + /** Extract the alpha and beta settings for this matrixName, and these * gap open and gap extension costs * @param matrixName name of the matrix used [in] diff --git a/algo/blast/core/blast_traceback.c b/algo/blast/core/blast_traceback.c index 6cdfa6f2..04aa836a 100644 --- a/algo/blast/core/blast_traceback.c +++ b/algo/blast/core/blast_traceback.c @@ -1,4 +1,4 @@ -/* $Id: blast_traceback.c,v 1.175 2005/08/15 16:11:20 dondosha Exp $ +/* $Id: blast_traceback.c,v 1.179 2005/12/01 14:47:48 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -39,7 +38,7 @@ * for ( all HSP lists ) * Blast_TracebackFromHSPList * else if ( composition based statistics ) - * Kappa_RedoAlignmentCore + * Blast_RedoAlignmentCore * else * for ( all HSP lists ) * if ( PHI BLAST) @@ -51,7 +50,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_traceback.c,v 1.175 2005/08/15 16:11:20 dondosha Exp $"; + "$Id: blast_traceback.c,v 1.179 2005/12/01 14:47:48 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_traceback.h> @@ -269,10 +268,7 @@ s_HSPListPostTracebackUpdate(EBlastProgramType program_number, /* For nucleotide search, if match score is = 2, the odd scores are rounded down to the nearest even number. */ - if (program_number == eBlastTypeBlastn && - score_params->options->reward == 2) { - Blast_HSPListAdjustOddBlastnScores(hsp_list); - } + Blast_HSPListAdjustOddBlastnScores(hsp_list, kGapped, sbp); Blast_HSPListGetEvalues(query_info, hsp_list, kGapped, sbp, 0, scale_factor); @@ -387,7 +383,7 @@ Blast_TracebackFromHSPList(EBlastProgramType program_number, hsp = hsp_array[index]; if (program_number == eBlastTypeBlastx && kIsOutOfFrame) { Int4 context = hsp->context - hsp->context % 3; - Int4 context_offset = query_info->contexts[hsp->context].query_offset; + Int4 context_offset = query_info->contexts[context].query_offset; query = query_blk->oof_sequence + CODON_LENGTH + context_offset; query_length = query_info->contexts[context+2].query_offset + @@ -1015,7 +1011,7 @@ BLAST_ComputeTraceback(EBlastProgramType program_number, (ext_params->options->compositionBasedStats == TRUE || ext_params->options->eTbackExt == eSmithWatermanTbck)) { status = - Kappa_RedoAlignmentCore(program_number, query, query_info, sbp, + Blast_RedoAlignmentCore(program_number, query, query_info, sbp, hsp_stream, seq_src, gen_code_string, score_params, 
ext_params, hit_params, psi_options, results); diff --git a/algo/blast/core/blast_traceback.h b/algo/blast/core/blast_traceback.h index fe353777..5b31affa 100644 --- a/algo/blast/core/blast_traceback.h +++ b/algo/blast/core/blast_traceback.h @@ -1,4 +1,4 @@ -/* $Id: blast_traceback.h,v 1.46 2005/05/10 16:07:35 camacho Exp $ +/* $Id: blast_traceback.h,v 1.47 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ diff --git a/algo/blast/core/blast_util.c b/algo/blast/core/blast_util.c index 655d1a66..f149f7db 100644 --- a/algo/blast/core/blast_util.c +++ b/algo/blast/core/blast_util.c @@ -1,4 +1,4 @@ -/* $Id: blast_util.c,v 1.100 2005/08/17 16:21:31 dondosha Exp $ +/* $Id: blast_util.c,v 1.105 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. 
The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: blast_util.c,v 1.100 2005/08/17 16:21:31 dondosha Exp $"; + "$Id: blast_util.c,v 1.105 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_util.h> @@ -185,6 +185,8 @@ Int2 BlastProgram2Number(const char *program, EBlastProgramType *number) *number = eBlastTypeRpsTblastn; else if (strcasecmp("psiblast", program) == 0) *number = eBlastTypePsiBlast; + else if (strcasecmp("psitblastn", program) == 0) + *number = eBlastTypePsiTblastn; else if (strcasecmp("phiblastn", program) == 0) *number = eBlastTypePhiBlastn; else if (strcasecmp("phiblastp", program) == 0) @@ -200,10 +202,10 @@ Int2 BlastNumber2Program(EBlastProgramType number, char* *program) return 1; switch (number) { - case eBlastTypeBlastn: case eBlastTypePhiBlastn: + case eBlastTypeBlastn: *program = strdup("blastn"); break; - case eBlastTypeBlastp: case eBlastTypePhiBlastp: + case eBlastTypeBlastp: *program = strdup("blastp"); break; case eBlastTypeBlastx: @@ -224,6 +226,15 @@ Int2 BlastNumber2Program(EBlastProgramType number, char* *program) case eBlastTypePsiBlast: *program = strdup("psiblast"); break; + case eBlastTypePsiTblastn: + *program = strdup("psitblastn"); + break; + case eBlastTypePhiBlastp: + *program = strdup("phiblastp"); + break; + case eBlastTypePhiBlastn: + *program = strdup("phiblastn"); + break; default: *program = strdup("unknown"); break; @@ -644,7 +655,7 @@ Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length, Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number) { - Int1 frame = 127; /* 127 is used to indicate error */ + Int1 frame = INT1_MAX; /* INT1_MAX is used to indicate error */ if (prog_number == eBlastTypeBlastn) { if (context_number % NUM_STRANDS == 0) @@ -653,8 +664,8 @@ Int1 
BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number) frame = -1; } else if (prog_number == eBlastTypeBlastp || prog_number == eBlastTypeRpsBlast || - prog_number == eBlastTypePsiBlast || prog_number == eBlastTypeTblastn || + Blast_ProgramIsPsiBlast(prog_number) || Blast_ProgramIsPhiBlast(prog_number)) { /* Query and subject are protein, no frame. */ frame = 0; @@ -730,6 +741,57 @@ BlastQueryInfo* BlastQueryInfoDup(BlastQueryInfo* query_info) return retval; } +/** Calculates length of the DNA query from the BlastQueryInfo structure that + * contains context information for translated frames for a set of queries. + * @param query_info Query information containing data for all contexts [in] + * @param query_index Which query to find DNA length for? + * @return DNA length of the query, calculated as sum of 3 protein frame lengths, + * plus 2, because 2 last nucleotide residues do not have a + * corresponding codon. + */ +static Int4 +s_GetTranslatedQueryDNALength(const BlastQueryInfo* query_info, Int4 query_index) +{ + Int4 start_context = NUM_FRAMES*query_index; + Int4 dna_length = 2; + Int4 index; + + /* Make sure that query index is within appropriate range, and that this is + really a translated search */ + ASSERT(query_index < query_info->num_queries); + ASSERT(start_context < query_info->last_context); + + /* If only reverse strand is searched, then forward strand contexts don't + have lengths information */ + if (query_info->contexts[start_context].query_length == 0) + start_context += 3; + + for (index = start_context; index < start_context + 3; ++index) + dna_length += query_info->contexts[index].query_length; + + return dna_length; +} + +Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo* qinfo, + EBlastProgramType program, + Int4 query_index) +{ + const Uint4 kNumContexts = BLAST_GetNumberOfContexts(program); + ASSERT(query_index < qinfo->num_queries); + + if (Blast_QueryIsTranslated(program)) { + return 
s_GetTranslatedQueryDNALength(qinfo, query_index); + } else if (program == eBlastTypeBlastn) { + Int4 retval = qinfo->contexts[query_index*kNumContexts].query_length; + if (retval <= 0) { + retval = qinfo->contexts[query_index*kNumContexts+1].query_length; + } + return retval; + } else { + return qinfo->contexts[query_index*kNumContexts].query_length; + } +} + Int2 BLAST_PackDNA(const Uint1* buffer, Int4 length, EBlastEncoding encoding, Uint1** packed_seq) { @@ -1320,3 +1382,32 @@ char* BLAST_StrToUpper(const char* string) return retval; } +unsigned int +BLAST_GetNumberOfContexts(EBlastProgramType p) +{ + unsigned int retval = 0; + + switch (p) { + case eBlastTypeBlastn: + case eBlastTypePhiBlastn: + retval = NUM_STRANDS; + break; + case eBlastTypeBlastp: + case eBlastTypeRpsBlast: + case eBlastTypeTblastn: + case eBlastTypePsiBlast: + case eBlastTypePsiTblastn: + case eBlastTypePhiBlastp: + retval = 1; + break; + case eBlastTypeBlastx: + case eBlastTypeTblastx: + case eBlastTypeRpsTblastn: + retval = NUM_FRAMES; + break; + default: + break; + } + + return retval; +} diff --git a/algo/blast/core/blast_util.h b/algo/blast/core/blast_util.h index 212b58e0..d35b34f2 100644 --- a/algo/blast/core/blast_util.h +++ b/algo/blast/core/blast_util.h @@ -1,4 +1,4 @@ -/* $Id: blast_util.h,v 1.68 2005/08/09 19:25:30 dondosha Exp $ +/* $Id: blast_util.h,v 1.72 2005/11/16 14:31:36 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. 
* Government have not placed any restriction on its use or reproduction. @@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -194,7 +193,7 @@ Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length, * @param prog_number Integer corresponding to the BLAST program * @param context_number Context number * @return Sequence frame: -1,1 for nucleotides, -3,-2,-1,1,2,3 for - * translations, 0 for proteins and 127 in case of unsupported program + * translations, 0 for proteins and INT1_MAX in case of unsupported program */ NCBI_XBLAST_EXPORT Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number); @@ -215,6 +214,18 @@ BlastQueryInfo* BlastQueryInfoFree(BlastQueryInfo* query_info); NCBI_XBLAST_EXPORT BlastQueryInfo* BlastQueryInfoDup(BlastQueryInfo* query_info); +/** Obtains the sequence length for a given query in the query, without taking + * into consideration any applicable translations + * @param qinfo BlastQueryInfo structure [in] + * @param program CORE program type [in] + * @param query_index number of the query + * (query_index < BlastQueryInfo::num_queries) [in] + * @return the length of the query sequence requested + */ +Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo* qinfo, + EBlastProgramType program, + Int4 query_index); + /** Create auxiliary query structures with all data corresponding * to a single query sequence within a concatenated set. Allocates the * structures if the pointers are NULL on input; otherwise only changes the @@ -400,6 +411,13 @@ Blast_SetUpSubjectTranslation(BLAST_SequenceBlk* subject_blk, Int4** frame_offsets, Boolean* partial_translation); +/** Get the number of contexts for a given program. This corresponds to the + * number of translation frames or strands whenever applicable. 
+ * @return 0 on unsupported program, non-zero otherwise + */ +NCBI_XBLAST_EXPORT +unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program); + #ifdef __cplusplus } #endif diff --git a/algo/blast/core/gapinfo.c b/algo/blast/core/gapinfo.c index fa5aba13..d7e2b7e6 100644 --- a/algo/blast/core/gapinfo.c +++ b/algo/blast/core/gapinfo.c @@ -1,4 +1,4 @@ -/* $Id: gapinfo.c,v 1.16 2005/04/27 19:55:13 dondosha Exp $ +/* $Id: gapinfo.c,v 1.17 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: gapinfo.c,v 1.16 2005/04/27 19:55:13 dondosha Exp $"; + "$Id: gapinfo.c,v 1.17 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/gapinfo.h> diff --git a/algo/blast/core/gapinfo.h b/algo/blast/core/gapinfo.h index 3360c26d..48f3374f 100644 --- a/algo/blast/core/gapinfo.h +++ b/algo/blast/core/gapinfo.h @@ -1,4 +1,4 @@ -/* $Id: gapinfo.h,v 1.20 2005/04/27 19:49:49 dondosha Exp $ +/* $Id: gapinfo.h,v 1.21 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/greedy_align.c b/algo/blast/core/greedy_align.c index cd53764b..1a2bfec1 100644 --- a/algo/blast/core/greedy_align.c +++ b/algo/blast/core/greedy_align.c @@ -1,4 +1,4 @@ -/* $Id: greedy_align.c,v 1.35 2005/04/07 20:09:28 madden Exp $ +/* $Id: greedy_align.c,v 1.38 2005/11/18 14:43:58 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -36,7 +36,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: greedy_align.c,v 1.35 2005/04/07 20:09:28 madden Exp $"; + "$Id: greedy_align.c,v 1.38 2005/11/18 14:43:58 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/greedy_align.h> @@ -308,7 +308,8 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1, very similar, the average running time will be sig- nificantly better than this */ - max_dist = len2 / GREEDY_MAX_COST_FRACTION + 1; + max_dist = MIN(GREEDY_MAX_COST, + len2 / GREEDY_MAX_COST_FRACTION + 1); /* the main loop assumes that the index of all diagonals is biased to lie in the middle of allocated bookkeeping @@ -382,7 +383,7 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1, if (index == len1 || index == len2) { if (edit_block != NULL) GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index); - return best_dist; + return 0; /* This function returns number of differences, here it is zero. */ } /* set up the memory pool */ @@ -549,7 +550,9 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1, } /* clamp the bounds on diagonals to avoid walking off - either sequence */ + either sequence. 
Because the bounds increase by at + most one for each distance, diag_lower and diag_upper + can each be of size at most max_diags+2 */ if (seq2_index == len2) { diag_lower = k + 1; @@ -586,7 +589,10 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1, if (diag_lower > diag_upper) break; - /* set up for the next distance to examine */ + /* set up for the next distance to examine. Because the + bounds increase by at most one for each distance, + diag_lower and diag_upper can each be of size at + most max_diags+2 */ if (!end2_reached) diag_lower--; @@ -776,7 +782,8 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1, /* set the number of distinct distances the algorithm will examine in the search for an optimal alignment */ - max_dist = len2 / GREEDY_MAX_COST_FRACTION + 1; + max_dist = MIN(GREEDY_MAX_COST, + len2 / GREEDY_MAX_COST_FRACTION + 1); scaled_max_dist = max_dist * gap_extend; /* the main loop assumes that the index of all diagonals is @@ -853,7 +860,7 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1, if (index == len1 || index == len2) { if (edit_block != NULL) GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index); - return best_dist; + return (index*match_score); } /* set up the memory pool */ @@ -1147,7 +1154,9 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1, /* compute the range of diagonals to test for the next value of d. These must be conservative, in that any - diagonal that could possibly contribute must be allowed */ + diagonal that could possibly contribute must be allowed. 
+ curr_diag_lower and curr_diag_upper can each be of size at + most scaled_max_diags+2 */ d++; curr_diag_lower = MIN(diag_lower[d - gap_open_extend], diff --git a/algo/blast/core/greedy_align.h b/algo/blast/core/greedy_align.h index bafb5e7a..7ed78bbc 100644 --- a/algo/blast/core/greedy_align.h +++ b/algo/blast/core/greedy_align.h @@ -1,4 +1,4 @@ -/* $Id: greedy_align.h,v 1.21 2005/04/07 20:09:54 madden Exp $ +/* $Id: greedy_align.h,v 1.23 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -46,6 +46,9 @@ extern "C" { this gives a worst case bound on the number of loop iterations */ #define GREEDY_MAX_COST_FRACTION 2 +/** The largest distance to be examined for an optimal alignment */ +#define GREEDY_MAX_COST 1000 + /* ----- pool allocator ----- */ /** Bookkeeping structure for greedy alignment. 
When aligning diff --git a/algo/blast/core/hspstream_collector.c b/algo/blast/core/hspstream_collector.c index ec1d1c23..2565f086 100644 --- a/algo/blast/core/hspstream_collector.c +++ b/algo/blast/core/hspstream_collector.c @@ -1,4 +1,4 @@ -/* $Id: hspstream_collector.c,v 1.14 2005/05/16 12:21:40 madden Exp $ +/* $Id: hspstream_collector.c,v 1.15 2005/09/30 12:17:10 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: hspstream_collector.c,v 1.14 2005/05/16 12:21:40 madden Exp $"; + "$Id: hspstream_collector.c,v 1.15 2005/09/30 12:17:10 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ @@ -154,6 +154,7 @@ static int s_BlastHSPListCollectorWrite(BlastHSPStream* hsp_stream, BlastHSPList** hsp_list) { + Int2 status = 0; BlastHSPListCollectorData* stream_data = (BlastHSPListCollectorData*) GetData(hsp_stream); @@ -166,18 +167,24 @@ s_BlastHSPListCollectorWrite(BlastHSPStream* hsp_stream, * every read after a write. */ if (stream_data->results_sorted) { + MT_LOCK_Do(stream_data->x_lock, eMT_Unlock); return kBlastHSPStream_Error; } /* For RPS BLAST saving procedure is different, because HSPs from different subjects are bundled in one HSP list */ if (Blast_ProgramIsRpsBlast(stream_data->program)) { - Blast_HSPResultsSaveRPSHSPList(stream_data->program, + status = Blast_HSPResultsSaveRPSHSPList(stream_data->program, stream_data->results, *hsp_list, stream_data->blasthit_params); } else { - Blast_HSPResultsSaveHSPList(stream_data->program, stream_data->results, + status = Blast_HSPResultsSaveHSPList(stream_data->program, stream_data->results, *hsp_list, stream_data->blasthit_params); } + if (status != 0) + { + MT_LOCK_Do(stream_data->x_lock, eMT_Unlock); + return kBlastHSPStream_Error; + } /* Results structure is no longer sorted, even if it was before. 
The following assignment is only necessary if the logic to prohibit writing after the first read is removed. */ diff --git a/algo/blast/core/link_hsps.c b/algo/blast/core/link_hsps.c index b90ee1cf..c2c4f56a 100644 --- a/algo/blast/core/link_hsps.c +++ b/algo/blast/core/link_hsps.c @@ -1,5 +1,5 @@ -/* $Id: link_hsps.c,v 1.56 2005/06/08 17:27:53 madden Exp $ +/* $Id: link_hsps.c,v 1.58 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -7,7 +7,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: link_hsps.c,v 1.56 2005/06/08 17:27:53 madden Exp $"; + "$Id: link_hsps.c,v 1.58 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/link_hsps.h> @@ -1805,6 +1805,7 @@ BLAST_LinkHsps(EBlastProgramType program_number, BlastHSPList* hsp_list, /* The HSP's may be in a different order than they were before, but hsp contains the first one. */ } else { + Blast_HSPListAdjustOddBlastnScores(hsp_list, gapped_calculation, sbp); /* Calculate individual HSP e-values first - they'll be needed to compare with sum e-values. Use decay rate to compensate for multiple tests. 
*/ diff --git a/algo/blast/core/link_hsps.h b/algo/blast/core/link_hsps.h index dba93fda..55a60f86 100644 --- a/algo/blast/core/link_hsps.h +++ b/algo/blast/core/link_hsps.h @@ -1,4 +1,4 @@ -/* $Id: link_hsps.h,v 1.14 2004/09/23 14:57:46 dondosha Exp $ +/* $Id: link_hsps.h,v 1.15 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/lookup_util.c b/algo/blast/core/lookup_util.c index 9d135cd9..1c8b15b1 100644 --- a/algo/blast/core/lookup_util.c +++ b/algo/blast/core/lookup_util.c @@ -1,4 +1,4 @@ -/* $Id: lookup_util.c,v 1.11 2005/03/01 14:00:56 coulouri Exp $ +/* $Id: lookup_util.c,v 1.12 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -30,7 +30,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: lookup_util.c,v 1.11 2005/03/01 14:00:56 coulouri Exp $"; + "$Id: lookup_util.c,v 1.12 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/lookup_util.h> diff --git a/algo/blast/core/lookup_util.h b/algo/blast/core/lookup_util.h index 060d8af7..988dc42b 100644 --- a/algo/blast/core/lookup_util.h +++ b/algo/blast/core/lookup_util.h @@ -1,4 +1,4 @@ -/* $Id: lookup_util.h,v 1.10 2005/03/01 13:59:51 coulouri Exp $ +/* $Id: lookup_util.h,v 1.11 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/lookup_wrap.c b/algo/blast/core/lookup_wrap.c index 27ce7e2e..7c7dfe51 100644 --- a/algo/blast/core/lookup_wrap.c +++ b/algo/blast/core/lookup_wrap.c @@ -1,4 +1,4 @@ -/* $Id: lookup_wrap.c,v 1.16 2005/07/29 15:21:15 camacho Exp $ +/* $Id: lookup_wrap.c,v 1.17 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -37,7 +37,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: lookup_wrap.c,v 1.16 2005/07/29 15:21:15 camacho Exp $"; + "$Id: lookup_wrap.c,v 1.17 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/lookup_wrap.h> diff --git a/algo/blast/core/lookup_wrap.h b/algo/blast/core/lookup_wrap.h index 0c9f9e67..d7dc376f 100644 --- a/algo/blast/core/lookup_wrap.h +++ b/algo/blast/core/lookup_wrap.h @@ -1,4 +1,4 @@ -/* $Id: lookup_wrap.h,v 1.11 2005/07/29 15:21:08 camacho Exp $ +/* $Id: lookup_wrap.h,v 1.12 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
diff --git a/algo/blast/core/mb_lookup.c b/algo/blast/core/mb_lookup.c index 739d6184..6d372e93 100644 --- a/algo/blast/core/mb_lookup.c +++ b/algo/blast/core/mb_lookup.c @@ -1,4 +1,4 @@ -/* $Id: mb_lookup.c,v 1.56 2005/08/17 16:21:31 dondosha Exp $ +/* $Id: mb_lookup.c,v 1.57 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: mb_lookup.c,v 1.56 2005/08/17 16:21:31 dondosha Exp $"; + "$Id: mb_lookup.c,v 1.57 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_options.h> diff --git a/algo/blast/core/mb_lookup.h b/algo/blast/core/mb_lookup.h index 3478182e..c87d2801 100644 --- a/algo/blast/core/mb_lookup.h +++ b/algo/blast/core/mb_lookup.h @@ -1,4 +1,4 @@ -/* $Id: mb_lookup.h,v 1.25 2005/06/06 15:37:02 papadopo Exp $ +/* $Id: mb_lookup.h,v 1.26 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. 
This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/ncbi_std.c b/algo/blast/core/ncbi_std.c index eddf7c97..581f1970 100644 --- a/algo/blast/core/ncbi_std.c +++ b/algo/blast/core/ncbi_std.c @@ -1,4 +1,4 @@ -/* $Id: ncbi_std.c,v 1.16 2005/02/24 15:39:34 madden Exp $ +/* $Id: ncbi_std.c,v 1.17 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -31,7 +31,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: ncbi_std.c,v 1.16 2005/02/24 15:39:34 madden Exp $"; + "$Id: ncbi_std.c,v 1.17 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_def.h> /* for sfree() macro */ diff --git a/algo/blast/core/ncbi_std.h b/algo/blast/core/ncbi_std.h index c2aff8ff..e7e022a6 100644 --- a/algo/blast/core/ncbi_std.h +++ b/algo/blast/core/ncbi_std.h @@ -1,4 +1,4 @@ -/* $Id: ncbi_std.h,v 1.34 2004/12/14 17:11:24 ucko Exp $ +/* $Id: ncbi_std.h,v 1.37 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -23,7 +23,6 @@ * * =========================================================================== * - * Author: Ilya Dondoshansky * */ @@ -127,26 +126,55 @@ typedef Uint1 Boolean; #ifndef _NCBISTD_ /* if we're not in the C toolkit ... */ +#ifndef UINT4_MAX /** largest number represented by unsigned int. */ #define UINT4_MAX 4294967295U +#endif + +#ifndef INT4_MAX /** largest nubmer represented by signed int */ #define INT4_MAX 2147483647 +#endif + +#ifndef INT4_MIN /** Smallest (most negative) number represented by signed int */ #define INT4_MIN (-2147483647-1) +#endif + +#ifndef NCBIMATH_LN2 /** natural log of 2. */ #define NCBIMATH_LN2 0.69314718055994530941723212145818 +#endif + +#ifndef INT2_MAX /** largest number represented by signed (two byte) short */ #define INT2_MAX 32767 +#endif + +#ifndef INT2_MIN /** smallest (most negative) number represented by signed (two byte) short */ #define INT2_MIN (-32768) +#endif + +#ifndef INT1_MAX +/** largest number represented by signed short (one byte) */ +#define INT1_MAX 127 +#endif + +#ifndef INT1_MIN +/** smallest (most negative) number represented by signed short (one byte) */ +#define INT1_MIN (-128) +#endif #ifndef DIM /** dimension of an array. */ #define DIM(A) (sizeof(A)/sizeof((A)[0])) #endif +#ifndef NULLB /** terminating byte of a char* string. 
*/ #define NULLB '\0' +#endif #endif /* _NCBISTD_ */ diff --git a/algo/blast/core/pattern.c b/algo/blast/core/pattern.c index b970d61e..44665e85 100644 --- a/algo/blast/core/pattern.c +++ b/algo/blast/core/pattern.c @@ -1,4 +1,4 @@ -/* $Id: pattern.c,v 1.17 2005/07/18 19:38:33 bealer Exp $ +/* $Id: pattern.c,v 1.18 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -56,7 +56,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: pattern.c,v 1.17 2005/07/18 19:38:33 bealer Exp $"; + "$Id: pattern.c,v 1.18 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/pattern.h> diff --git a/algo/blast/core/pattern.h b/algo/blast/core/pattern.h index 864cadf7..a7eb8cae 100644 --- a/algo/blast/core/pattern.h +++ b/algo/blast/core/pattern.h @@ -1,4 +1,4 @@ -/* $Id: pattern.h,v 1.7 2005/07/18 19:38:33 bealer Exp $ +/* $Id: pattern.h,v 1.8 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. 
This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/pattern_priv.h b/algo/blast/core/pattern_priv.h index 541bf734..1a79bc49 100644 --- a/algo/blast/core/pattern_priv.h +++ b/algo/blast/core/pattern_priv.h @@ -1,4 +1,4 @@ -/* $Id: pattern_priv.h,v 1.3 2005/07/18 19:38:33 bealer Exp $ +/* $Id: pattern_priv.h,v 1.4 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/phi_extend.c b/algo/blast/core/phi_extend.c index 149c77b9..de73206d 100644 --- a/algo/blast/core/phi_extend.c +++ b/algo/blast/core/phi_extend.c @@ -1,4 +1,4 @@ -/* $Id: phi_extend.c,v 1.12 2005/04/27 19:56:13 dondosha Exp $ +/* $Id: phi_extend.c,v 1.13 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. 
The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -33,7 +33,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: phi_extend.c,v 1.12 2005/04/27 19:56:13 dondosha Exp $"; + "$Id: phi_extend.c,v 1.13 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_def.h> diff --git a/algo/blast/core/phi_extend.h b/algo/blast/core/phi_extend.h index 74d39ca6..33bbcfd0 100644 --- a/algo/blast/core/phi_extend.h +++ b/algo/blast/core/phi_extend.h @@ -1,4 +1,4 @@ -/* $Id: phi_extend.h,v 1.10 2005/05/18 15:27:19 papadopo Exp $ +/* $Id: phi_extend.h,v 1.11 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. diff --git a/algo/blast/core/phi_gapalign.c b/algo/blast/core/phi_gapalign.c index 02d5668e..7750f73e 100644 --- a/algo/blast/core/phi_gapalign.c +++ b/algo/blast/core/phi_gapalign.c @@ -1,4 +1,4 @@ -/* $Id: phi_gapalign.c,v 1.7 2005/08/17 16:21:31 dondosha Exp $ +/* $Id: phi_gapalign.c,v 1.9 2005/11/30 18:25:32 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. @@ -58,13 +58,14 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: phi_gapalign.c,v 1.7 2005/08/17 16:21:31 dondosha Exp $"; + "$Id: phi_gapalign.c,v 1.9 2005/11/30 18:25:32 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_options.h> #include <algo/blast/core/blast_def.h> #include <algo/blast/core/phi_gapalign.h> #include <algo/blast/core/blast_encoding.h> +#include <algo/blast/core/blast_gapalign.h> #include "blast_gapalign_priv.h" #include "pattern_priv.h" @@ -101,7 +102,7 @@ s_Align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2, Int4 lowDiag, inclusive*/ Int4 diagIndex; /*loop index over diagonals*/ Int4 leftd, rightd; /* diagonal indices for CC, DD, CP and DP */ - BlastGapSmallDP* score_array; /*array for dynamic program information*/ + BlastGapDP* score_array; /*array for dynamic program information*/ Int4 curd; /* current index for CC, DD CP and DP */ Int4 i; /*loop index*/ Int4 index1; /*index on seq1*/ @@ -109,7 +110,7 @@ s_Align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2, Int4 lowDiag, Int4 temp_indel_score = 0; /*placeholder for an indel score */ Int4 tempHorScore; /*dual of temp_indel_score for the case where a horizontal edge (insertion) is the last step*/ - BlastGapSmallDP* score_row = NULL; /*points to a row of CD*/ + BlastGapDP* score_row = NULL; /*points to a row of CD*/ Int4 stateDecoder; /*used to decode the edge information in a state*/ Int4 initialScore; /*score to initialize dynamic program entries*/ Int4 *matrixRow; /*row of score matrix*/ @@ -129,7 +130,7 @@ s_Align(Uint1 * seq1, Uint1 * 
seq2, Int4 end1, Int4 end2, Int4 lowDiag, band = highDiag-lowDiag+1; /* Allocate array of scores. */ - score_array = (BlastGapSmallDP*) calloc(band+2, sizeof(BlastGapSmallDP)); + score_array = (BlastGapDP*) calloc(band+2, sizeof(BlastGapDP)); state = (Int1 **) malloc(sizeof(Int1 *)*(end1+1)); state[0] = (Int1 *) malloc((end1+1)*(band+2)); diff --git a/algo/blast/core/phi_lookup.c b/algo/blast/core/phi_lookup.c index 818807ce..af40b90f 100644 --- a/algo/blast/core/phi_lookup.c +++ b/algo/blast/core/phi_lookup.c @@ -1,4 +1,4 @@ -/* $Id: phi_lookup.c,v 1.29 2005/08/23 20:26:58 camacho Exp $ +/* $Id: phi_lookup.c,v 1.30 2005/11/16 14:27:04 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
@@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: phi_lookup.c,v 1.29 2005/08/23 20:26:58 camacho Exp $"; + "$Id: phi_lookup.c,v 1.30 2005/11/16 14:27:04 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/core/blast_def.h> diff --git a/algo/blast/core/phi_lookup.h b/algo/blast/core/phi_lookup.h index 112f2306..58ccf747 100644 --- a/algo/blast/core/phi_lookup.h +++ b/algo/blast/core/phi_lookup.h @@ -1,4 +1,4 @@ -/* $Id: phi_lookup.h,v 1.9 2005/04/27 19:50:47 dondosha Exp $ +/* $Id: phi_lookup.h,v 1.10 2005/11/16 14:31:37 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -6,7 +6,7 @@ * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of - * the author's offical duties as a United States Government employee and + * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. |