summaryrefslogtreecommitdiff
path: root/algo
diff options
context:
space:
mode:
authorAaron M. Ucko <ucko@debian.org>2005-12-08 22:58:31 +0000
committerAaron M. Ucko <ucko@debian.org>2005-12-08 22:58:31 +0000
commit4b1edc60532e16ec7d0255e1c3552c2e6a33737e (patch)
treeacef5e2808d7dbc8a9c41da8ef71b1436f51718e /algo
parent23f1d44b259776e399f7fe0fe883c7e9445e192b (diff)
Load /tmp/.../ncbi-tools6-6.1.20051206 into
branches/upstream/current.
Diffstat (limited to 'algo')
-rw-r--r--algo/blast/api/blast_api.c33
-rw-r--r--algo/blast/api/blast_api.h12
-rw-r--r--algo/blast/api/blast_format.c6
-rw-r--r--algo/blast/api/blast_options_api.c106
-rw-r--r--algo/blast/api/blast_options_api.h47
-rw-r--r--algo/blast/api/blast_seq.c135
-rw-r--r--algo/blast/api/blast_seq.h6
-rw-r--r--algo/blast/api/blast_tabular.c15
-rw-r--r--algo/blast/api/dust_filter.c5
-rw-r--r--algo/blast/api/repeats_filter.c25
-rw-r--r--algo/blast/api/twoseq_api.c21
-rw-r--r--algo/blast/composition_adjustment/compo_heap.c510
-rw-r--r--algo/blast/composition_adjustment/compo_heap.h124
-rw-r--r--algo/blast/composition_adjustment/compo_mode_condition.c240
-rw-r--r--algo/blast/composition_adjustment/compo_mode_condition.h117
-rw-r--r--algo/blast/composition_adjustment/composition_adjustment.c1376
-rw-r--r--algo/blast/composition_adjustment/composition_adjustment.h168
-rw-r--r--algo/blast/composition_adjustment/composition_constants.h60
-rw-r--r--algo/blast/composition_adjustment/matrix_frequency_data.c230
-rw-r--r--algo/blast/composition_adjustment/matrix_frequency_data.h54
-rw-r--r--algo/blast/composition_adjustment/nlm_linear_algebra.c228
-rw-r--r--algo/blast/composition_adjustment/nlm_linear_algebra.h121
-rw-r--r--algo/blast/composition_adjustment/optimize_target_freq.c536
-rw-r--r--algo/blast/composition_adjustment/optimize_target_freq.h101
-rw-r--r--algo/blast/composition_adjustment/redo_alignment.c1367
-rw-r--r--algo/blast/composition_adjustment/redo_alignment.h333
-rw-r--r--algo/blast/composition_adjustment/smith_waterman.c715
-rw-r--r--algo/blast/composition_adjustment/smith_waterman.h103
-rw-r--r--algo/blast/core/aa_ungapped.c6
-rw-r--r--algo/blast/core/aa_ungapped.h4
-rw-r--r--algo/blast/core/blast_def.h42
-rw-r--r--algo/blast/core/blast_diagnostics.c6
-rw-r--r--algo/blast/core/blast_diagnostics.h5
-rw-r--r--algo/blast/core/blast_dust.c11
-rw-r--r--algo/blast/core/blast_dust.h4
-rw-r--r--algo/blast/core/blast_engine.c37
-rw-r--r--algo/blast/core/blast_engine.h34
-rw-r--r--algo/blast/core/blast_extend.c109
-rw-r--r--algo/blast/core/blast_filter.c668
-rw-r--r--algo/blast/core/blast_filter.h100
-rw-r--r--algo/blast/core/blast_gapalign.c675
-rw-r--r--algo/blast/core/blast_gapalign.h13
-rw-r--r--algo/blast/core/blast_gapalign_priv.h25
-rw-r--r--algo/blast/core/blast_hits.c73
-rw-r--r--algo/blast/core/blast_hits.h8
-rw-r--r--algo/blast/core/blast_inline.h4
-rw-r--r--algo/blast/core/blast_itree.c6
-rw-r--r--algo/blast/core/blast_itree.h4
-rw-r--r--algo/blast/core/blast_kappa.c3998
-rw-r--r--algo/blast/core/blast_kappa.h7
-rw-r--r--algo/blast/core/blast_lookup.c6
-rw-r--r--algo/blast/core/blast_lookup.h4
-rw-r--r--algo/blast/core/blast_message.c9
-rw-r--r--algo/blast/core/blast_options.c99
-rw-r--r--algo/blast/core/blast_options.h33
-rw-r--r--algo/blast/core/blast_parameters.c56
-rw-r--r--algo/blast/core/blast_parameters.h4
-rw-r--r--algo/blast/core/blast_program.c4
-rw-r--r--algo/blast/core/blast_program.h4
-rw-r--r--algo/blast/core/blast_psi.c4
-rw-r--r--algo/blast/core/blast_psi.h4
-rw-r--r--algo/blast/core/blast_psi_priv.c49
-rw-r--r--algo/blast/core/blast_rps.h4
-rw-r--r--algo/blast/core/blast_seg.h4
-rw-r--r--algo/blast/core/blast_setup.c72
-rw-r--r--algo/blast/core/blast_setup.h13
-rw-r--r--algo/blast/core/blast_stat.c612
-rw-r--r--algo/blast/core/blast_stat.h30
-rw-r--r--algo/blast/core/blast_traceback.c18
-rw-r--r--algo/blast/core/blast_traceback.h5
-rw-r--r--algo/blast/core/blast_util.c105
-rw-r--r--algo/blast/core/blast_util.h26
-rw-r--r--algo/blast/core/gapinfo.c6
-rw-r--r--algo/blast/core/gapinfo.h4
-rw-r--r--algo/blast/core/greedy_align.c27
-rw-r--r--algo/blast/core/greedy_align.h7
-rw-r--r--algo/blast/core/hspstream_collector.c15
-rw-r--r--algo/blast/core/link_hsps.c7
-rw-r--r--algo/blast/core/link_hsps.h4
-rw-r--r--algo/blast/core/lookup_util.c6
-rw-r--r--algo/blast/core/lookup_util.h4
-rw-r--r--algo/blast/core/lookup_wrap.c6
-rw-r--r--algo/blast/core/lookup_wrap.h4
-rw-r--r--algo/blast/core/mb_lookup.c6
-rw-r--r--algo/blast/core/mb_lookup.h4
-rw-r--r--algo/blast/core/ncbi_std.c6
-rw-r--r--algo/blast/core/ncbi_std.h34
-rw-r--r--algo/blast/core/pattern.c6
-rw-r--r--algo/blast/core/pattern.h4
-rw-r--r--algo/blast/core/pattern_priv.h4
-rw-r--r--algo/blast/core/phi_extend.c6
-rw-r--r--algo/blast/core/phi_extend.h4
-rw-r--r--algo/blast/core/phi_gapalign.c13
-rw-r--r--algo/blast/core/phi_lookup.c6
-rw-r--r--algo/blast/core/phi_lookup.h4
95 files changed, 9022 insertions, 4963 deletions
diff --git a/algo/blast/api/blast_api.c b/algo/blast/api/blast_api.c
index d2792239..1dc5e16e 100644
--- a/algo/blast/api/blast_api.c
+++ b/algo/blast/api/blast_api.c
@@ -1,4 +1,4 @@
-/* $Id: blast_api.c,v 1.18 2005/08/22 19:24:02 madden Exp $
+/* $Id: blast_api.c,v 1.20 2005/09/19 15:40:03 camacho Exp $
***************************************************************************
* *
* COPYRIGHT NOTICE *
@@ -429,7 +429,6 @@ Blast_RunSearch(SeqLoc* query_seqloc,
BlastTabularFormatData* tf_data,
BlastHSPResults **results,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns)
{
Int2 status = 0;
@@ -439,6 +438,7 @@ Blast_RunSearch(SeqLoc* query_seqloc,
BlastSeqLoc* lookup_segments = NULL;
BlastScoreBlk* sbp = NULL;
LookupTableWrap* lookup_wrap = NULL;
+ BlastMaskLoc* mask_loc = NULL;
BlastHSPStream* hsp_stream = NULL;
const EBlastProgramType kProgram = options->program;
const Boolean kRpsBlast =
@@ -447,7 +447,6 @@ Blast_RunSearch(SeqLoc* query_seqloc,
BlastRPSInfo* rps_info = NULL;
Nlm_MemMapPtr rps_mmap = NULL;
Nlm_MemMapPtr rps_pssm_mmap = NULL;
- BlastMaskInformation mask_info;
const QuerySetUpOptions* query_options = options->query_options;
const LookupTableOptions* lookup_options = options->lookup_options;
const BlastScoringOptions* score_options = options->score_options;
@@ -468,12 +467,9 @@ Blast_RunSearch(SeqLoc* query_seqloc,
if (options->program == eBlastTypeBlastn)
{
SeqLoc* dust_mask = NULL; /* Dust mask locations */
-
Blast_FindDustSeqLoc(query_seqloc, options, &dust_mask);
-
/* Combine dust mask with lower case mask */
- if (dust_mask)
- masking_locs = ValNodeLink(&masking_locs, dust_mask);
+ ValNodeLink(&masking_locs, dust_mask);
}
if (kRpsBlast) {
@@ -495,22 +491,17 @@ Blast_RunSearch(SeqLoc* query_seqloc,
}
status =
- BLAST_MainSetUp(kProgram, query_options, score_options,
- hit_options, query, query_info, scale_factor,
- &lookup_segments, &mask_info, &sbp,
- &extra_returns->error);
-
- if (mask_at_hash)
- *mask_at_hash = mask_info.mask_at_hash;
+ BLAST_MainSetUp(kProgram, query_options, score_options, query,
+ query_info, scale_factor, &lookup_segments, &mask_loc,
+ &sbp, &extra_returns->error);
if (filter_out) {
*filter_out =
- BlastMaskLocToSeqLoc(kProgram, mask_info.filter_slp,
- query_seqloc);
+ BlastMaskLocToSeqLoc(kProgram, mask_loc, query_seqloc);
}
/* Mask locations in BlastMaskLoc form are no longer needed. */
- BlastMaskLocFree(mask_info.filter_slp);
+ BlastMaskLocFree(mask_loc);
if (status)
return status;
@@ -559,7 +550,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name,
BlastTabularFormatData* tf_data,
SeqAlign **seqalign_out,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns)
{
BlastSeqSrc *seq_src = NULL;
@@ -597,7 +587,7 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name,
status =
Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data,
- &results, filter_out, mask_at_hash, extra_returns);
+ &results, filter_out, extra_returns);
/* The ReadDBFILE structure will not be destroyed here, because the
initialising function used readdb_attach */
@@ -709,7 +699,7 @@ PHIBlastRunSearch(SeqLoc* query_seqloc, char* db_name, SeqLoc* masking_locs,
PHI BLAST, so pass NULL in corresponding arguments. */
status =
Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, NULL,
- &results, filter_out, NULL, extra_returns);
+ &results, filter_out, extra_returns);
/* The ReadDBFILE structure will not be destroyed here, because the
initialising function used readdb_attach */
@@ -738,7 +728,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
BlastTabularFormatData* tf_data,
SeqAlign **seqalign_out,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns)
{
BlastSeqSrc *seq_src = NULL;
@@ -766,7 +755,7 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
status =
Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data,
- &results, filter_out, mask_at_hash, extra_returns);
+ &results, filter_out, extra_returns);
/* The ReadDBFILE structure will not be destroyed here, because the
initialising function used readdb_attach */
diff --git a/algo/blast/api/blast_api.h b/algo/blast/api/blast_api.h
index 8e76ee8c..47d9b9d6 100644
--- a/algo/blast/api/blast_api.h
+++ b/algo/blast/api/blast_api.h
@@ -1,4 +1,4 @@
-/* $Id: blast_api.h,v 1.4 2005/04/27 19:59:26 dondosha Exp $
+/* $Id: blast_api.h,v 1.5 2005/08/29 14:44:19 camacho Exp $
***************************************************************************
* *
* COPYRIGHT NOTICE *
@@ -61,8 +61,6 @@ extern "C" {
* @param tf_data Structure to use for on-the-fly tabular formatting [in]
* @param seqalign_out All results in Seq-align form. [out]
* @param filter_out Filtering locations [out]
- * @param mask_at_hash Was filtering performed only for lookup table, but not
- * for extension? [out]
* @param extra_returns Additional information about the search [out]
*/
Int2
@@ -72,7 +70,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name,
BlastTabularFormatData* tf_data,
SeqAlign **seqalign_out,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns);
/** Compares a list of SeqLoc's against another list of SeqLoc's,
@@ -84,8 +81,6 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name,
* @param tf_data Structure to use for on-the-fly tabular formatting [in]
* @param seqalign_out All results in Seq-align form. [out]
* @param filter_out Filtering locations [out]
- * @param mask_at_hash Was filtering performed only for lookup table, but not
- * for extension? [out]
* @param extra_returns Additional information about the search [out]
*/
Int2
@@ -96,7 +91,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
BlastTabularFormatData* tf_data,
SeqAlign **seqalign_out,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns);
/** Compare a list of query SeqLoc's against a source of subject sequences.
@@ -108,7 +102,6 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
* @param results Search results [out]
* @param filter_out Query locations that were masked (filtered) during the
* search [out]
- * @param mask_at_hash Was masking performed only for in the lookup table? [out]
* @param extra_returns Additional search statistits [out]
* @return 0 on success, -1 on failure.
*/
@@ -120,7 +113,6 @@ Blast_RunSearch(SeqLoc* query_seqloc,
BlastTabularFormatData* tf_data,
BlastHSPResults **results,
SeqLoc** filter_out,
- Boolean* mask_at_hash,
Blast_SummaryReturn* extra_returns);
/** Run a PHI BLAST search for a query SeqLoc against a database. Return results
@@ -135,8 +127,6 @@ Blast_RunSearch(SeqLoc* query_seqloc,
* ValNode data points to a Seq-align. [out]
* @param filter_out Query locations that were masked (filtered) during the
* search. [out]
- * NB: masking at hash is not applicable to PHI BLAST,
- * so there is no mask_at_hash output argument.
* @param extra_returns Additional search statistits [out]
* @return 0 on success, -1 on failure.
*/
diff --git a/algo/blast/api/blast_format.c b/algo/blast/api/blast_format.c
index b38f2f9c..02c32543 100644
--- a/algo/blast/api/blast_format.c
+++ b/algo/blast/api/blast_format.c
@@ -1,4 +1,4 @@
-/* $Id: blast_format.c,v 1.95 2005/08/08 15:50:20 dondosha Exp $
+/* $Id: blast_format.c,v 1.96 2005/11/22 13:31:05 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -31,7 +31,7 @@
* Formatting of BLAST results (SeqAlign)
*/
-static char const rcsid[] = "$Id: blast_format.c,v 1.95 2005/08/08 15:50:20 dondosha Exp $";
+static char const rcsid[] = "$Id: blast_format.c,v 1.96 2005/11/22 13:31:05 madden Exp $";
#include <algo/blast/api/blast_format.h>
#include <algo/blast/api/blast_seq.h>
@@ -1182,7 +1182,7 @@ Blast_SeqIdGetDefLine(SeqId* sip, char** buffer_ptr, Boolean ncbi_gi,
Boolean accession_only)
{
char* seqid_buffer = NULL;
- Int4 gi;
+ Int4 gi = 0;
Boolean numeric_id_type = FALSE;
*buffer_ptr = NULL;
diff --git a/algo/blast/api/blast_options_api.c b/algo/blast/api/blast_options_api.c
index 715f152a..7eb9a47b 100644
--- a/algo/blast/api/blast_options_api.c
+++ b/algo/blast/api/blast_options_api.c
@@ -1,4 +1,4 @@
-/* $Id: blast_options_api.c,v 1.9 2005/08/08 15:48:22 dondosha Exp $
+/* $Id: blast_options_api.c,v 1.15 2005/10/31 14:14:29 madden Exp $
***************************************************************************
* *
* COPYRIGHT NOTICE *
@@ -145,6 +145,52 @@ Int2 SBlastOptionsSetWordSize(SBlastOptions* options, Int4 word_size)
return -1;
}
+Int2 SBlastOptionsSetThreshold(SBlastOptions* options, Int4 threshold)
+{
+ /* Stores threshold in options->lookup_options->threshold. Returns -1 if a required options structure is missing. */
+ if (!options || !options->lookup_options || !options->score_options)
+ return -1;
+ /* A negative threshold is rejected with -2. */
+ if (threshold < 0)
+ return -2;
+ /* Untranslated nucleotide queries: report success and leave the options unchanged. */
+ if (Blast_QueryIsNucleotide(options->program) == TRUE && Blast_QueryIsTranslated(options->program) == FALSE)
+ return 0;
+ /* Zero requests the value suggested for this program and matrix; a failed lookup returns its status. */
+ if (threshold == 0)
+ {
+ Int2 status=0;
+ if ((status=BLAST_GetSuggestedThreshold(options->program, options->score_options->matrix, &threshold)) != 0)
+ return status;
+ }
+ /* threshold now holds either the caller's value or the suggested one. */
+ options->lookup_options->threshold = threshold;
+
+ return 0;
+}
+
+Int2 SBlastOptionsSetWindowSize(SBlastOptions* options, Int4 window_size)
+{
+ /* Stores window_size in options->word_options->window_size. Returns -1 if a required options structure is missing. */
+ if (!options || !options->score_options || !options->word_options)
+ return -1;
+ /* A negative window size is rejected with -2. */
+ if (window_size < 0)
+ return -2;
+ /* Untranslated nucleotide queries: report success and leave the options unchanged. */
+ if (Blast_QueryIsNucleotide(options->program) == TRUE && Blast_QueryIsTranslated(options->program) == FALSE)
+ return 0;
+ /* Zero requests the value suggested for this program and matrix; a failed lookup returns its status. */
+ if (window_size == 0)
+ {
+ Int2 status=0;
+ if ((status=BLAST_GetSuggestedWindowSize(options->program, options->score_options->matrix, &window_size)) != 0)
+ return status;
+ }
+ options->word_options->window_size = window_size;
+ return 0; /* explicit success status: without it the Int2 function fell off its end */
+}
+
Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length,
Int4 template_type)
{
@@ -167,9 +213,59 @@ Int2 SBlastOptionsSetMatrixAndGapCosts(SBlastOptions* options,
if (!matrix_name || !options || !options->score_options)
return -1;
+ /* Matrix-based scoring does not apply to blastn, which uses reward/penalty. */
+ if (options->program == eBlastTypeBlastn)
+ return 0;
+
status =
BlastScoringOptionsSetMatrix(options->score_options, matrix_name);
+ if (status != 0)
+ return status;
+
+ if (gap_open < 0 || gap_extend < 0)
+ {
+ Int4 gap_open_priv = 0;
+ Int4 gap_extend_priv = 0;
+
+ BLAST_GetProteinGapExistenceExtendParams(matrix_name, &gap_open_priv, &gap_extend_priv);
+ if (gap_open < 0)
+ gap_open = gap_open_priv;
+ if (gap_extend < 0)
+ gap_extend = gap_extend_priv;
+ }
+
+ options->score_options->gap_open = gap_open;
+ options->score_options->gap_extend = gap_extend;
+
+ return status;
+}
+
+Int2 SBlastOptionsSetRewardPenaltyAndGapCosts(SBlastOptions* options,
+ Int4 reward, Int4 penalty,
+ Int4 gap_open, Int4 gap_extend)
+{
+ Int2 status = 0;
+
+ if (reward <= 0 || penalty >= 0 || !options || !options->score_options)
+ return -1;
+
+ /* Reward and penalty apply only to blastn. */
+ if (options->program != eBlastTypeBlastn)
+ return 0;
+
+ if (gap_open < 0 || gap_extend < 0)
+ {
+ Int4 gap_open_priv = BLAST_GAP_OPEN_NUCL;
+ Int4 gap_extend_priv = BLAST_GAP_EXTN_NUCL;
+
+ BLAST_GetNucleotideGapExistenceExtendParams(reward, penalty, &gap_open_priv, &gap_extend_priv);
+ if (gap_open < 0)
+ gap_open = gap_open_priv;
+ if (gap_extend < 0)
+ gap_extend = gap_extend_priv;
+ }
+
options->score_options->gap_open = gap_open;
options->score_options->gap_extend = gap_extend;
@@ -216,5 +312,13 @@ Int2 SBlastOptionsSetDbGeneticCode(SBlastOptions* options, Int4 gc)
}
+Boolean SBlastOptionsGetMaskAtHash(const SBlastOptions* options)
+{
+ ASSERT(options && options->query_options &&
+ options->query_options->filtering_options);
+ /* Returns the mask_at_hash flag of the query filtering options; the ASSERT above is the only NULL check before the dereference. */
+ return options->query_options->filtering_options->mask_at_hash;
+}
+
/* @} */
diff --git a/algo/blast/api/blast_options_api.h b/algo/blast/api/blast_options_api.h
index 8121518e..a60b6247 100644
--- a/algo/blast/api/blast_options_api.h
+++ b/algo/blast/api/blast_options_api.h
@@ -1,4 +1,4 @@
-/* $Id: blast_options_api.h,v 1.3 2005/04/06 23:27:53 dondosha Exp $
+/* $Id: blast_options_api.h,v 1.7 2005/10/31 14:14:29 madden Exp $
***************************************************************************
* *
* COPYRIGHT NOTICE *
@@ -105,15 +105,50 @@ Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length,
Int4 template_type);
/** Reset matrix name and gap costs to new values.
+ *
* @param options Options structure to update. [in] [out]
* @param matrix_name New matrix name [in]
- * @param gap_open New gap opening cost [in]
- * @param gap_extend New gap extension cost [in]
+ * @param gap_open New gap existence cost. If negative, the default for the matrix is used. [in]
+ * @param gap_extend New gap extension cost. If negative, the default for the matrix is used. [in]
*/
Int2 SBlastOptionsSetMatrixAndGapCosts(SBlastOptions* options,
const char* matrix_name,
+ Int4 gap_open,
+ Int4 gap_extend);
+
+
+/** Reset reward, penalty and gap costs to new values.
+ * If gap_open or gap_extend is negative, the default value for the given
+ * reward/penalty pair is used in its place.
+ *
+ * @param options Options structure to update. [in] [out]
+ * @param reward match score [in]
+ * @param penalty mismatch score [in]
+ * @param gap_open New gap existence cost. If negative, the default for reward/penalty is used. [in]
+ * @param gap_extend New gap extension cost. If negative, the default for reward/penalty is used. [in]
+ */
+Int2 SBlastOptionsSetRewardPenaltyAndGapCosts(SBlastOptions* options,
+ Int4 reward, Int4 penalty,
Int4 gap_open, Int4 gap_extend);
+/** Set threshold value.
+ * @param options Options structure to update. [in] [out]
+ * @param threshold New value to set, if zero default value for matrix
+ * will be used. [in]
+ * @return zero unless error (e.g., threshold is < zero)
+ */
+Int2 SBlastOptionsSetThreshold(SBlastOptions* options,
+ Int4 threshold);
+
+/** Set window size for two hit extension.
+ * @param options Options structure to update. [in] [out]
+ * @param window_size New value to set, if zero default value for matrix
+ * will be used. [in]
+ * @return zero unless error (e.g., window_size is < zero)
+ */
+Int2 SBlastOptionsSetWindowSize(SBlastOptions* options,
+ Int4 window_size);
+
/** Reset database (subject) genetic code option to a new value.
* @param options Options structure to update. [in] [out]
* @param gc New genetic code value. [in]
@@ -128,6 +163,12 @@ Int2 SBlastOptionsSetDbGeneticCode(SBlastOptions* options, Int4 gc);
*/
Int2 SBlastOptionsSetFilterString(SBlastOptions* options, const char* str);
+/** Returns the mask-at-hash option value.
+ * @param options The options structure [in]
+ * @return Boolean value of the masking at hash option.
+ */
+Boolean SBlastOptionsGetMaskAtHash(const SBlastOptions* options);
+
/* @} */
#ifdef __cplusplus
diff --git a/algo/blast/api/blast_seq.c b/algo/blast/api/blast_seq.c
index b3e63b43..a82051b3 100644
--- a/algo/blast/api/blast_seq.c
+++ b/algo/blast/api/blast_seq.c
@@ -1,4 +1,4 @@
-static char const rcsid[] = "$Id: blast_seq.c,v 1.70 2005/07/27 12:38:18 madden Exp $";
+static char const rcsid[] = "$Id: blast_seq.c,v 1.73 2005/11/09 14:49:49 camacho Exp $";
/*
* ===========================================================================
*
@@ -44,23 +44,23 @@ static char const rcsid[] = "$Id: blast_seq.c,v 1.70 2005/07/27 12:38:18 madden
*/
/** Converts a SeqLocPtr to a BlastSeqLoc, used for formatting.
- * @param mask_slp SeqLocPtr to be converted [in]
+ * @param slp SeqLocPtr to be converted [in]
* @param head_loc BlastSeqLoc returned from last call [in]
* @return pointer to BlastSeqLoc
*/
static BlastSeqLoc*
-s_BlastSeqLocFromSeqLoc(SeqLocPtr mask_slp, BlastSeqLoc* head_loc)
+s_BlastSeqLocFromSeqLoc(SeqLocPtr slp, BlastSeqLoc* head_loc)
{
BlastSeqLoc* last_loc = head_loc;
- if (mask_slp == NULL)
+ if (slp == NULL)
return NULL;
- if (mask_slp->choice == SEQLOC_PACKED_INT)
- mask_slp = (SeqLocPtr) mask_slp->data.ptrvalue;
+ if (slp->choice == SEQLOC_PACKED_INT)
+ slp = (SeqLocPtr) slp->data.ptrvalue;
- for ( ; mask_slp; mask_slp = mask_slp->next) {
- SeqIntPtr si = (SeqIntPtr) mask_slp->data.ptrvalue;
+ for ( ; slp; slp = slp->next) {
+ SeqIntPtr si = (SeqIntPtr) slp->data.ptrvalue;
if (!head_loc) {
last_loc = head_loc = BlastSeqLocNew(&last_loc, si->from, si->to);
} else {
@@ -71,43 +71,70 @@ s_BlastSeqLocFromSeqLoc(SeqLocPtr mask_slp, BlastSeqLoc* head_loc)
}
BlastMaskLoc*
-BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* query_locs)
+BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs,
+ EBlastProgramType program_number)
{
- const Int4 kNumSeqs = ValNodeLen(query_locs);
- BlastMaskLoc* blast_mask;
- Int4 tmp_index=0;
- SeqLocPtr current_query_loc;
+ const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
+ BlastMaskLoc* retval = NULL;
+ Int4 query_index = 0;
+ const unsigned int kNumContexts = BLAST_GetNumberOfContexts(program_number);
+ SeqLocPtr current_query_loc = NULL;
- if (!mask_locs)
+ if (!mask_seqlocs)
return NULL;
- blast_mask = BlastMaskLocNew(kNumSeqs);
+ retval = BlastMaskLocNew(kNumSeqs*kNumContexts);
- for (current_query_loc = query_locs; current_query_loc;
- current_query_loc = current_query_loc->next) {
- SeqLocPtr mask_var;
- for (mask_var = mask_locs; mask_var; mask_var = mask_var->next)
+ for (current_query_loc = query_seqlocs, query_index = 0;
+ current_query_loc;
+ current_query_loc = current_query_loc->next, query_index++) {
+
+ const int kCtxIndex = kNumContexts * query_index; /* context index */
+ SeqLocPtr mask_slp = NULL;
+
+ for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next)
{
- SeqLocPtr current_mask = (SeqLocPtr) mask_var->data.ptrvalue;
+ SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue;
/* If mask is empty, advance to the next link in the mask chain.
If mask Seq-id does not match sequence Seq-id, stay with this mask
for the next link in the sequence Seq-loc chain. */
if (current_mask &&
- SeqIdMatch(SeqLocId(current_mask), SeqLocId(current_query_loc))) {
- blast_mask->seqloc_array[tmp_index] =
- s_BlastSeqLocFromSeqLoc(current_mask, blast_mask->seqloc_array[tmp_index]);
+ SeqIdMatch(SeqLocId(current_mask), SeqLocId(current_query_loc)))
+ {
+ retval->seqloc_array[kCtxIndex] =
+ s_BlastSeqLocFromSeqLoc(current_mask,
+ retval->seqloc_array[kCtxIndex]);
}
}
- if (blast_mask->seqloc_array[tmp_index])
+ if (retval->seqloc_array[kCtxIndex])
{
- BlastSeqLoc_RestrictToInterval(&blast_mask->seqloc_array[tmp_index],
- SeqLocStart(current_query_loc), SeqLocStop(current_query_loc));
+ const Boolean kIsNa = Blast_QueryIsNucleotide(program_number) &&
+ !Blast_QueryIsTranslated(program_number) &&
+ !Blast_ProgramIsPhiBlast(program_number);
+ BlastSeqLoc_RestrictToInterval(&retval->seqloc_array[kCtxIndex],
+ SeqLocStart(current_query_loc),
+ SeqLocStop(current_query_loc));
+ if (kIsNa) {
+ /* N.B.: Unlike in the C++ APIs, this logic is only applied to
+ * non-translated nucleotide queries. See comment for
+ * BlastMaskLocDNAToProtein */
+ Uint1 strand = SeqLocStrand(current_query_loc);
+ if (strand == Seq_strand_minus) {
+ retval->seqloc_array[kCtxIndex+1] =
+ retval->seqloc_array[kCtxIndex];
+ retval->seqloc_array[kCtxIndex] = NULL;
+ } else if (strand == Seq_strand_plus) {
+ retval->seqloc_array[kCtxIndex+1] = NULL;
+ } else {
+ retval->seqloc_array[kCtxIndex+1] =
+ BlastSeqLocListDup(retval->seqloc_array[kCtxIndex]);
+ }
+ }
}
- tmp_index++;
}
- return blast_mask;
+ return retval;
}
SeqLoc*
@@ -124,12 +151,10 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number,
const BlastMaskLoc* mask_loc,
SeqLoc* query_loc)
{
- SeqLocPtr mask_head = NULL, last_mask = NULL;
+ SeqLocPtr retval = NULL, retval_tail = NULL;
Int4 index;
- const Boolean k_translate = (program_number == eBlastTypeBlastx ||
- program_number == eBlastTypeTblastx ||
- program_number == eBlastTypeRpsTblastn);
- const Uint1 k_num_frames = (k_translate ? NUM_FRAMES : 1);
+ const Boolean k_translate = Blast_QueryIsTranslated(program_number);
+ const Uint1 k_num_frames = BLAST_GetNumberOfContexts(program_number);
SeqLoc* slp;
if (mask_loc == NULL || mask_loc->seqloc_array == NULL)
@@ -137,27 +162,25 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number,
for (index=0, slp = query_loc; slp; ++index, slp = slp->next)
{
- Int4 frame_index = index*k_num_frames;
+ const int kCtxIndex = k_num_frames * index; /* context index */
Int4 tmp_index;
Int4 slp_from = SeqLocStart(slp);
SeqIdPtr seqid = SeqLocId(slp);
- for (tmp_index=frame_index; tmp_index<(frame_index+k_num_frames); tmp_index++)
+ for (tmp_index=kCtxIndex; tmp_index<(kCtxIndex+k_num_frames); tmp_index++)
{
BlastSeqLoc* loc = NULL;
- SeqLocPtr mask_slp_head = NULL, mask_slp_last = NULL;
+ SeqLocPtr mask_slp_head = NULL, mask_slp_tail = NULL;
for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next)
{
- SSeqRange* di = loc->ssr;
SeqIntPtr si = SeqIntNew();
- si->from = di->left + slp_from;
- si->to = di->right + slp_from;
+ si->from = loc->ssr->left + slp_from;
+ si->to = loc->ssr->right + slp_from;
si->id = SeqIdDup(seqid);
- if (!mask_slp_last)
- mask_slp_last =
- ValNodeAddPointer(&mask_slp_head, SEQLOC_INT, si);
- else
- mask_slp_last =
- ValNodeAddPointer(&mask_slp_last, SEQLOC_INT, si);
+ /* Append the pointer, but also keep track of the tail of the list
+ * so that appending to the list is a constant operation */
+ mask_slp_tail = ValNodeAddPointer
+ ( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head),
+ SEQLOC_INT, si);
}
if (mask_slp_head) {
@@ -172,15 +195,15 @@ SeqLocPtr BlastMaskLocToSeqLoc(EBlastProgramType program_number,
else
tmp_choice = 0;
- if (!last_mask) {
- last_mask = ValNodeAddPointer(&mask_head, tmp_choice, new_mask_slp);
- } else {
- last_mask = ValNodeAddPointer(&last_mask, tmp_choice, new_mask_slp);
- }
+ /* Append the pointer, but also keep track of the tail of the list
+ * so that appending to the list is a constant operation */
+ retval_tail = ValNodeAddPointer
+ ( (retval_tail ? &retval_tail : &retval),
+ tmp_choice, new_mask_slp);
}
}
}
- return mask_head;
+ return retval;
}
/** Set field values for one element of the context array of a
@@ -465,7 +488,7 @@ Int2 BLAST_GeneticCodeFind(Int4 gc, Uint1** genetic_code)
* @param query_info The query information structure, pre-initialized,
* but filled here [in]
* @param query_options Query setup options, containing the genetic code for
- * translation [in]
+ * translation. N.B.: its strand_option field is ignored [in]
* @param num_frames How many frames to get for this sequence? [in]
* @param encoding In what encoding to retrieve the sequence? [in]
* @param buffer_out Buffer to hold plus strand or protein [out]
@@ -620,10 +643,10 @@ Int2 BLAST_SetUpQuery(EBlastProgramType program_number,
return status;
if (masking_locs) {
- BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs, query_slp);
- if (program_number == eBlastTypeBlastx ||
- program_number == eBlastTypeTblastx ||
- program_number == eBlastTypeRpsTblastn)
+ BlastMaskLoc* lcase_mask = BlastMaskLocFromSeqLoc(masking_locs,
+ query_slp,
+ program_number);
+ if (Blast_QueryIsTranslated(program_number))
BlastMaskLocDNAToProtein(lcase_mask, *query_info);
(*query_blk)->lcase_mask = lcase_mask;
(*query_blk)->lcase_mask_allocated = TRUE;
diff --git a/algo/blast/api/blast_seq.h b/algo/blast/api/blast_seq.h
index 0fdc83c4..4c287fda 100644
--- a/algo/blast/api/blast_seq.h
+++ b/algo/blast/api/blast_seq.h
@@ -1,4 +1,4 @@
-/* $Id: blast_seq.h,v 1.26 2005/07/27 12:34:46 madden Exp $
+/* $Id: blast_seq.h,v 1.27 2005/09/20 00:04:02 camacho Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -62,12 +62,14 @@ BlastMaskLocToSeqLoc(EBlastProgramType program_number,
/** Convert a list of mask locations in a form of SeqLoc into a BlastMaskLoc
* structure. In case of multiple queries, it is not required to create a mask
* SeqLoc for every query.
+ * @param program_number identifies blastn, blastp, etc. [in]
* @param mask_locs Masking locations [in]
* @param seq_locs Sequence locations [in]
* @return Allocated and populated BlastMaskLoc structure.
*/
BlastMaskLoc*
-BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* seq_locs);
+BlastMaskLocFromSeqLoc(SeqLoc* mask_locs, SeqLoc* seq_locs,
+ EBlastProgramType program_number);
/** Frees a special type of SeqLoc list, used in BLAST for masking locations.
* @param mask_loc Input list of mask SeqLocs [in]
diff --git a/algo/blast/api/blast_tabular.c b/algo/blast/api/blast_tabular.c
index 7cfe61c8..3f1901e6 100644
--- a/algo/blast/api/blast_tabular.c
+++ b/algo/blast/api/blast_tabular.c
@@ -1,4 +1,4 @@
-/* $Id: blast_tabular.c,v 1.29 2005/08/05 22:29:50 dondosha Exp $
+/* $Id: blast_tabular.c,v 1.30 2005/11/22 13:30:34 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -28,7 +28,7 @@
* On-the-fly tabular formatting of BLAST results
*/
-static char const rcsid[] = "$Id: blast_tabular.c,v 1.29 2005/08/05 22:29:50 dondosha Exp $";
+static char const rcsid[] = "$Id: blast_tabular.c,v 1.30 2005/11/22 13:30:34 madden Exp $";
#include <algo/blast/api/blast_tabular.h>
#include <algo/blast/core/blast_util.h>
@@ -308,8 +308,10 @@ void* Blast_TabularFormatThread(void* data)
query_lengths = (Int4*) malloc(num_queries*sizeof(Int4));
for (index = 0, slp = tf_data->query_slp; slp; ++index, slp = slp->next) {
- query_id_array[index] = SeqLocId(slp);
- query_lengths[index] = SeqLocLen(slp);
+ BioseqPtr bsp = BioseqLockById(SeqLocId(slp));
+ query_id_array[index] = SeqIdSetDup(bsp->id);
+ query_lengths[index] = BioseqGetLen(bsp);
+ BioseqUnlockById(SeqLocId(slp));
}
one_seq_update_params = (BlastSeqSrcGetTotLen(seq_src) == 0);
@@ -482,6 +484,11 @@ void* Blast_TabularFormatThread(void* data)
BlastSequenceBlkFree(seq_arg.seq);
+ for (index = 0; index<num_queries; ++index)
+ {
+ SeqIdSetFree(query_id_array[index]);
+ query_id_array[index] = NULL;
+ }
sfree(query_lengths);
sfree(query_id_array);
diff --git a/algo/blast/api/dust_filter.c b/algo/blast/api/dust_filter.c
index 091ec11d..890fb6d0 100644
--- a/algo/blast/api/dust_filter.c
+++ b/algo/blast/api/dust_filter.c
@@ -1,4 +1,4 @@
-static char const rcsid[] = "$Id: dust_filter.c,v 1.4 2005/08/17 16:24:44 dondosha Exp $";
+static char const rcsid[] = "$Id: dust_filter.c,v 1.5 2005/09/20 00:04:27 camacho Exp $";
/*
* ===========================================================================
@@ -57,6 +57,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info
const Boolean kIsNucl = TRUE;
Boolean no_forward_strand = (query_info->first_context > 0); /* filtering needed on reverse strand. */
SeqLoc* slp_var = query_seqloc;
+Int4 qindex = 0;
ASSERT(query_info && query_blk && filter_maskloc && query_seqloc);
@@ -82,7 +83,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info
if (!reverse || no_forward_strand)
{
BlastSeqLoc *filter_slp = NULL; /* Used to hold combined SeqLoc's */
- Int4 filter_index = BlastGetMaskLocIndexFromContext(kIsNucl, context);
+ Int4 filter_index = context;
Int4 context_offset = query_info->contexts[context].query_offset;
Uint1* buffer = &query_blk->sequence[context_offset];
SDustOptions* dust_options = filter_options->dustOptions;
diff --git a/algo/blast/api/repeats_filter.c b/algo/blast/api/repeats_filter.c
index 11c11466..36566dc2 100644
--- a/algo/blast/api/repeats_filter.c
+++ b/algo/blast/api/repeats_filter.c
@@ -1,4 +1,4 @@
-static char const rcsid[] = "$Id: repeats_filter.c,v 1.7 2005/04/21 15:00:36 dondosha Exp $";
+static char const rcsid[] = "$Id: repeats_filter.c,v 1.12 2005/09/20 18:27:50 kans Exp $";
/*
* ===========================================================================
@@ -39,6 +39,7 @@ static char const rcsid[] = "$Id: repeats_filter.c,v 1.7 2005/04/21 15:00:36 don
#include <algo/blast/api/repeats_filter.h>
#include <algo/blast/api/blast_api.h>
#include <algo/blast/core/blast_filter.h>
+#include <algo/blast/core/blast_util.h>
#include <algo/blast/api/blast_seq.h>
#include <algo/blast/api/seqsrc_readdb.h>
@@ -93,6 +94,8 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results,
Int4 query_index;
SeqLoc* slp;
BlastMaskLoc* mask;
+ const EBlastProgramType kProgram = eBlastTypeBlastn;
+ const Uint4 kNumContexts = BLAST_GetNumberOfContexts(eBlastTypeBlastn);
if (!query_seqloc || !mask_seqloc)
return -1;
@@ -104,14 +107,13 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results,
}
num_seqs = ValNodeLen(query_seqloc);
- mask = BlastMaskLocNew(num_seqs);
+ mask = BlastMaskLocNew(num_seqs*kNumContexts);
for (query_index = 0, slp = query_seqloc; slp;
++query_index, slp = slp->next) {
Int4 query_length, query_start;
Int4 hit_index;
BlastSeqLoc* loc_list = NULL, *ordered_loc_list = NULL;
- BlastSeqLoc* last_loc = NULL;
BlastHitList* hit_list = results->hitlist_array[query_index];
if (!hit_list) {
@@ -142,25 +144,19 @@ s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results,
sequence. */
left += query_start;
right += query_start;
- /* If this is the first mask for this query, create a new
- BlastSeqLoc, otherwise append to the end of the list. */
- if (!last_loc)
- loc_list = last_loc = BlastSeqLocNew(NULL, left, right);
- else
- last_loc = BlastSeqLocNew(&last_loc, left, right);
+ BlastSeqLocNew(&loc_list, left, right);
}
}
/* Make the intervals unique */
- CombineMaskLocations(loc_list, &ordered_loc_list,
- REPEAT_MASK_LINK_VALUE);
+ ordered_loc_list = BlastSeqLocCombine(loc_list, REPEAT_MASK_LINK_VALUE);
/* Free the list of locations that's no longer needed. */
loc_list = BlastSeqLocFree(loc_list);
- mask->seqloc_array[query_index] = ordered_loc_list;
+ mask->seqloc_array[query_index*kNumContexts] = ordered_loc_list;
}
- *mask_seqloc = BlastMaskLocToSeqLoc(eBlastTypeBlastn, mask, query_seqloc);
+ *mask_seqloc = BlastMaskLocToSeqLoc(kProgram, mask, query_seqloc);
mask = BlastMaskLocFree(mask);
@@ -179,7 +175,6 @@ Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc,
BlastSeqSrc* seq_src = NULL;
SeqLoc* filter_loc = NULL; /* Dummy variable, since search will be performed
without filtering. */
- Boolean mask_at_hash = FALSE; /* Dummy variable. */
BlastHSPResults* results = NULL;
SBlastFilterOptions* filtering_options = NULL;
@@ -227,7 +222,7 @@ Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc,
status =
Blast_RunSearch(query_seqloc, seq_src, NULL, options, NULL,
- &results, &filter_loc, &mask_at_hash, sum_returns);
+ &results, &filter_loc, sum_returns);
/* The ReadDBFILE structure will not be destroyed here, because the
initialising function used readdb_attach */
diff --git a/algo/blast/api/twoseq_api.c b/algo/blast/api/twoseq_api.c
index 67fc4275..a414ec92 100644
--- a/algo/blast/api/twoseq_api.c
+++ b/algo/blast/api/twoseq_api.c
@@ -1,4 +1,4 @@
-/* $Id: twoseq_api.c,v 1.48 2005/06/06 15:40:17 papadopo Exp $
+/* $Id: twoseq_api.c,v 1.51 2005/10/20 20:58:58 madden Exp $
***************************************************************************
* *
* COPYRIGHT NOTICE *
@@ -71,6 +71,8 @@ Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options)
new_options->nucleotide_mismatch = -3;
new_options->longest_intron = 0;
new_options->init_seed_method = eDefaultSeedType;
+ new_options->gap_open = -1;
+ new_options->gap_extend = -1;
*options = new_options;
return 0;
@@ -106,7 +108,6 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options;
BlastDatabaseOptions* db_options = options->db_options;
Boolean do_megablast = FALSE;
- Boolean do_ag_blast = FALSE;
Boolean do_discontig = FALSE;
Int4 greedy_align = 0;
Int2 word_size = basic_options->word_size;
@@ -138,13 +139,6 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
greedy_align = 1; /* one-pass, no ungapped */
}
- /* For a megablast search or a blastn search with
- a non-default word size, turn on striding. Note that
- striding is beneficial even if the wordsize is
- smaller than the default */
-
- if (word_size != 0 || do_megablast)
- do_ag_blast = TRUE;
/* If megablast was turned on but the input indicates a sensitive search
is desired, or if word size is <=12, which is not used in contiguous
@@ -157,8 +151,10 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options,
if (word_size == 0 || word_size > 12)
word_size = 11;
do_discontig = TRUE;
- do_ag_blast = FALSE;
}
+
+ if (do_megablast && !do_discontig)
+ greedy_align = 1;
}
@@ -384,9 +380,12 @@ BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options,
status =
Blast_TwoSeqLocSetsAdvanced(query_seqloc, subject_seqloc,
masking_locs, options, NULL, seqalign_out, filter_out,
- mask_at_hash, extra_returns);
+ extra_returns);
}
+ if (mask_at_hash)
+ *mask_at_hash = SBlastOptionsGetMaskAtHash(options);
+
options = SBlastOptionsFree(options);
if (extra_returns_ptr)
diff --git a/algo/blast/composition_adjustment/compo_heap.c b/algo/blast/composition_adjustment/compo_heap.c
new file mode 100644
index 00000000..f3cb4d83
--- /dev/null
+++ b/algo/blast/composition_adjustment/compo_heap.c
@@ -0,0 +1,510 @@
+/* ===========================================================================
+*
+* PUBLIC DOMAIN NOTICE
+* National Center for Biotechnology Information
+*
+* This software/database is a "United States Government Work" under the
+* terms of the United States Copyright Act. It was written as part of
+* the author's official duties as a United States Government employee and
+* thus cannot be copyrighted. This software/database is freely available
+* to the public for use. The National Library of Medicine and the U.S.
+* Government have not placed any restriction on its use or reproduction.
+*
+* Although all reasonable efforts have been taken to ensure the accuracy
+* and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that
+* may be obtained by using this software or data. The NLM and the U.S.
+* Government disclaim all warranties, express or implied, including
+* warranties of performance, merchantability or fitness for any particular
+* purpose.
+*
+* Please cite the author in any work or product based on this material.
+*
+* ===========================================================================*/
+
+/** @file compo_heap.c
+ * @author E. Michael Gertz, Alejandro Schaffer
+ *
+ * Defines a "heap" data structure that is used to store computed alignments
+ * when composition adjustment of scoring matrices is used.
+ */
+
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: compo_heap.c,v 1.1 2005/12/01 13:48:09 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <assert.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/compo_heap.h>
+
+
+/** Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but
+ * expensive consistency tests in the composition_adjustment
+ * module.
+ *
+ * This macro is usually used as part of a C-conditional
+ * if (COMPO_INTENSE_DEBUG) {
+ * perform expensive tests
+ * }
+ * The C compiler will then validate the code to perform the tests, but
+ * will almost always strip the code if COMPO_INTENSE_DEBUG is false.
+ */
+#ifndef COMPO_INTENSE_DEBUG
+#define COMPO_INTENSE_DEBUG 0
+#endif
+
+/** The initial capacity of the heap will be set to the smaller of this
+ * and the heap threshold */
+#define HEAP_INITIAL_CAPACITY 100
+/** When the heap is about to exceed its capacity, it will be grown by
+ * the larger of a multiplicative factor of HEAP_RESIZE_FACTOR
+ * and an additive amount of HEAP_MIN_RESIZE. The heap never
+ * decreases in size */
+#define HEAP_RESIZE_FACTOR 1.5
+/** @sa HEAP_RESIZE_FACTOR */
+#define HEAP_MIN_RESIZE 100
+
+/* Return -1/0/1 if a is less than/equal to/greater than b. */
+#define CMP(a,b) ((a)>(b) ? 1 : ((a)<(b) ? -1 : 0))
+
+
+/**
+ * The struct BlastCompo_HeapRecord data type is used below to define
+ * the internal structure of a BlastCompo_Heap (see compo_heap.h). A
+ * BlastCompo_HeapRecord represents all alignments of a query sequence
+ * to a particular matching sequence.
+ */
+struct BlastCompo_HeapRecord {
+ double bestEvalue; /**< best (smallest) evalue of all
+ alignments in the record */
+ int bestScore; /**< best (largest) score; used to
+ break ties between records with
+ the same e-value */
+ int subject_index; /**< index of the subject sequence in
+ the database; the final tie-breaker */
+ void * theseAlignments; /**< opaque collection of alignments */
+};
+typedef struct BlastCompo_HeapRecord BlastCompo_HeapRecord;
+
+
+/** Return nonzero iff place1 ranks worse than place2: larger e-value,
+ * then smaller score, then larger subject_index. */
+static int
+s_CompoHeapRecordCompare(BlastCompo_HeapRecord * place1,
+                         BlastCompo_HeapRecord * place2)
+{
+    int order = CMP(place1->bestEvalue, place2->bestEvalue);
+
+    if (order == 0) order = CMP(place2->bestScore, place1->bestScore);
+    if (order == 0) order = CMP(place1->subject_index, place2->subject_index);
+    return order > 0;
+}
+
+
+/**
+ * Exchange the contents of two records in the heap, member by member.
+ * Only the payload moves; the records stay where they are in the
+ * array.  Passing the same record twice leaves it unchanged.
+ */
+static void
+s_CompoHeapRecordSwap(BlastCompo_HeapRecord * record1,
+                      BlastCompo_HeapRecord * record2)
+{
+    /* Scratch copies of everything record1 holds on entry. */
+    double saved_evalue = record1->bestEvalue;
+    int saved_score = record1->bestScore;
+    int saved_index = record1->subject_index;
+    void * saved_alignments = record1->theseAlignments;
+
+    /* record1 takes over the values of record2 ... */
+    record1->bestEvalue = record2->bestEvalue;
+    record1->bestScore = record2->bestScore;
+    record1->subject_index = record2->subject_index;
+    record1->theseAlignments = record2->theseAlignments;
+
+    /* ... and record2 receives what record1 held before. */
+    record2->bestEvalue = saved_evalue;
+    record2->bestScore = saved_score;
+    record2->subject_index = saved_index;
+    record2->theseAlignments = saved_alignments;
+}
+
+
+/**
+ * Verify that the subtree rooted at element i is a valid heap, i.e.
+ * that no node is outranked by either of its children.  Both the left
+ * and right subtrees are visited, so this routine is very time
+ * consuming.  It is for debugging purposes only.
+ */
+static int
+s_CompoHeapIsValid(BlastCompo_HeapRecord * heapArray, int i, int n)
+{
+    /* indices of nodes to the left and right of node i */
+    int left = 2 * i, right = 2 * i + 1;
+
+    if (left <= n &&
+        (s_CompoHeapRecordCompare(&heapArray[left], &heapArray[i]) ||
+         !s_CompoHeapIsValid(heapArray, left, n))) {
+        return FALSE;
+    }
+    if (right <= n &&
+        (s_CompoHeapRecordCompare(&heapArray[right], &heapArray[i]) ||
+         !s_CompoHeapIsValid(heapArray, right, n))) {
+        return FALSE;
+    }
+    return TRUE;
+}
+
+
+/**
+ * Relocate the top element of a subtree so that on exit the subtree
+ * is in valid heap order. On entry, all elements but the root of the
+ * subtree must be in valid heap order.
+ *
+ * @param heapArray array representing the heap stored as a binary tree
+ * @param top the index of the root element of a subtree
+ * @param n the size of the entire heap.
+ */
+static void
+s_CompoHeapifyDown(BlastCompo_HeapRecord * heapArray,
+                   int top, int n)
+{
+    int node = top;    /* record currently being sifted down */
+
+    for (;;) {
+        int worst = node;        /* worst of node and its children */
+        int left = 2 * node;     /* index of the left child */
+        int right = left + 1;    /* index of the right child */
+        if (left <= n &&
+            s_CompoHeapRecordCompare(&heapArray[left],
+                                     &heapArray[worst])) {
+            worst = left;
+        }
+        if (right <= n &&
+            s_CompoHeapRecordCompare(&heapArray[right],
+                                     &heapArray[worst])) {
+            worst = right;
+        }
+        if (worst == node) {
+            /* neither child outranks node; heap order is restored */
+            break;
+        }
+        s_CompoHeapRecordSwap(&heapArray[node], &heapArray[worst]);
+        node = worst;
+    }
+    if (COMPO_INTENSE_DEBUG) {
+        assert(s_CompoHeapIsValid(heapArray, top, n));
+    }
+}
+
+
+/**
+ * Relocate a leaf in the heap so that the entire heap is in valid
+ * heap order. On entry, all elements but the leaf must be in valid
+ * heap order.
+ *
+ * @param heapArray array representing the heap as a binary tree
+ * @param i element in heap array that may be out of order [in]
+ */
+static void
+s_CompoHeapifyUp(BlastCompo_HeapRecord * heapArray, int i)
+{
+    /* Swap node i upward while it outranks its parent, node i / 2. */
+    while (i / 2 >= 1) {
+        int parent = i / 2;
+        if (!s_CompoHeapRecordCompare(&heapArray[i],
+                                      &heapArray[parent])) {
+            break;
+        }
+        s_CompoHeapRecordSwap(&heapArray[i], &heapArray[parent]);
+        i = parent;
+    }
+    if (COMPO_INTENSE_DEBUG) {
+        assert(s_CompoHeapIsValid(heapArray, 1, i));
+    }
+}
+
+
+/** Convert a BlastCompo_Heap from a representation as an unordered array to
+ * a representation as a heap-ordered array.
+ *
+ * @param self the BlastCompo_Heap to convert
+ */
+static void
+s_ConvertToHeap(BlastCompo_Heap * self)
+{
+    if (self->array != NULL) {
+        /* Still an unordered list: adopt its storage as the heap
+         * array, then sift down every node that has a child. */
+        int node;                /* heap node index */
+        int count = self->n;     /* number of elements in the heap */
+        self->heapArray = self->array;
+        self->array = NULL;
+        for (node = count / 2; node >= 1; --node) {
+            s_CompoHeapifyDown(self->heapArray, node, count);
+        }
+    }
+    if (COMPO_INTENSE_DEBUG) {
+        assert(s_CompoHeapIsValid(self->heapArray, 1, self->n));
+    }
+}
+
+
+/** Return true if self may insert a match that had the given eValue,
+ * score and subject_index.
+ *
+ * @param self a BlastCompo_Heap
+ * @param eValue the evalue to be tested.
+ * @param score the score to be tested
+ * @param subject_index the subject_index to be tested.
+ */
+int
+BlastCompo_HeapWouldInsert(BlastCompo_Heap * self,
+                           double eValue,
+                           int score,
+                           int subject_index)
+{
+    BlastCompo_HeapRecord candidate;    /* stand-in for the proposed match */
+
+    /* Cheap tests first: room below the threshold, an e-value that is
+     * always accepted, or an e-value better than the current worst. */
+    if (self->n < self->heapThreshold || eValue <= self->ecutoff ||
+        eValue < self->worstEvalue) {
+        return TRUE;
+    }
+    /* Otherwise rank the candidate against the worst record, which is
+     * the root of the heap; convert to heap order first if needed. */
+    if (self->heapArray == NULL) {
+        s_ConvertToHeap(self);
+    }
+    candidate.bestEvalue = eValue;
+    candidate.bestScore = score;
+    candidate.subject_index = subject_index;
+    candidate.theseAlignments = NULL;
+
+    return s_CompoHeapRecordCompare(&self->heapArray[1], &candidate);
+}
+
+
+/**
+ * Insert a new heap record at the end of *array, possibly resizing
+ * the array to hold the new record.
+ *
+ * @param *array the array to receive the new record
+ * @param *length number of records already in *array
+ * @param *capacity allocated size of *array
+ * @param alignments a list of alignments
+ * @param eValue the best evalue among the alignments
+ * @param score the best score among the alignments
+ * @param subject_index the index of the subject sequence in the database
+ * @return 0 on success, -1 on failure (out-of-memory)
+ */
+static int
+s_CompHeapRecordInsertAtEnd(BlastCompo_HeapRecord **array,
+                            int * length,
+                            int * capacity,
+                            void * alignments,
+                            double eValue,
+                            int score,
+                            int subject_index)
+{
+    BlastCompo_HeapRecord * slot;    /* where the new record is written */
+    if (*length >= *capacity) {
+        /* Full: grow by whichever of the additive and multiplicative
+         * rules yields more room.  Records occupy indices 1..capacity,
+         * hence the one extra element in the allocation. */
+        int additive = HEAP_MIN_RESIZE + *capacity;
+        int scaled = (int) (HEAP_RESIZE_FACTOR * (*capacity));
+        int grown_capacity = MAX(additive, scaled);
+        BlastCompo_HeapRecord * grown =
+            realloc(*array,
+                    (grown_capacity + 1) * sizeof(BlastCompo_HeapRecord));
+        if (grown == NULL) {    /* out of memory; *array is unchanged */
+            return -1;
+        }
+        *array = grown;
+        *capacity = grown_capacity;
+    }
+    *length += 1;
+    slot = *array + *length;
+    slot->bestEvalue = eValue;
+    slot->bestScore = score;
+    slot->subject_index = subject_index;
+    slot->theseAlignments = alignments;
+    return 0;
+}
+
+
+/**
+ * Try to insert a collection of alignments into a heap.
+ *
+ * @param self the heap
+ * @param alignments a collection of alignments, in an unspecified
+ * format
+ * @param eValue the best evalue among the alignments
+ * @param score the best score among the alignments
+ * @param subject_index the index of the subject sequence in the database
+ * @param discardedAlignments a collection of alignments that must be
+ * deleted (passed back to the calling routine
+ * as this routine does not know how to delete them)
+ * @return 0 on success, -1 for out of memory */
+int
+BlastCompo_HeapInsert(BlastCompo_Heap * self,
+ void * alignments,
+ double eValue,
+ int score,
+ int subject_index,
+ void ** discardedAlignments)
+{
+ *discardedAlignments = NULL;
+ if (self->array && self->n >= self->heapThreshold) {
+ s_ConvertToHeap(self);
+ }
+ if (self->array != NULL) {
+ /* "self" is currently a list. Add the new alignments to the end */
+ int status =
+ s_CompHeapRecordInsertAtEnd(&self->array, &self->n,
+ &self->capacity, alignments,
+ eValue, score,
+ subject_index);
+ if (status != 0) { /* out of memory */
+ return -1;
+ }
+ if (self->worstEvalue < eValue) {
+ self->worstEvalue = eValue;
+ }
+ } else { /* "self" is currently a heap */
+ if (self->n < self->heapThreshold ||
+ (eValue <= self->ecutoff &&
+ self->worstEvalue <= self->ecutoff)) {
+ /* The new alignments must be inserted into the heap, and all old
+ * alignments retained */
+ int status =
+ s_CompHeapRecordInsertAtEnd(&self->heapArray,
+ &self->n,
+ &self->capacity,
+ alignments, eValue,
+ score, subject_index);
+ if (status != 0) { /* out of memory */
+ return -1;
+ }
+ s_CompoHeapifyUp(self->heapArray, self->n);
+ } else {
+ /* Some set of alignments must be discarded; discardedAlignments
+ * will hold a pointer to these alignments. */
+ BlastCompo_HeapRecord heapRecord; /* Candidate record
+ for insertion */
+ heapRecord.bestEvalue = eValue;
+ heapRecord.bestScore = score;
+ heapRecord.theseAlignments = alignments;
+ heapRecord.subject_index = subject_index;
+
+ if (s_CompoHeapRecordCompare(&self->heapArray[1],
+ &heapRecord)) {
+ /* The new record replaces the root, and the worst
+ * element currently in the heap is handed back */
+ *discardedAlignments = self->heapArray[1].theseAlignments;
+ memcpy(&self->heapArray[1], &heapRecord,
+ sizeof(BlastCompo_HeapRecord));
+ } else {
+ *discardedAlignments = heapRecord.theseAlignments;
+ }
+ s_CompoHeapifyDown(self->heapArray, 1, self->n);
+ }
+ /* end else some set of alignments must be discarded */
+ self->worstEvalue = self->heapArray[1].bestEvalue;
+ if (COMPO_INTENSE_DEBUG) {
+ assert(s_CompoHeapIsValid(self->heapArray, 1, self->n));
+ }
+ }
+ /* end else "self" is currently a heap. */
+ return 0; /* success */
+}
+
+
+/**
+ * Return true if only matches with evalue <= self->ecutoff may be
+ * inserted.
+ *
+ * @param self a BlastCompo_Heap
+ */
+int
+BlastCompo_HeapFilledToCutoff(const BlastCompo_Heap * self)
+{
+    int at_threshold = self->n >= self->heapThreshold;
+    return at_threshold && self->worstEvalue <= self->ecutoff;
+}
+
+
+/** Initialize a new BlastCompo_Heap; parameters to this function correspond
+ * directly to fields in the BlastCompo_Heap
+ *
+ * @return 0 on success, -1 for out of memory */
+int
+BlastCompo_HeapInitialize(BlastCompo_Heap * self, int heapThreshold,
+                          double ecutoff)
+{
+    /* Begin life as an empty, unordered list.  Records are stored from
+     * index 1, so one element beyond the capacity is allocated. */
+    self->capacity = MIN(HEAP_INITIAL_CAPACITY, heapThreshold);
+    self->array = calloc(self->capacity + 1, sizeof(BlastCompo_HeapRecord));
+    self->heapArray = NULL;
+    self->n = 0;
+    self->worstEvalue = 0;
+    self->heapThreshold = heapThreshold;
+    self->ecutoff = ecutoff;
+    return (self->array == NULL) ? -1 : 0;
+}
+
+
+/**
+ * Release the storage associated with the fields of a BlastCompo_Heap. Don't
+ * delete the BlastCompo_Heap structure itself.
+ *
+ * @param self BlastCompo_Heap whose storage will be released
+ */
+void
+BlastCompo_HeapRelease(BlastCompo_Heap * self)
+{
+    free(self->heapArray);    /* free(NULL) is a no-op in standard C */
+    self->heapArray = NULL;
+    free(self->array);
+    self->array = NULL;
+    self->n = self->capacity = self->heapThreshold = 0;
+}
+
+
+/**
+ * Remove and return the element in the BlastCompo_Heap with largest
+ * (worst) evalue; ties are broken according to the order specified
+ * by the s_CompoHeapRecordCompare routine.
+ *
+ * @param self a BlastCompo_Heap
+ */
+void *
+BlastCompo_HeapPop(BlastCompo_Heap * self)
+{
+    void * popped = NULL;    /* alignments of the removed record */
+
+    /* Popping needs heap order, so leave list mode if still in it. */
+    s_ConvertToHeap(self);
+    if (self->n > 0) {
+        BlastCompo_HeapRecord * root = &self->heapArray[1];
+        BlastCompo_HeapRecord * tail = &self->heapArray[self->n];
+
+        popped = root->theseAlignments;
+        self->n -= 1;
+        if (self->n > 0) {
+            /* Move the last record into the vacated root and sift it
+             * down to restore heap order. */
+            memcpy(root, tail, sizeof(BlastCompo_HeapRecord));
+            s_CompoHeapifyDown(self->heapArray, 1, self->n);
+        }
+    }
+    if (COMPO_INTENSE_DEBUG) {
+        assert(s_CompoHeapIsValid(self->heapArray, 1, self->n));
+    }
+
+    return popped;
+}
diff --git a/algo/blast/composition_adjustment/compo_heap.h b/algo/blast/composition_adjustment/compo_heap.h
new file mode 100644
index 00000000..731edda7
--- /dev/null
+++ b/algo/blast/composition_adjustment/compo_heap.h
@@ -0,0 +1,124 @@
+/* $Id: compo_heap.h,v 1.1 2005/12/01 13:52:20 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+
+/** @file compo_heap.h
+ * @author Alejandro Schaffer, E. Michael Gertz
+ *
+ * Declares a "heap" data structure that is used to store computed alignments
+ * when composition adjustment of scoring matrices is used.
+ */
+
+#ifndef __COMPO_HEAP__
+#define __COMPO_HEAP__
+
+#include <algo/blast/core/blast_export.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct BlastCompo_HeapRecord;
+
+/**
+ * A BlastCompo_Heap represents a collection of alignments between one
+ * query sequence and several matching subject sequences.
+ *
+ * Each matching sequence is allocated one record in a
+ * BlastCompo_Heap. The eValue of a query-subject pair is the best
+ * (smallest positive) evalue of all alignments between the two
+ * sequences.
+ *
+ * The comparison function for matches is s_CompoHeapRecordCompare. A
+ * match will be inserted in the BlastCompo_Heap if:
+ * - there are fewer than BlastCompo_Heap::heapThreshold elements in
+ * the BlastCompo_Heap;
+ * - the eValue of the match is <= BlastCompo_Heap::ecutoff; or
+ * - the match is less than (as determined by s_CompoHeapRecordCompare) the
+ * largest (worst) match already in the BlastCompo_Heap.
+ *
+ * If there are >= BlastCompo_Heap::heapThreshold matches already in
+ * the BlastCompo_Heap when a new match is to be inserted, then the
+ * largest match (as determined by s_CompoHeapRecordCompare) is
+ * removed, unless the eValue of the largest match <=
+ * BlastCompo_Heap::ecutoff. Matches with eValue <=
+ * BlastCompo_Heap::ecutoff are never removed by the insertion
+ * routine. As a consequence, the BlastCompo_Heap can hold an
+ * arbitrarily large number of matches, although it is atypical for
+ * the number of matches to be greater than
+ * BlastCompo_Heap::heapThreshold.
+ *
+ * Once all matches have been collected, the BlastCompo_HeapPop
+ * routine may be invoked to return all alignments in order.
+ *
+ * While the number of elements in a heap < BlastCompo_Heap::heapThreshold,
+ * the BlastCompo_Heap is implemented as an unordered array, rather
+ * than a heap-ordered array. The BlastCompo_Heap is converted to a
+ * heap-ordered array as soon as it becomes necessary to order the
+ * matches by evalue. The routines that operate on a BlastCompo_Heap
+ * should behave properly whichever state the BlastCompo_Heap is in.
+ */
+typedef struct BlastCompo_Heap {
+ int n; /**< The current number of elements */
+ int capacity; /**< The maximum number of elements
+ that may be inserted before the
+ BlastCompo_Heap must be resized; the
+ storage is grown on demand */
+ int heapThreshold; /**< see above */
+ double ecutoff; /**< matches with evalue at or below ecutoff
+ may always be inserted in the BlastCompo_Heap */
+ double worstEvalue; /**< the worst (biggest) evalue currently in
+ the heap */
+
+ struct BlastCompo_HeapRecord *array; /**< the records while unordered, else NULL */
+ struct BlastCompo_HeapRecord *heapArray; /**< the records once heap-ordered, else NULL */
+} BlastCompo_Heap;
+
+
+NCBI_XBLAST_EXPORT
+int BlastCompo_HeapWouldInsert(BlastCompo_Heap * self, double eValue,
+ int score, int subject_index);
+NCBI_XBLAST_EXPORT
+int BlastCompo_HeapInsert(BlastCompo_Heap * self, void * alignments,
+ double eValue, int score, int
+ subject_index, void ** discardedAligns);
+
+NCBI_XBLAST_EXPORT
+int BlastCompo_HeapFilledToCutoff(const BlastCompo_Heap * self);
+
+NCBI_XBLAST_EXPORT
+int BlastCompo_HeapInitialize(BlastCompo_Heap * self, int heapThreshold,
+ double ecutoff);
+
+NCBI_XBLAST_EXPORT
+void BlastCompo_HeapRelease(BlastCompo_Heap * self);
+
+NCBI_XBLAST_EXPORT
+void * BlastCompo_HeapPop(BlastCompo_Heap * self);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/algo/blast/composition_adjustment/compo_mode_condition.c b/algo/blast/composition_adjustment/compo_mode_condition.c
index 2489f657..95344dc1 100644
--- a/algo/blast/composition_adjustment/compo_mode_condition.c
+++ b/algo/blast/composition_adjustment/compo_mode_condition.c
@@ -1,5 +1,3 @@
-static char const rcsid[] = "$Id: Mode_condition.c,v 1.1 2005/05/16 16:11:41 papadopo Exp $";
-
/* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -24,119 +22,110 @@ static char const rcsid[] = "$Id: Mode_condition.c,v 1.1 2005/05/16 16:11:41 pap
*
* ===========================================================================*/
-/*****************************************************************************
-
-File name: Mode_condition.c
-
-Authors: Alejandro Schaffer, Yi-Kuo Yu
-
-Contents: Functions to test whether conditional score matrix
- adjustment should be applied for a pair of matching sequences.
-
-******************************************************************************/
-/*
- * $Log: Mode_condition.c,v $
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
+/**
+ * @file compo_mode_condition.c
*
+ * Authors: Alejandro Schaffer, Yi-Kuo Yu
+ *
+ * Functions to test whether conditional score matrix adjustment
+ * should be applied for a pair of matching sequences.
*/
-#include <ncbi.h>
-#include <NRdefs.h>
-#include <Mode_condition.h>
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: compo_mode_condition.c,v 1.5 2005/12/01 13:49:43 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
-double BLOSUM62_bg[Alphsize] =
- { 0.0742356686, 0.0515874541, 0.0446395713, 0.0536092024, 0.0246865086,
- 0.0342500470, 0.0543174458, 0.0741431988, 0.0262119099, 0.0679331197,
- 0.0989057232, 0.0581774322, 0.0249972837, 0.0473970070, 0.0385382904,
- 0.0572279733, 0.0508996546, 0.0130298868, 0.0322925130, 0.0729201182
- };
- /* BLOSUM 62 is the correct bg */
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/composition_adjustment.h>
+#include <algo/blast/composition_adjustment/compo_mode_condition.h>
+#include <algo/blast/composition_adjustment/matrix_frequency_data.h>
+/* 180 degrees in half a circle */
#define HALF_CIRCLE_DEGREES 180
+/* some digits of PI */
#define PI 3.1415926543
+/* thresholds used to determine which composition mode to use */
#define QUERY_MATCH_DISTANCE_THRESHOLD 0.16
#define LENGTH_RATIO_THRESHOLD 3.0
-#define ANGLE_DEGREE_THRESHOLD 70
-
-/* declaration of Htype function for future use
- *
- * typedef int (*Condition) (int , int , int *, int *, char *);
- *
- * variable orders: Queryseq_length, Matchseq_length,
- * query_amino_count, match_amino_account, matrix_name
- */
-
-Int4 TestToApplyREAdjustmentUnconditional(Int4,
- Int4,
- Nlm_FloatHi *,
- Nlm_FloatHi *,
- char *);
+#define ANGLE_DEGREE_THRESHOLD 70.0
-Int4 TestToApplyREAdjustmentConditional(Int4,
- Int4,
- Nlm_FloatHi *,
- Nlm_FloatHi *,
- char *);
+/* type of function used to choose a mode for composition-based
+ * statistics. The variables are Queryseq_length, Matchseq_length,
+ * query_amino_count, match_amino_account and matrix_name.*/
+typedef ECompoAdjustModes
+(*Condition) (int, int, const double *, const double *,
+ const char *);
-/* If this function is used relative-entropy score adjustment is
+/* A function used to choose a mode for composition-based statistics.
+ * If this function is used relative-entropy score adjustment is
* always applied, with a fixed value as the target relative entropy*/
-Int4
-TestToApplyREAdjustmentUnconditional(Int4 Len_query,
- Int4 Len_match,
- Nlm_FloatHi * P_query,
- Nlm_FloatHi * P_match,
- char *matrix_name)
+static ECompoAdjustModes
+TestToApplyREAdjustmentUnconditional(int Len_query,
+ int Len_match,
+ const double * P_query,
+ const double * P_match,
+ const char *matrix_name)
{
- return RE_USER_SPECIFIED;
+ /* Suppress unused variable warnings */
+ (void) Len_query;
+ (void) Len_match;
+ (void) P_query;
+ (void) P_match;
+ (void) matrix_name;
+
+ return eUserSpecifiedRelEntropy;
}
-/* Decide whether a relative-entropy score adjustment should be used
+/**
+ * A function used to choose a mode for composition-based statistics.
+ * Decide whether a relative-entropy score adjustment should be used
* based on lengths and letter counts of the two matched sequences;
* matrix_name is the underlying score matrix; for now only BLOSUM62
* is supported */
-Int4
-TestToApplyREAdjustmentConditional(Int4 Len_query,
- Int4 Len_match,
- Nlm_FloatHi * P_query,
- Nlm_FloatHi * P_match,
- char *matrix_name)
+static ECompoAdjustModes
+TestToApplyREAdjustmentConditional(int Len_query,
+ int Len_match,
+ const double * P_query,
+ const double * P_match,
+ const char *matrix_name)
{
- Int4 mode_value; /* which relative entropy mode to return */
- Int4 i; /* loop indices */
- Nlm_FloatHi p_query[Alphsize], p_match[Alphsize]; /*letter probabilities
- *for query and match*/
- Nlm_FloatHi *p_matrix; /* letter probabilities used in constructing
- * matrix name*/
- Nlm_FloatHi D_m_mat, D_q_mat, D_m_q; /* distances between
- * match and original
- * between query and
- * original between
- * match and query*/
- Nlm_FloatHi corr_factor = 0.0; /* correlation between how
- p_query and p_match deviate
- from p_matrix */
- Nlm_FloatHi len_q, len_m; /* lengths of query and matching
- sequence in floating point */
- Nlm_FloatHi len_large, len_small; /* store the larger and smaller of
- * len_q and len_m */
- Nlm_FloatHi angle; /* angle between query and match
- probabilities */
-
- p_matrix = Get_bg_freq(matrix_name);
-
- for(i = 0; i < Alphsize; i++) {
+ ECompoAdjustModes mode_value; /* which relative entropy mode to
+ return */
+ int i; /* loop indices */
+ double p_query[COMPO_NUM_TRUE_AA];
+ double p_match[COMPO_NUM_TRUE_AA]; /*letter probabilities
+ for query and match*/
+ const double *p_matrix; /* letter probabilities used in
+ constructing matrix name*/
+ double D_m_mat, D_q_mat, D_m_q; /* distances between match and
+ original between query and
+ original between match and
+ query*/
+ double corr_factor = 0.0; /* correlation between how p_query
+ and p_match deviate from p_matrix
+ */
+ double len_q, len_m; /* lengths of query and matching
+ sequence in floating point */
+ double len_large, len_small; /* store the larger and smaller of
+ len_q and len_m */
+ double angle; /* angle between query and match
+ probabilities */
+
+ p_matrix = Blast_GetMatrixBackgroundFreq(matrix_name);
+
+ for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
p_query[i] = P_query[i];
p_match[i] = P_match[i];
corr_factor +=
(p_query[i] - p_matrix[i]) * (p_match[i] - p_matrix[i]);
}
- D_m_mat = Get_RE(p_match, p_matrix);
- D_q_mat = Get_RE(p_query, p_matrix);
- D_m_q = Get_RE(p_match, p_query); /* distance between match and query */
+ D_m_mat = Blast_GetRelativeEntropy(p_match, p_matrix);
+ D_q_mat = Blast_GetRelativeEntropy(p_query, p_matrix);
+ D_m_q = Blast_GetRelativeEntropy(p_match, p_query);
angle =
acos((D_m_mat * D_m_mat + D_q_mat * D_q_mat -
@@ -146,61 +135,54 @@ TestToApplyREAdjustmentConditional(Int4 Len_query,
len_q = 1.0 * Len_query;
len_m = 1.0 * Len_match;
- if(len_q > len_m) {
+ if (len_q > len_m) {
len_large = len_q;
len_small = len_m;
} else {
len_large = len_m;
len_small = len_q;
}
-
- if((D_m_q > QUERY_MATCH_DISTANCE_THRESHOLD) &&
- (len_large / len_small > LENGTH_RATIO_THRESHOLD) &&
- (angle > ANGLE_DEGREE_THRESHOLD)) {
- mode_value = KEEP_OLD_MATRIX;
+ if ((D_m_q > QUERY_MATCH_DISTANCE_THRESHOLD) &&
+ (len_large / len_small > LENGTH_RATIO_THRESHOLD) &&
+ (angle > ANGLE_DEGREE_THRESHOLD)) {
+ mode_value = eCompoKeepOldMatrix;
} else {
- mode_value = RE_USER_SPECIFIED;
+ mode_value = eUserSpecifiedRelEntropy;
}
-
return mode_value;
}
-/* Retrieve the background letter probabilities implicitly used in
- * constructing the score matrix matrix_name*/
-Nlm_FloatHi *
-Get_bg_freq(char *matrix_name)
-{
- if(0 == strcmp(matrix_name, "BLOSUM62")) {
- return BLOSUM62_bg;
- } else { /* default */
- printf("matrix not supported, exit now! \n");
- exit(1);
- }
-}
+/**
+ * Table of test functions that can be used to decide which optimization
+ * formulation should be used for score adjustment.  Entry 0 is the
+ * conditional test and entry 1 is the unconditional test; the table is
+ * terminated by a NULL entry.  Blast_ChooseCompoAdjustMode indexes this
+ * table directly with its testFunctionIndex argument. */
+static Condition Cond_func[] = {
+ TestToApplyREAdjustmentConditional,
+ TestToApplyREAdjustmentUnconditional,
+ NULL
+};
-/* initialization of array of functions that can be used to decide
- * which optimization formulation should be used for score
- * adjustment */
-Condition Cond_func[] ={ TestToApplyREAdjustmentConditional,
- TestToApplyREAdjustmentUnconditional,
- NULL };
-
-/* Choose how the relative entropy should be constrained based on
- * properties of the two sequences to be aligned. length1 an length2
- * are the lengths of the two sequences; probArray1 and probArray2 are
- * arrays of probabilities of letters in each sequence, using the
- * 20-letter alphabet; matrixName is the name of the underlying 20x20
- * score matrix; testFunctionIndex allows different rules to be tested
- * for the relative entropy decision. */
-Int4
-chooseMode(Int4 length1,
- Int4 length2,
- Nlm_FloatHi * probArray1,
- Nlm_FloatHi * probArray2,
- char *matrixName,
- Int4 testFunctionIndex)
+/**
+ * Choose how the relative entropy should be constrained based on
+ * properties of the two sequences to be aligned.
+ *
+ * @param length1 length of the first sequence
+ * @param length2 length of the second sequence
+ * @param probArray1 arrays of probabilities for the first sequence, in
+ * a 20 letter amino-acid alphabet
+ * @param probArray2 arrays of probabilities for the other sequence
+ * @param matrixName name of the scoring matrix
+ * @param testFunctionIndex allows different rules to be tested
+ * for the relative entropy decision.
+ */
+ECompoAdjustModes
+Blast_ChooseCompoAdjustMode(int length1,
+ int length2,
+ const double * probArray1,
+ const double * probArray2,
+ const char *matrixName,
+ int testFunctionIndex)
{
return
Cond_func[testFunctionIndex] (length1, length2,
diff --git a/algo/blast/composition_adjustment/compo_mode_condition.h b/algo/blast/composition_adjustment/compo_mode_condition.h
index ff8943ee..9ad535a9 100644
--- a/algo/blast/composition_adjustment/compo_mode_condition.h
+++ b/algo/blast/composition_adjustment/compo_mode_condition.h
@@ -1,79 +1,56 @@
-/* ===========================================================================
-*
-* PUBLIC DOMAIN NOTICE
-* National Center for Biotechnology Information
-*
-* This software/database is a "United States Government Work" under the
-* terms of the United States Copyright Act. It was written as part of
-* the author's official duties as a United States Government employee and
-* thus cannot be copyrighted. This software/database is freely available
-* to the public for use. The National Library of Medicine and the U.S.
-* Government have not placed any restriction on its use or reproduction.
-*
-* Although all reasonable efforts have been taken to ensure the accuracy
-* and reliability of the software and data, the NLM and the U.S.
-* Government do not and cannot warrant the performance or results that
-* may be obtained by using this software or data. The NLM and the U.S.
-* Government disclaim all warranties, express or implied, including
-* warranties of performance, merchantability or fitness for any particular
-* purpose.
-*
-* Please cite the author in any work or product based on this material.
-*
-* ===========================================================================*/
-
-/*****************************************************************************
-
-File name: Mode_condition.h
-
-Authors: Alejandro Schaffer, Yi-Kuo Yu
-
-Contents: Definitions used only in Mode_condition.c
-
-******************************************************************************/
-/*
- * $Log: Mode_condition.h,v $
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
+/* $Id: compo_mode_condition.h,v 1.5 2005/12/01 13:54:04 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
*
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file compo_mode_condition.h
+ * @author Alejandro Schaffer, Yi-Kuo Yu
+ *
+ * Declarations of functions used to choose the mode for
+ * composition-based statistics.
*/
-#ifndef MODE_CONDITION
-#define MODE_CONDITION
-#define Mode_1_per 0.3
-#define Mode_unchange_per 0.6
-#define RE_mode_1_limit 0.18
+#ifndef __COMPO_MODE_CONDITION__
+#define __COMPO_MODE_CONDITION__
-double *Get_bg_freq(char *matrix_name);
+#include <algo/blast/core/blast_export.h>
-/* declaration of function type for future use
- *
- * variable orders: Queryseq_length, Matchseq_length, query_amino_count,
- * match_amino_account, matrix_name
- *
- * return values for both Test_0 and Test_1
- * -1: no adjustment; 0: mode 0 (unconstrained);
- * 1: mode 1 (with RE in new context)
- */
-typedef Int4 (*Condition) (Int4 , Int4 ,
- Nlm_FloatHi *, Nlm_FloatHi *, char *);
+#ifdef __cplusplus
+extern "C" {
+#endif
-Int4
-TestToApplyREAdjustmentUnconditional(Int4 Len_query,
- Int4 Len_match,
- Nlm_FloatHi * P_query,
- Nlm_FloatHi * P_match,
- char *matrix_name);
-Int4
-TestToApplyREAdjustmentConditional(Int4 Len_query,
- Int4 Len_match,
- Nlm_FloatHi * P_query,
- Nlm_FloatHi * P_match,
- char *matrix_name);
+#include <algo/blast/composition_adjustment/composition_constants.h>
-Int4
-chooseMode(Int4 length1, Int4 length2,
- Nlm_FloatHi * probArray1, Nlm_FloatHi * probArray2,
- char *matrixName, Int4 testFunctionIndex);
+NCBI_XBLAST_EXPORT
+ECompoAdjustModes
+Blast_ChooseCompoAdjustMode(int length1, int length2,
+ const double * probArray1,
+ const double * probArray2,
+ const char * matrixName,
+ int testFunctionIndex);
+
+#ifdef __cplusplus
+}
+#endif
#endif
diff --git a/algo/blast/composition_adjustment/composition_adjustment.c b/algo/blast/composition_adjustment/composition_adjustment.c
new file mode 100644
index 00000000..2778dfa5
--- /dev/null
+++ b/algo/blast/composition_adjustment/composition_adjustment.c
@@ -0,0 +1,1376 @@
+/* ===========================================================================
+*
+* PUBLIC DOMAIN NOTICE
+* National Center for Biotechnology Information
+*
+* This software/database is a "United States Government Work" under the
+* terms of the United States Copyright Act. It was written as part of
+* the author's official duties as a United States Government employee and
+* thus cannot be copyrighted. This software/database is freely available
+* to the public for use. The National Library of Medicine and the U.S.
+* Government have not placed any restriction on its use or reproduction.
+*
+* Although all reasonable efforts have been taken to ensure the accuracy
+* and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that
+* may be obtained by using this software or data. The NLM and the U.S.
+* Government disclaim all warranties, express or implied, including
+* warranties of performance, merchantability or fitness for any particular
+* purpose.
+*
+* Please cite the author in any work or product based on this material.
+*
+* ===========================================================================*/
+
+/** @file composition_adjustment.c
+ *
+ * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz
+ *
+ * Highest level functions to solve the optimization problem for
+ * compositional score matrix adjustment.
+ */
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: composition_adjustment.c,v 1.6 2005/12/01 13:51:03 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <limits.h>
+#include <assert.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/composition_constants.h>
+#include <algo/blast/composition_adjustment/composition_adjustment.h>
+#include <algo/blast/composition_adjustment/matrix_frequency_data.h>
+#include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
+#include <algo/blast/composition_adjustment/optimize_target_freq.h>
+
+/**
+ * Positions of the true amino acid characters in the protein alphabet,
+ * in increasing order.  These are exactly the indices at which
+ * alphaConvert (below) holds a value other than -1. */
+static int trueCharPositions[COMPO_NUM_TRUE_AA] =
+{1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
+
+/**
+ * conversion from 26 letter NCBIstdaa alphabet to 20 letter order
+ * for true amino acids: ARNDCQEGHILKMFPSTWYV. This order is
+ * alphabetical in the standard three-letter abbreviation of each
+ * amino acid.  An entry of (-1) marks a letter that has no
+ * counterpart in the 20 letter alphabet. */
+static int alphaConvert[COMPO_PROTEIN_ALPHABET] =
+ {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15,
+ 16, 19, 17, (-1), 18, (-1), (-1), (-1)};
+
+
+/**
+ * Desired margin between an end of region used for computing a
+ * composition, and the nearest StopChar; the desired margin may
+ * not be attained. */
+static const int kCompositionMargin = 20;
+
+#define SCORE_BOUND 0.0000000001 /* average scores below
+ -SCORE_BOUND are considered
+ effectively nonnegative, and
+ Newton's method
+ will terminate */
+#define LAMBDA_STEP_FRACTION 0.5 /* default step fraction in
+ Newton's method */
+#define INITIAL_LAMBDA 1.0 /* initial value for Newton's
+ method */
+#define LAMBDA_ITERATION_LIMIT 300 /* iteration limit for Newton's
+ method. */
+#define LAMBDA_ERROR_TOLERANCE 0.0000001 /* bound on error for estimating
+ lambda */
+
+/** bound on error for Newton's method */
+static const double kCompoAdjustErrTolerance = 0.00000001;
+/** iteration limit for Newton's method */
+static const int kCompoAdjustIterationLimit = 2000;
+/** relative entropy of BLOSUM62 */
+static const double kFixedReBlosum62 = 0.44;
+
+/**
+ * Find the weighted average of a set of observed probabilities with a
+ * set of "background" probabilities.  All array parameters have
+ * length COMPO_NUM_TRUE_AA.
+ *
+ * If the observed frequencies do not have a positive sum they cannot be
+ * normalized; normalized_probs is then set to all zeros instead of being
+ * left unwritten.  If number_of_observations + pseudocounts is not
+ * positive, the weighted average is simply the background distribution.
+ *
+ * @param probs_with_pseudo       an array of weighted averages [out]
+ * @param normalized_probs        observed frequencies, normalized to sum
+ *                                to 1.0 [out]
+ * @param observed_freq           observed frequencies, not necessarily
+ *                                normalized to sum to 1.0 [in]
+ * @param number_of_observations  the number of characters used to
+ *                                form the observed_freq array [in]
+ * @param background_probs        the probability of characters in a
+ *                                standard sequence [in]
+ * @param pseudocounts            the number of "standard" characters
+ *                                to be added to form the weighted
+ *                                average [in]
+ */
+static void
+Blast_ApplyPseudocounts(double * probs_with_pseudo,
+                        double * normalized_probs,
+                        const double * observed_freq,
+                        int number_of_observations,
+                        const double * background_probs,
+                        int pseudocounts)
+{
+    int i;                 /* loop index */
+    double weight;         /* weight assigned to pseudocounts */
+    double sum;            /* sum of the observed frequencies */
+    double dpseudocounts;  /* pseudocounts as a double */
+    double total;          /* observations plus pseudocounts */
+
+    dpseudocounts = pseudocounts;
+
+    /* Normalize probabilities */
+    sum = 0.0;
+    for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
+        sum += observed_freq[i];
+    }
+    for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
+        /* A zero vector cannot be normalized; always write the output
+         * so that it is never read while indeterminate below. */
+        normalized_probs[i] = (sum > 0) ? observed_freq[i] / sum : 0.0;
+    }
+
+    total = number_of_observations + dpseudocounts;
+    /* With nothing observed and no pseudocounts, 0/0 would yield NaN;
+     * fall back on the background distribution instead. */
+    weight = (total > 0) ? dpseudocounts / total : 1.0;
+    for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
+        probs_with_pseudo[i] =
+            (1.0 - weight) * normalized_probs[i] +
+            weight * background_probs[i];
+    }
+}
+
+
+/**
+ * Build a matrix of log-odds scores from a matrix of target
+ * frequencies.  Each entry of freq is divided by the total of all
+ * entries and by the supplied row and column sums before the natural
+ * logarithm is taken, so the scores are scaled so that the
+ * Karlin-Altschul statistical parameter Lambda equals (within
+ * numerical precision) 1.0.
+ *
+ * @param score      the new score matrix [out]
+ * @param alphsize   the number of rows and columns of score and freq
+ * @param freq       a matrix of target frequencies [in]
+ * @param row_sum    sum of each row of freq [in]
+ * @param col_sum    sum of each column of freq [in]
+ */
+static void
+Blast_ScoreMatrixFromFreq(double ** score, int alphsize, double ** freq,
+                          const double row_sum[], const double col_sum[])
+{
+    int row, col;        /* matrix indices */
+    double total = 0.0;  /* total of all entries of freq */
+
+    for (row = 0;  row < alphsize;  row++) {
+        const double * freqRow = freq[row];
+        for (col = 0;  col < alphsize;  col++) {
+            total += freqRow[col];
+        }
+    }
+    for (row = 0;  row < alphsize;  row++) {
+        const double * freqRow = freq[row];
+        double * scoreRow = score[row];
+        for (col = 0;  col < alphsize;  col++) {
+            scoreRow[col] =
+                log(freqRow[col] / total / row_sum[row] / col_sum[col]);
+        }
+    }
+}
+
+
+/**
+ * Compute the symmetric form of the relative entropy of two
+ * probability vectors.  Each vector is compared with the average of
+ * the two, the two contributions are halved and summed, and the
+ * square root of the total is returned.
+ *
+ * In this software relative entropy is expressed in "nats",
+ * meaning that logarithms are base e.  In some other scientific
+ * and engineering domains where entropy is used, the logarithms
+ * are taken base 2 and the entropy is expressed in bits.
+ *
+ * @param A    an array of length COMPO_NUM_TRUE_AA of
+ *             probabilities.
+ * @param B    a second array of length COMPO_NUM_TRUE_AA of
+ *             probabilities.
+ * @return     the square root of the accumulated value, which is
+ *             clamped at zero before the root is taken
+ */
+double
+Blast_GetRelativeEntropy(const double A[], const double B[])
+{
+    int letter;            /* loop index over letters */
+    double total = 0.0;    /* square of the value to be returned */
+
+    for (letter = 0;  letter < COMPO_NUM_TRUE_AA;  letter++) {
+        double a = A[letter];
+        double b = B[letter];
+        double mid = (a + b) / 2;    /* average of the two entries */
+
+        if (mid > 0) {
+            /* Terms with a zero probability contribute nothing. */
+            if (a > 0) {
+                total += a * log(a / mid) / 2;
+            }
+            if (b > 0) {
+                total += b * log(b / mid) / 2;
+            }
+        }
+    }
+    /* A negative total can only be numerical rounding error. */
+    return sqrt(total < 0 ? 0 : total);
+}
+
+
+/**
+ * Gather letter probabilities from a 26-letter NCBIstdaa alphabet into
+ * a 20 letter ARND... amino acid alphabet (@see alphaConvert).
+ * Letters with no counterpart in the smaller alphabet are ignored.
+ *
+ * @param outputLetterProbs   the 20-letter probabilities [out]
+ * @param inputLetterProbs    the 26-letter probabilities [in]
+ */
+static void
+s_GatherLetterProbs(double * outputLetterProbs,
+                    const double * inputLetterProbs)
+{
+    int letter;    /* index over the larger alphabet */
+
+    for (letter = 0;  letter < COMPO_PROTEIN_ALPHABET;  letter++) {
+        int slot = alphaConvert[letter];   /* index in smaller alphabet */
+        if (slot == (-1)) {
+            continue;
+        }
+        outputLetterProbs[slot] = inputLetterProbs[letter];
+    }
+}
+
+
+/**
+ * Scatter and scale a matrix of scores for a 20 letter ARND... amino
+ * acid alphabet into a matrix for a 26 letter NCBIstdaa alphabet
+ * (@see alphaConvert), leaving scores for any character not present
+ * in the smaller alphabet untouched.
+ *
+ * @param dMatrix        scores for the 26 letter alphabet [out]
+ * @param scale          multiply the elements in dMatrixTrueAA by
+ *                       scale when applying the scatter.
+ * @param dMatrixTrueAA  scores for the 20 letter alphabet [in]
+ */
+static void
+s_ScatterScores(double ** dMatrix,
+                double scale,
+                double ** dMatrixTrueAA)
+{
+    int row, col;    /* indices in the 26 letter alphabet */
+
+    for (row = 0;  row < COMPO_PROTEIN_ALPHABET;  row++) {
+        int srcRow = alphaConvert[row];
+        if (srcRow == (-1)) {
+            /* the whole row belongs to a character outside the
+             * smaller alphabet */
+            continue;
+        }
+        for (col = 0;  col < COMPO_PROTEIN_ALPHABET;  col++) {
+            int srcCol = alphaConvert[col];
+            if (srcCol != (-1)) {
+                dMatrix[row][col] = scale * dMatrixTrueAA[srcRow][srcCol];
+            }
+        }
+    }
+}
+
+
+/**
+ * Average the scores for two characters to get scores for an
+ * ambiguity character that represents either of the two original
+ * characters.
+ *
+ * @param dMatrix   score matrix -- on entry contains the score data
+ *                  for characters A and B, and on exit also contains
+ *                  the score data for ambigAB.
+ * @param A         a character in the alphabet
+ * @param B         another character in the alphabet
+ * @param ambigAB   the combined ambiguity character
+ */
+static void
+Blast_AverageScores(double ** dMatrix, int A, int B, int ambigAB)
+{
+    int letter;    /* index over the full alphabet */
+
+    for (letter = 0;  letter < COMPO_PROTEIN_ALPHABET;  letter++) {
+        if (-1 == alphaConvert[letter]) {
+            /* only true amino acids are paired with ambigAB here */
+            continue;
+        }
+        dMatrix[letter][ambigAB] =
+            (dMatrix[letter][A] + dMatrix[letter][B]) / 2.0;
+        dMatrix[ambigAB][letter] =
+            (dMatrix[A][letter] + dMatrix[B][letter]) / 2.0;
+    }
+    /* Because ambiguity characters are rare, we assume a match of
+     * ambiguity characters represents a match of the true residues, and
+     * so only include matches when computing the average score. */
+    dMatrix[ambigAB][ambigAB] = (dMatrix[A][A] + dMatrix[B][B]) / 2.0;
+}
+
+
+/**
+ * Set scores for substitutions that involve the nonstandard amino
+ * acids in the NCBIstdaa alphabet: the ambiguity characters 'B' and
+ * 'Z'; the "don't care" character 'X'; the atypical amino acid 'U'
+ * (Selenocysteine); the stop codon '*'; and the gap or end of
+ * sequence character '-'.
+ *
+ * @param dMatrix       a matrix that on entry contains scores for all the
+ *                      true amino acids, and on exit also contains the
+ *                      scores for the nonstandard amino acids.
+ * @param startMatrix   rounded amino acid substitution scores in
+ *                      standard context [in]
+ */
+static void
+Blast_SetNonstandardAaScores(double **dMatrix, int **startMatrix)
+{
+    /* The special characters whose rows and columns are copied
+     * verbatim from startMatrix */
+    int copied[4] =
+        { eGapChar, eXchar, eSelenocysteine, eStopChar };
+    int k;        /* index into copied */
+    int other;    /* index over the whole alphabet */
+
+    /* B is the average of D and N; Z is the average of E and Q */
+    Blast_AverageScores(dMatrix, eDchar, eNchar, eBchar);
+    Blast_AverageScores(dMatrix, eEchar, eQchar, eZchar);
+
+    /* (B,Z) mismatches are so rare we simply set their score to zero. */
+    dMatrix[eBchar][eZchar] = dMatrix[eZchar][eBchar] = 0.0;
+
+    for (k = 0;  k < 4;  k++) {
+        int special = copied[k];
+        for (other = 0;  other < COMPO_PROTEIN_ALPHABET;  other++) {
+            dMatrix[special][other] = startMatrix[special][other];
+            dMatrix[other][special] = startMatrix[other][special];
+        }
+    }
+}
+
+
+/**
+ * Return the nearest integer to x; values exactly halfway between two
+ * integers are rounded away from zero.
+ */
+static long Nint(double x)
+{
+    double shifted;    /* x moved half a unit away from zero */
+
+    if (x >= 0.) {
+        shifted = x + 0.5;
+    } else {
+        shifted = x - 0.5;
+    }
+    /* the conversion truncates toward zero */
+    return (long) shifted;
+}
+
+
+/**
+ * Round a matrix of floating point scores to integers, saturating at
+ * the limits of int.  Scores below INT_MIN become INT_MIN and scores
+ * above INT_MAX become INT_MAX, so that out-of-range values never reach
+ * the floating point to integer conversion.
+ *
+ * @param matrix             the matrix of integer valued scores [out]
+ * @param floatScoreMatrix   the matrix of floating point valued
+ *                           scores [in]
+ * @param numPositions       the number of rows of the matrices; each
+ *                           row has COMPO_PROTEIN_ALPHABET columns.
+ */
+static void
+s_RoundScoreMatrix(int **matrix, double **floatScoreMatrix,
+                   int numPositions)
+{
+    int p, c;    /* indices over positions and characters */
+
+    for (p = 0;  p < numPositions;  p++) {
+        for (c = 0;  c < COMPO_PROTEIN_ALPHABET;  c++) {
+            double score = floatScoreMatrix[p][c];
+
+            if (score < INT_MIN) {
+                matrix[p][c] = INT_MIN;
+            } else if (score > INT_MAX) {
+                /* Mirror the lower bound: converting an out-of-range
+                 * double to an integer type is undefined. */
+                matrix[p][c] = INT_MAX;
+            } else {
+                matrix[p][c] = (int) Nint(score);
+            }
+        }
+    }
+}
+
+
+/**
+ * Find the range of scores contained in a scoring matrix, looking only
+ * at the columns that correspond to true amino acids
+ * (@see trueCharPositions).  Both bounds start at zero, and scores that
+ * are not greater than COMPO_SCORE_MIN never lower the minimum.
+ *
+ * @param obs_min    smallest score found [out]
+ * @param obs_max    largest score found [out]
+ * @param matrix     a score matrix; every column listed in
+ *                   trueCharPositions is read from each row
+ * @param rows       number of rows in the matrix
+ */
+static void s_GetScoreRange(int * obs_min, int * obs_max,
+                            int ** matrix, int rows)
+{
+    int row;                 /* matrix row index */
+    int k;                   /* index in the 20 letter alphabet */
+    int lowest = 0;          /* smallest observed score */
+    int highest = 0;         /* largest observed score */
+
+    for (row = 0;  row < rows;  row++) {
+        const int * scores = matrix[row];
+        for (k = 0;  k < COMPO_NUM_TRUE_AA;  k++) {
+            int value = scores[trueCharPositions[k]];
+            if (value < lowest && value > COMPO_SCORE_MIN) {
+                lowest = value;
+            }
+            if (value > highest) {
+                highest = value;
+            }
+        }
+    }
+    *obs_min = lowest;
+    *obs_max = highest;
+}
+
+
+/**
+ * Compute the score probabilities for a given amino acid substitution matrix
+ * in the context of given query and subject amino acid frequencies.
+ *
+ * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
+ * of score probabilities, where (*scoreProb)[0] is
+ * the probability for score *obs_min. The array is
+ * allocated here with calloc; it is not freed by this
+ * routine. [out]
+ * @param *obs_min the smallest score in the score matrix, as found
+ * by s_GetScoreRange [out]
+ * @param *obs_max the largest score in the score matrix, as found
+ * by s_GetScoreRange [out]
+ * @param matrix a amino-acid substitution matrix (not
+ * position-specific)
+ * @param subjectProbArray is an array containing the probability of
+ * occurrence of each residue in the subject
+ * @param queryProbArray is an array containing the probability of
+ * occurrence of each residue in the query
+ * @return 0 on success, -1 on out-of-memory
+ */
+static int
+s_GetMatrixScoreProbs(double **scoreProb, int * obs_min, int * obs_max,
+ int **matrix, const double *subjectProbArray,
+ const double *queryProbArray)
+{
+ int aa; /* index of an amino-acid in the 20 letter
+ alphabet */
+ int irow, jcol; /* matrix row and column indices */
+ double * sprob; /* a pointer to the element of the score
+ probabilities array that represents the
+ probability of the score 0*/
+ int minScore; /* smallest score in matrix; the same value as
+ (*obs_min). */
+ int range; /* the range of scores in the matrix */
+
+ s_GetScoreRange(obs_min, obs_max, matrix, COMPO_PROTEIN_ALPHABET);
+ minScore = *obs_min;
+ range = *obs_max - *obs_min + 1;
+ *scoreProb = calloc(range, sizeof(double));
+ if (*scoreProb == NULL) {
+ return -1;
+ }
+ /* Offset the base pointer so that sprob can be indexed directly by
+ * a score in the range [*obs_min, *obs_max]. */
+ sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
+ for (irow = 0; irow < COMPO_PROTEIN_ALPHABET; irow++) {
+ for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
+ jcol = trueCharPositions[aa];
+ /* Scores below minScore were excluded by s_GetScoreRange
+ * and have no slot in the array; skip them. */
+ if (matrix[irow][jcol] >= minScore) {
+ sprob[matrix[irow][jcol]] +=
+ (queryProbArray[irow] * subjectProbArray[jcol]);
+ }
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * Compute the score probabilities for a given amino acid position-specific
+ * substitution matrix in the context of a given set of subject amino
+ * acid frequencies. Every row (position) of the matrix is given the
+ * same weight, 1/rows.
+ *
+ * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
+ * of score probabilities, where (*scoreProb)[0] is
+ * the probability for score *obs_min. The array is
+ * allocated here with calloc; it is not freed by this
+ * routine. [out]
+ * @param *obs_min the smallest score in the score matrix, as found
+ * by s_GetScoreRange [out]
+ * @param *obs_max the largest score in the score matrix, as found
+ * by s_GetScoreRange [out]
+ * @param matrix a position-specific amino-acid substitution matrix.
+ * @param rows the number of rows in matrix.
+ * @param subjectProbArray is an array containing the probability of
+ * occurrence of each residue in the subject
+ * @return 0 on success, -1 on out-of-memory
+ */
+static int
+s_GetPssmScoreProbs(double ** scoreProb, int * obs_min, int * obs_max,
+ int **matrix, int rows,
+ const double *subjectProbArray)
+{
+ int aa; /* index of an amino-acid in the 20 letter
+ alphabet */
+ int irow, jcol; /* matrix row and column indices */
+ double onePosFrac; /* 1/rows: the weight of one matrix position */
+ double * sprob; /* pointer to the element of the score
+ * probabilities array that represents the
+ * probability of zero */
+ int minScore; /* smallest score in matrix; the same value as
+ (*obs_min). */
+ int range; /* the range of scores in the matrix */
+
+ s_GetScoreRange(obs_min, obs_max, matrix, rows);
+ minScore = *obs_min;
+ range = *obs_max - *obs_min + 1;
+ *scoreProb = calloc(range, sizeof(double));
+ if (*scoreProb == NULL) {
+ return -1;
+ }
+ /* Offset the base pointer so that sprob can be indexed directly by
+ * a score in the range [*obs_min, *obs_max]. */
+ sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
+ onePosFrac = 1.0/ ((double) rows);
+ for (irow = 0; irow < rows; irow++) {
+ for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
+ jcol = trueCharPositions[aa];
+ /* Scores below minScore were excluded by s_GetScoreRange
+ * and have no slot in the array; skip them. */
+ if (matrix[irow][jcol] >= minScore) {
+ sprob[matrix[irow][jcol]] +=
+ onePosFrac * subjectProbArray[jcol];
+ }
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * Compute an integer-valued score matrix from a matrix of
+ * frequencies: each score is the natural logarithm of the
+ * corresponding frequency, divided by Lambda and rounded to the
+ * nearest integer.  A frequency of exactly zero is given the score
+ * COMPO_SCORE_MIN.
+ *
+ * @param matrix     the preallocated score matrix [out]
+ * @param alphsize   the number of rows and columns of matrix and freq
+ * @param freq       the matrix of frequencies [in]
+ * @param Lambda     the desired scale of the matrix
+ */
+void
+Blast_Int4MatrixFromFreq(Int4 **matrix, int alphsize,
+                         double ** freq, double Lambda)
+{
+    int row, col;    /* matrix indices */
+
+    for (row = 0;  row < alphsize;  row++) {
+        const double * freqRow = freq[row];
+        Int4 * scoreRow = matrix[row];
+
+        for (col = 0;  col < alphsize;  col++) {
+            if (0.0 == freqRow[col]) {
+                /* log(0) is not a usable score */
+                scoreRow[col] = COMPO_SCORE_MIN;
+                continue;
+            }
+            scoreRow[col] = Nint(log(freqRow[col]) / Lambda);
+        }
+    }
+}
+
+
+/**
+ * Fill in one row of a score matrix; used by the s_ScaleMatrix
+ * routine to fill in all rows. (@sa s_ScaleMatrix)
+ *
+ * Columns for the gap, X, Selenocysteine and stop characters are
+ * copied from startMatrixRow.  Every other column is computed as
+ * log(frequency ratio) / Lambda * LambdaRatio, rounded to the nearest
+ * integer, or is set to COMPO_SCORE_MIN when the frequency ratio is
+ * exactly zero.
+ *
+ * @param matrixRow       a row of the matrix to be filled in [out].
+ * @param startMatrixRow  a row of rounded amino acid substitution scores in
+ *                        standard context [in]
+ * @param freqRatiosRow   a row of frequency ratios of starting matrix [in]
+ * @param Lambda          a Karlin-Altschul parameter. [in]
+ * @param LambdaRatio     ratio of correct Lambda to its original value [in]
+ */
+static void
+s_ScaleMatrixRow(int *matrixRow, const int *startMatrixRow,
+                 const double *freqRatiosRow,
+                 double Lambda, double LambdaRatio)
+{
+    int col;    /* column index */
+
+    for (col = 0;  col < COMPO_PROTEIN_ALPHABET;  col++) {
+        if (col == eGapChar || col == eXchar ||
+            col == eSelenocysteine || col == eStopChar) {
+            /* Don't scale these nonstandard residues */
+            matrixRow[col] = startMatrixRow[col];
+        } else if (0.0 == freqRatiosRow[col]) {
+            matrixRow[col] = COMPO_SCORE_MIN;
+        } else {
+            double scaled = log(freqRatiosRow[col]);
+            scaled = scaled / Lambda;
+            scaled = scaled * LambdaRatio;
+            matrixRow[col] = Nint(scaled);
+        }
+    }
+}
+
+
+/**
+ * Free memory associated with a Blast_MatrixInfo object: its matrix
+ * name, start matrix and start frequency ratios, and then the object
+ * itself.  On return *ss is NULL.  Nothing is done if *ss is already
+ * NULL.
+ *
+ * @param ss   address of the pointer to the object to be freed
+ */
+void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss)
+{
+    Blast_MatrixInfo * info = *ss;    /* the object being freed */
+
+    if (info == NULL) {
+        return;
+    }
+    free(info->matrixName);
+    Nlm_Int4MatrixFree(&info->startMatrix);
+    Nlm_DenseMatrixFree(&info->startFreqRatios);
+    free(info);
+    *ss = NULL;
+}
+
+
+/**
+ * Create a Blast_MatrixInfo object.
+ *
+ * The start matrix and start frequency ratios are allocated with
+ * rows + 1 rows of COMPO_PROTEIN_ALPHABET columns; the extra final row
+ * is filled with COMPO_SCORE_MIN.  The matrix name is set to NULL and
+ * ungappedLambda to 0.0, so that no field of the new object is left
+ * indeterminate; callers are expected to fill these in.
+ *
+ * @param rows          the number of rows in the matrix, should be
+ *                      COMPO_PROTEIN_ALPHABET unless the matrix is position
+ *                      based, in which case it is the query length
+ * @param positionBased is this matrix position-based?
+ * @return the new object, or NULL if memory could not be allocated
+ */
+Blast_MatrixInfo *
+Blast_MatrixInfoNew(int rows, int positionBased)
+{
+    int i;    /* loop index */
+    Blast_MatrixInfo * ss = malloc(sizeof(Blast_MatrixInfo));
+
+    if (ss == NULL) {
+        return NULL;
+    }
+    ss->rows = rows;
+    ss->positionBased = positionBased;
+    ss->ungappedLambda = 0.0;
+
+    /* Set every pointer before the first allocation that can fail, so
+     * that Blast_MatrixInfoFree may be called on a partial object. */
+    ss->matrixName = NULL;
+    ss->startMatrix = NULL;
+    ss->startFreqRatios = NULL;
+
+    ss->startMatrix = Nlm_Int4MatrixNew(rows + 1, COMPO_PROTEIN_ALPHABET);
+    if (ss->startMatrix == NULL) {
+        Blast_MatrixInfoFree(&ss);
+        return NULL;
+    }
+    ss->startFreqRatios =
+        Nlm_DenseMatrixNew(rows + 1, COMPO_PROTEIN_ALPHABET);
+    if (ss->startFreqRatios == NULL) {
+        Blast_MatrixInfoFree(&ss);
+        return NULL;
+    }
+    for (i = 0;  i < COMPO_PROTEIN_ALPHABET;  i++) {
+        ss->startMatrix[rows][i] = COMPO_SCORE_MIN;
+        ss->startFreqRatios[rows][i] = (double) COMPO_SCORE_MIN;
+    }
+    return ss;
+}
+
+
+/**
+ * Fill in the entries of a score matrix with compositionally adjusted
+ * values. (@sa Blast_CompositionBasedStats)
+ *
+ * For a position-based matrix all ss->rows rows are scaled.
+ * Otherwise the COMPO_PROTEIN_ALPHABET rows are processed: the rows
+ * for the gap, X, Selenocysteine and stop characters are copied
+ * unchanged from ss->startMatrix and all other rows are scaled.
+ *
+ * @param matrix       preallocated matrix to be filled in [out]
+ * @param ss           data used to compute matrix scores
+ * @param LambdaRatio  ratio of correct Lambda to its value in
+ *                     standard context.
+ */
+static void
+s_ScaleMatrix(int **matrix, const Blast_MatrixInfo * ss,
+              double LambdaRatio)
+{
+    int row;        /* index over matrix rows */
+    int numRows;    /* number of rows to fill in */
+
+    numRows = ss->positionBased ? ss->rows : COMPO_PROTEIN_ALPHABET;
+    for (row = 0;  row < numRows;  row++) {
+        /* Rows are residues only when the matrix is not position
+         * based; only then can a row be a nonstandard amino acid. */
+        int nonstandard =
+            !ss->positionBased &&
+            (row == eGapChar || row == eXchar ||
+             row == eSelenocysteine || row == eStopChar);
+
+        if (nonstandard) {
+            /* Do not scale the scores of nonstandard amino acids. */
+            memcpy(matrix[row], ss->startMatrix[row],
+                   COMPO_PROTEIN_ALPHABET * sizeof(int));
+        } else {
+            s_ScaleMatrixRow(matrix[row], ss->startMatrix[row],
+                             ss->startFreqRatios[row],
+                             ss->ungappedLambda, LambdaRatio);
+        }
+    }
+}
+
+
+/** LambdaRatioLowerBound is used when the expected score is too large
+ * causing the Lambda calculation to give a Lambda estimate that
+ * is too small, or to fail entirely returning -1*/
+#define LambdaRatioLowerBound 0.5
+
+
+/**
+ * Use composition-based statistics to adjust the scoring matrix, as
+ * described in
+ *
+ * Schaffer, A.A., Aravind, L., Madden, T.L., Shavirin, S.,
+ * Spouge, J.L., Wolf, Y.I., Koonin, E.V., and Altschul, S.F.
+ * (2001), "Improving the accuracy of PSI-BLAST protein database
+ * searches with composition-based statistics and other
+ * refinements", Nucleic Acids Res. 29:2994-3005.
+ *
+ * @param matrix a scoring matrix to be adjusted [out]
+ * @param *LambdaRatio the ratio of the corrected lambda to the
+ * original lambda, limited to at most 1 and
+ * at least LambdaRatioLowerBound [out]
+ * @param ss data used to compute matrix scores
+ *
+ * @param queryProb amino acid probabilities in the query; used
+ * only when ss is not position based
+ * @param resProb amino acid probabilities in the subject
+ * @param calc_lambda a function that can calculate the
+ * statistical parameter Lambda from a set of
+ * score frequencies.
+ * @return 0 on success, -1 on out of memory
+ */
+int
+Blast_CompositionBasedStats(int ** matrix, double * LambdaRatio,
+ const Blast_MatrixInfo * ss,
+ const double queryProb[], const double resProb[],
+ double (*calc_lambda)(double*,int,int,double))
+{
+ double correctUngappedLambda; /* new value of ungapped lambda */
+ int obs_min, obs_max; /* smallest and largest scores */
+ double *scoreArray; /* score probabilities; freed below */
+ int out_of_memory; /* nonzero if allocation failed */
+
+ if (ss->positionBased) {
+ out_of_memory =
+ s_GetPssmScoreProbs(&scoreArray, &obs_min, &obs_max,
+ ss->startMatrix, ss->rows, resProb);
+ } else {
+ out_of_memory =
+ s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max,
+ ss->startMatrix, resProb, queryProb);
+ }
+ if (out_of_memory)
+ return -1;
+ correctUngappedLambda =
+ calc_lambda(scoreArray, obs_min, obs_max, ss->ungappedLambda);
+
+ /* calc_lambda will return -1 in the case where the
+ * expected score is >=0; however, because of the MAX statement
+ * below, LambdaRatio should always be > 0; the succeeding
+ * test is retained as a vestige, in case one wishes to remove the
+ * MAX statement and allow LambdaRatio to take on the error value
+ * -1 */
+ *LambdaRatio = correctUngappedLambda / ss->ungappedLambda;
+ *LambdaRatio = MIN(1, *LambdaRatio);
+ *LambdaRatio = MAX(*LambdaRatio, LambdaRatioLowerBound);
+
+ if (*LambdaRatio > 0) {
+ s_ScaleMatrix(matrix, ss, *LambdaRatio);
+ }
+ free(scoreArray);
+
+ return 0;
+}
+
+
+/**
+ * Tally the residues of a protein sequence and convert the tallies
+ * into probabilities.
+ *
+ * Residues equal to eXchar are not tallied, and they are also left
+ * out of the length used to normalize the probabilities.
+ *
+ * @param composition   receives the probability of each letter and
+ *                      the number of residues other than eXchar [out]
+ * @param sequence      residue codes; each code is used directly as
+ *                      an array index, so it is assumed to be less
+ *                      than COMPO_PROTEIN_ALPHABET -- TODO confirm
+ *                      with callers
+ * @param length        number of residues in sequence
+ */
+void
+Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,
+                        const Uint1 * sequence, int length)
+{
+    int counts[COMPO_PROTEIN_ALPHABET] = {0};  /* occurrences per letter */
+    int numX = 0;         /* occurrences of eXchar */
+    int numCounted;       /* residues that are not eXchar */
+    int pos;              /* position within sequence */
+    int letter;           /* index into the alphabet */
+
+    for (pos = 0; pos < length; pos++) {
+        const Uint1 residue = sequence[pos];
+        if (residue == eXchar) {
+            numX++;
+        } else {
+            counts[residue]++;
+        }
+    }
+    numCounted = length - numX;
+
+    for (letter = 0; letter < COMPO_PROTEIN_ALPHABET; letter++) {
+        /* A zero tally is stored as exactly 0.0 without dividing, so
+         * a sequence made only of eXchar never divides by zero. */
+        composition->prob[letter] = (counts[letter] == 0)
+            ? 0.0
+            : (double) counts[letter] / (double) numCounted;
+    }
+    composition->numTrueAminoAcids = numCounted;
+}
+
+
+/**
+ * Choose the part of a translated subject sequence over which a
+ * composition should be computed for one alignment.
+ *
+ * Starting from the alignment, the sequence is scanned in each
+ * direction for the nearest eStopChar.  If none is found the range
+ * extends to that end of the sequence.  If one is found, the
+ * endpoint is placed kCompositionMargin positions on the alignment's
+ * side of it, unless that position would not lie strictly outside
+ * the alignment; in that case the alignment's own endpoint is kept.
+ *
+ * @param *pleft, *pright   left and right endpoint of the range [out]
+ * @param subject_data      data from a translated sequence
+ * @param length            length of subject_data
+ * @param start, finish     start and finish (one past the end) of an
+ *                          existing alignment
+ */
+void
+Blast_GetCompositionRange(int * pleft, int * pright,
+                          const Uint1 * subject_data, int length,
+                          int start, int finish)
+{
+    int pos;                 /* scanning position */
+    int left = start;        /* left endpoint to report */
+    int right = finish;      /* right endpoint to report */
+
+    /* Move left until the residue just before pos is a stop, or the
+     * beginning of the sequence is reached. */
+    pos = start;
+    while (pos > 0 && subject_data[pos - 1] != eStopChar) {
+        pos--;
+    }
+    if (pos == 0) {
+        /* No stop codon was found to the left. */
+        left = 0;
+    } else if (pos > 0 && pos + kCompositionMargin < start) {
+        /* The stop is far enough from the alignment to move the
+         * left endpoint. */
+        left = pos + kCompositionMargin;
+    }
+
+    /* Move right until pos rests on a stop, or the end of the
+     * sequence is reached. */
+    pos = finish;
+    while (pos < length && subject_data[pos] != eStopChar) {
+        pos++;
+    }
+    if (pos == length) {
+        /* No stop codon was found to the right. */
+        right = length;
+    } else if (pos < length && pos - kCompositionMargin > finish) {
+        /* The stop is far enough from the alignment to move the
+         * right endpoint. */
+        right = pos - kCompositionMargin;
+    }
+
+    *pleft = left;
+    *pright = right;
+}
+
+
+/**
+ * Free memory associated with a record of type
+ * Blast_CompositionWorkspace and clear the caller's pointer to it.
+ *
+ * @param pNRrecord   address of the pointer to the workspace.  The
+ *                    workspace pointer may be NULL, and individual
+ *                    fields may be NULL (the error path of
+ *                    Blast_CompositionWorkspaceNew relies on this).
+ *                    On return *pNRrecord is NULL. [in][out]
+ */
+void
+Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** pNRrecord)
+{
+    Blast_CompositionWorkspace * NRrecord;
+
+    if (pNRrecord == NULL) {
+        return;
+    }
+    NRrecord = *pNRrecord;
+
+    if (NRrecord != NULL) {
+        free(NRrecord->first_standard_freq);
+        free(NRrecord->second_standard_freq);
+        free(NRrecord->first_seq_freq);
+        free(NRrecord->second_seq_freq);
+        free(NRrecord->first_seq_freq_wpseudo);
+        free(NRrecord->second_seq_freq_wpseudo);
+
+        Nlm_DenseMatrixFree(&NRrecord->score_old);
+        Nlm_DenseMatrixFree(&NRrecord->score_final);
+        Nlm_DenseMatrixFree(&NRrecord->mat_final);
+        Nlm_DenseMatrixFree(&NRrecord->mat_b);
+
+        free(NRrecord);
+    }
+    /* Clear the caller's pointer.  Assigning to pNRrecord itself
+     * would only change this function's local copy and leave the
+     * caller holding a pointer to freed memory. */
+    *pNRrecord = NULL;
+}
+
+
+/**
+ * Create a new Blast_CompositionWorkspace object, allocating memory
+ * for all its component arrays.
+ *
+ * Each frequency vector has COMPO_NUM_TRUE_AA entries, all set to
+ * 0.0; each matrix is COMPO_NUM_TRUE_AA x COMPO_NUM_TRUE_AA.
+ *
+ * @return the new workspace, or NULL if any allocation fails (in
+ *         which case everything allocated so far has been freed)
+ */
+Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew()
+{
+    Blast_CompositionWorkspace * NRrecord;        /* record to allocate
+                                                     and return */
+    int i;                                        /* loop index */
+
+    NRrecord = (Blast_CompositionWorkspace *)
+        malloc(sizeof(Blast_CompositionWorkspace));
+    if (NRrecord == NULL) goto error_return;
+
+    /* Null every pointer first so that the error path can free a
+     * partially built record safely. */
+    NRrecord->first_standard_freq      = NULL;
+    NRrecord->second_standard_freq     = NULL;
+    NRrecord->first_seq_freq           = NULL;
+    NRrecord->second_seq_freq          = NULL;
+    NRrecord->first_seq_freq_wpseudo   = NULL;
+    NRrecord->second_seq_freq_wpseudo  = NULL;
+    NRrecord->score_old                = NULL;
+    NRrecord->score_final              = NULL;
+    NRrecord->mat_final                = NULL;
+    NRrecord->mat_b                    = NULL;
+
+    NRrecord->first_standard_freq =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->first_standard_freq == NULL) goto error_return;
+
+    NRrecord->second_standard_freq =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->second_standard_freq == NULL) goto error_return;
+
+    NRrecord->first_seq_freq =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->first_seq_freq == NULL) goto error_return;
+
+    NRrecord->second_seq_freq =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->second_seq_freq == NULL) goto error_return;
+
+    NRrecord->first_seq_freq_wpseudo =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->first_seq_freq_wpseudo == NULL) goto error_return;
+
+    NRrecord->second_seq_freq_wpseudo =
+        (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
+    if (NRrecord->second_seq_freq_wpseudo == NULL) goto error_return;
+
+    NRrecord->score_old   = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
+                                               COMPO_NUM_TRUE_AA);
+    if (NRrecord->score_old == NULL) goto error_return;
+
+    NRrecord->score_final = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
+                                               COMPO_NUM_TRUE_AA);
+    if (NRrecord->score_final == NULL) goto error_return;
+
+    NRrecord->mat_final   = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
+                                               COMPO_NUM_TRUE_AA);
+    if (NRrecord->mat_final == NULL) goto error_return;
+
+    NRrecord->mat_b       = Nlm_DenseMatrixNew(COMPO_NUM_TRUE_AA,
+                                               COMPO_NUM_TRUE_AA);
+    if (NRrecord->mat_b == NULL) goto error_return;
+
+    for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++) {
+        NRrecord->first_standard_freq[i] =
+            NRrecord->second_standard_freq[i] = 0.0;
+        NRrecord->first_seq_freq[i] = NRrecord->second_seq_freq[i] = 0.0;
+        NRrecord->first_seq_freq_wpseudo[i] =
+            NRrecord->second_seq_freq_wpseudo[i] = 0.0;
+    }
+
+    return NRrecord;
+
+error_return:
+    Blast_CompositionWorkspaceFree(&NRrecord);
+    /* Never hand back the freed record: clear the pointer here rather
+     * than depending on the free routine to have done so. */
+    NRrecord = NULL;
+    return NRrecord;
+}
+
+
+/**
+ * Initialize the fields of a Blast_CompositionWorkspace for a specific
+ * underlying scoring matrix.
+ *
+ * Blast_GetJointProbsForMatrix is asked to fill mat_b and the two
+ * standard frequency vectors for matrixName.  From these, score_old
+ * is set to log(mat_b[i][j] / first_standard_freq[i] /
+ * second_standard_freq[j]) and RE_o_implicit to the sum over all
+ * i, j of mat_b[i][j] times that logarithm.
+ *
+ * @param NRrecord      an allocated workspace [in][out]
+ * @param matrixName    name of the underlying scoring matrix
+ * @return 0 on success; -1 if Blast_GetJointProbsForMatrix returns
+ *         nonzero, in which case a message is written to stderr
+ */
+int
+Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,
+                               const char *matrixName)
+{
+    double entropySum = 0.0;    /* running total for RE_o_implicit */
+    int row, col;               /* matrix indices */
+
+    if (0 != Blast_GetJointProbsForMatrix(NRrecord->mat_b,
+                                          NRrecord->first_standard_freq,
+                                          NRrecord->second_standard_freq,
+                                          matrixName)) {
+        fprintf(stderr,
+                "Matrix %s not currently supported for RE based adjustment\n",
+                matrixName);
+        return -1;
+    }
+
+    for (row = 0;  row < COMPO_NUM_TRUE_AA;  row++) {
+        const double rowFreq = NRrecord->first_standard_freq[row];
+
+        for (col = 0;  col < COMPO_NUM_TRUE_AA;  col++) {
+            const double jointProb = NRrecord->mat_b[row][col];
+            /* The same log-odds value is both the score and the
+             * factor in the relative entropy term. */
+            const double logOdds =
+                log(jointProb / rowFreq /
+                    NRrecord->second_standard_freq[col]);
+
+            entropySum += jointProb * logOdds;
+            NRrecord->score_old[row][col] = logOdds;
+        }
+    }
+    NRrecord->RE_o_implicit = entropySum;
+    return 0;
+}
+
+
+/**
+ * Estimate the scale parameter Lambda for the scores held in a
+ * workspace and, on request, the relative entropy of those scores.
+ *
+ * If NRrecord->flag is eRelEntropyOldMatrixNewContext, the average of
+ * score_old under first_seq_freq and second_seq_freq is computed
+ * first; if that average is >= -SCORE_BOUND the flag is changed to
+ * eUnconstrainedRelEntropy and a message is printed to stdout.
+ *
+ * If compute_re is nonzero, Lambda is then estimated iteratively
+ * (at most LAMBDA_ITERATION_LIMIT steps) from score_old when the flag
+ * is eRelEntropyOldMatrixNewContext and from score_final when it is
+ * eUnconstrainedRelEntropy, and two progress messages are printed to
+ * stdout.
+ *
+ * @param NRrecord workspace holding the scores and sequence
+ * frequencies; its flag field may be changed [in][out]
+ * @param compute_re if zero, no iteration is performed
+ * @param lambdaToReturn receives the estimated Lambda, or 1.0 if
+ * compute_re is zero [out]
+ * @return if compute_re is nonzero, the sum over all letter pairs of
+ * s * exp(s) * first_seq_freq[i] * second_seq_freq[j], where s is
+ * Lambda times the score; otherwise 0.0
+ *
+ * NOTE(review): for flags other than the two named above, the Newton
+ * loop leaves sum and slope at 0.0, so lambda_error is computed with
+ * a zero divisor. The caller visible in this file passes a nonzero
+ * compute_re only for those two modes -- confirm for other callers.
+ *
+ * NOTE(review): when the flag is changed to eUnconstrainedRelEntropy
+ * inside a call with nonzero compute_re, the iteration reads
+ * score_final; whether score_final holds meaningful values at that
+ * point is not visible here -- confirm.
+ */
+static double
+Blast_CalcLambdaForComposition(Blast_CompositionWorkspace * NRrecord,
+ int compute_re,
+ double * lambdaToReturn)
+{
+ int iteration_count; /* counter for number of iterations of
+ Newton's method */
+ int i, j; /* loop indices */
+ double sum; /* used to compute the sum for estimating
+ lambda */
+ double lambda_error; /* error when estimating lambda */
+ double lambda; /* scale parameter of the Extreme Value
+ Distribution of scores */
+ double ave_score; /* average score in new context; assigned only
+ when flag is eRelEntropyOldMatrixNewContext,
+ and read only under that same condition */
+ double slope; /* used to compute the derivative when
+ estimating lambda */
+ double re_to_return; /* relative entropy if using old joint
+ probabilities*/
+
+ lambda_error = 1.0;
+ *lambdaToReturn = 1.0;
+ re_to_return = 0.0;
+
+ if (eRelEntropyOldMatrixNewContext == NRrecord->flag) {
+ ave_score = 0.0;
+ for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
+ for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
+ ave_score +=
+ NRrecord->score_old[i][j] * NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ }
+ }
+ }
+ if ((eRelEntropyOldMatrixNewContext == NRrecord->flag) &&
+ (ave_score >= (-SCORE_BOUND))) {
+ /* fall back to no constraint mode when average score becomes
+ global alignment-like; the new flag value stays in NRrecord
+ and is therefore seen by the caller after this call */
+ NRrecord->flag = eUnconstrainedRelEntropy;
+
+ printf("scoring matrix has nonnegative average score %12.8f,"
+ " reset to mode 0 \n", ave_score);
+ }
+ /* Need to find the relative entropy here. */
+ if (compute_re) {
+ slope = 0.0;
+ lambda = INITIAL_LAMBDA;
+ while(slope <= LAMBDA_ERROR_TOLERANCE) {
+ /* making sure iteration starting point belongs to nontrivial
+ fixed point: lambda is doubled until slope exceeds the
+ tolerance.
+ NOTE(review): slope is not reset between doublings, so it
+ is a running total over every lambda tried so far --
+ confirm that this is intended. */
+ lambda = 2.0 * lambda;
+ for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
+ for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
+ if (eRelEntropyOldMatrixNewContext == NRrecord->flag) {
+ slope +=
+ NRrecord->score_old[i][j] *
+ exp(NRrecord->score_old[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ } else {
+ slope +=
+ NRrecord->score_final[i][j] *
+ exp(NRrecord->score_final[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ }
+ }
+ }
+ }
+ iteration_count = 0;
+ while ((fabs(lambda_error) > LAMBDA_ERROR_TOLERANCE) &&
+ (iteration_count < LAMBDA_ITERATION_LIMIT)) {
+ sum = 0.0;
+ slope = 0.0;
+ for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
+ for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
+ if (eRelEntropyOldMatrixNewContext == NRrecord->flag) {
+ sum +=
+ exp(NRrecord->score_old[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ slope +=
+ NRrecord->score_old[i][j] *
+ exp(NRrecord->score_old[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ } else {
+ if(eUnconstrainedRelEntropy == NRrecord->flag) {
+ sum +=
+ exp(NRrecord->score_final[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ slope +=
+ NRrecord->score_final[i][j] *
+ exp(NRrecord->score_final[i][j] * lambda) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ }
+ }
+ }
+ }
+ lambda_error = (1.0 - sum) / slope;
+ lambda = lambda + LAMBDA_STEP_FRACTION * lambda_error;
+ iteration_count++;
+ }
+ *lambdaToReturn = lambda;
+ printf("Lambda iteration count %d\n", iteration_count );
+ printf("the lambda value = %f \t sum of jp = %12.10f \n", lambda,
+ sum);
+ re_to_return = 0.0;
+ for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
+ for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
+ if (eRelEntropyOldMatrixNewContext == NRrecord->flag) {
+ double scaledScore = lambda * NRrecord->score_old[i][j];
+ re_to_return += scaledScore * exp(scaledScore) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ } else {
+ if (eUnconstrainedRelEntropy == NRrecord->flag) {
+ double scaledScore =
+ lambda * NRrecord->score_final[i][j];
+
+ re_to_return += scaledScore * exp(scaledScore) *
+ NRrecord->first_seq_freq[i] *
+ NRrecord->second_seq_freq[j];
+ }
+ }
+ }
+ }
+ }
+ return re_to_return;
+}
+
+
+/**
+ * Use compositional score matrix adjustment, as described in
+ *
+ *     Altschul, Stephen F., John C. Wootton, E. Michael Gertz, Richa
+ *     Agarwala, Aleksandr Morgulis, Alejandro A. Schaffer, and Yi-Kuo
+ *     Yu (2005) "Protein database searches using compositionally
+ *     adjusted substitution matrices", FEBS J.  272:5101-5109.
+ *
+ * to optimize a score matrix to a given set of letter frequencies.
+ *
+ * The mode of adjustment is read from NRrecord->flag.  Note that the
+ * flag may be changed to eUnconstrainedRelEntropy during this call
+ * (by Blast_CalcLambdaForComposition); when that happens the
+ * optimization is run without a relative-entropy constraint.
+ *
+ * @param length1       adjusted length (not counting X) of the first
+ *                      sequence
+ * @param length2       adjusted length of the second sequence
+ * @param probArray1    letter probabilities for the first sequence,
+ *                      in the 20 letter amino-acid alphabet
+ * @param probArray2    letter probabilities for the second sequence
+ * @param pseudocounts  number of pseudocounts to add to the
+ *                      probabilities for each sequence, before
+ *                      optimizing the scores.
+ * @param specifiedRE   a relative entropy that is used as the
+ *                      constraint of the optimization problem when
+ *                      NRrecord->flag is eUserSpecifiedRelEntropy
+ * @param NRrecord      a Blast_CompositionWorkspace that contains
+ *                      fields used for the composition adjustment and
+ *                      that will hold the output.
+ * @param lambdaComputed    the new computed value of lambda [out]
+ *
+ * @return 0 on success, 1 on failure to converge (or if the computed
+ *         lambda is not positive), -1 for out-of-memory.  An
+ *         unrecognized NRrecord->flag terminates the program.
+ */
+int
+Blast_CompositionMatrixAdj(int length1,
+                           int length2,
+                           const double * probArray1,
+                           const double * probArray2,
+                           int pseudocounts,
+                           double specifiedRE,
+                           Blast_CompositionWorkspace * NRrecord,
+                           double * lambdaComputed)
+{
+    int i;                         /* loop index */
+    double re_o_newcontext;        /* relative entropy implied by
+                                      input single sequence
+                                      probabilities */
+    int new_iterations = 0;        /* number of iterations used by
+                                      Blast_OptimizeTargetFrequencies;
+                                      required as an output argument */
+    int status;                    /* status code for operations that
+                                      may fail */
+    int constrain_rel_entropy;     /* is the relative entropy
+                                      constrained?  Behaves as boolean */
+
+    Blast_ApplyPseudocounts(NRrecord->first_seq_freq_wpseudo,
+                            NRrecord->first_seq_freq, probArray1, length1,
+                            NRrecord->first_standard_freq, pseudocounts);
+    /* plug in frequencies for second sequence, will be the matching
+       sequence in BLAST */
+    Blast_ApplyPseudocounts(NRrecord->second_seq_freq_wpseudo,
+                            NRrecord->second_seq_freq, probArray2, length2,
+                            NRrecord->second_standard_freq, pseudocounts);
+    *lambdaComputed = 1.0;
+    re_o_newcontext =
+        Blast_CalcLambdaForComposition(
+            NRrecord, (NRrecord->flag == eRelEntropyOldMatrixNewContext),
+            lambdaComputed);
+
+    /* Decide whether to constrain only now: the call above may have
+     * fallen back to eUnconstrainedRelEntropy, and in that case
+     * RE_final below is a placeholder that must not be applied as a
+     * constraint. */
+    constrain_rel_entropy = eUnconstrainedRelEntropy != NRrecord->flag;
+
+    switch (NRrecord->flag) {
+    case eUnconstrainedRelEntropy:
+        /* Initialize to an arbitrary value; it won't be used */
+        NRrecord->RE_final = 0.0;
+        break;
+    case eRelEntropyOldMatrixNewContext:
+        NRrecord->RE_final = re_o_newcontext;
+        break;
+    case eRelEntropyOldMatrixOldContext:
+        NRrecord->RE_final = NRrecord->RE_o_implicit;
+        break;
+    case eUserSpecifiedRelEntropy:
+        NRrecord->RE_final = specifiedRE;
+        break;
+    default:  /* I assert that we can't get here */
+        fprintf(stderr, "Unknown flag for setting relative entropy"
+                " in composition matrix adjustment\n");
+        exit(1);
+    }
+    status =
+        Blast_OptimizeTargetFrequencies(&NRrecord->mat_final[0][0],
+                                        COMPO_NUM_TRUE_AA,
+                                        &new_iterations,
+                                        &NRrecord->mat_b[0][0],
+                                        NRrecord->first_seq_freq_wpseudo,
+                                        NRrecord->second_seq_freq_wpseudo,
+                                        constrain_rel_entropy,
+                                        NRrecord->RE_final,
+                                        kCompoAdjustErrTolerance,
+                                        kCompoAdjustIterationLimit);
+
+    if (status == 0) {
+        Blast_ScoreMatrixFromFreq(NRrecord->score_final,
+                                  COMPO_NUM_TRUE_AA,
+                                  NRrecord->mat_final,
+                                  NRrecord->first_seq_freq_wpseudo,
+                                  NRrecord->second_seq_freq_wpseudo);
+        if (NRrecord->flag == eUnconstrainedRelEntropy) {
+            /* Compute the unconstrained relative entropy */
+            (void) Blast_CalcLambdaForComposition(NRrecord, 1, lambdaComputed);
+        }
+        /* success if and only if the computed lambda is positive */
+        status = (*lambdaComputed > 0) ? 0 : 1;
+    } else if (status != -1) {
+        /* Iteration did not converge (-1, out of memory, is passed
+         * through unchanged) */
+        fprintf(stderr, "bad probabilities from sequence 1, length %d\n",
+                length1);
+        for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++)
+            fprintf(stderr, "%15.12f\n", probArray1[i]);
+        fprintf(stderr, "bad probabilities from sequence 2, length %d\n",
+                length2);
+        for (i = 0;  i < COMPO_NUM_TRUE_AA;  i++)
+            fprintf(stderr, "%15.12f\n", probArray2[i]);
+        fflush(stderr);
+        status = 1;
+    }
+    return status;
+}
+
+
+/**
+ * Compute a compositionally adjusted scoring matrix.
+ *
+ * Position-based matrices, and any call with RE_rule == 0, always use
+ * Blast_CompositionBasedStats.  Otherwise
+ * Blast_ChooseCompoAdjustMode selects the mode; if it selects
+ * eCompoKeepOldMatrix, Blast_CompositionBasedStats is used as well,
+ * and in every other mode the matrix is produced by
+ * Blast_CompositionMatrixAdj.
+ *
+ * @param matrix        the adjusted matrix [out]
+ * @param query_composition       composition of the query sequence
+ * @param queryLength             length of the query sequence
+ * @param subject_composition     composition of the subject (database)
+ *                                sequence
+ * @param subjectLength           length of the subject sequence
+ * @param matrixInfo    information about the underlying,
+ *                      non-adjusted, scoring matrix.
+ * @param RE_rule       the rule to use for computing the scoring
+ *                      matrix
+ * @param RE_pseudocounts    the number of pseudocounts to use in some
+ *                           rules of composition adjustment
+ * @param NRrecord      workspace used to perform compositional
+ *                      adjustment
+ * @param *whichMode    which mode of compositional adjustment was
+ *                      actually used [out]
+ * @param calc_lambda   a function that can calculate the statistical
+ *                      parameter Lambda from a set of score
+ *                      frequencies.
+ * @return              0 for success, 1 for failure to converge,
+ *                      -1 for out of memory
+ */
+int
+Blast_AdjustScores(int ** matrix,
+                   const Blast_AminoAcidComposition * query_composition,
+                   int queryLength,
+                   const Blast_AminoAcidComposition * subject_composition,
+                   int subjectLength,
+                   const Blast_MatrixInfo * matrixInfo,
+                   int RE_rule,
+                   int RE_pseudocounts,
+                   Blast_CompositionWorkspace *NRrecord,
+                   ECompoAdjustModes *whichMode,
+                   double calc_lambda(double *,int,int,double))
+{
+    double LambdaRatio;            /* the ratio of the corrected
+                                      lambda to the original lambda */
+    double correctUngappedLambda;  /* new value of ungapped lambda */
+    /* The next two arrays are letter probabilities of query and
+     * match in 20 letter ARND... alphabet. */
+    double permutedQueryProbs[COMPO_NUM_TRUE_AA];
+    double permutedMatchProbs[COMPO_NUM_TRUE_AA];
+    double ** REscoreMatrix;       /* adjusted scores in floating point */
+    int status;                    /* status of the matrix adjustment */
+
+    if (matrixInfo->positionBased || RE_rule == 0) {
+        /* Use old-style composition-based statistics unconditionally. */
+        *whichMode = eCompoKeepOldMatrix;
+        return Blast_CompositionBasedStats(matrix, &LambdaRatio,
+                                           matrixInfo,
+                                           query_composition->prob,
+                                           subject_composition->prob,
+                                           calc_lambda);
+    }
+
+    /* Otherwise let the mode-selection code choose how to adjust. */
+    s_GatherLetterProbs(permutedQueryProbs, query_composition->prob);
+    s_GatherLetterProbs(permutedMatchProbs, subject_composition->prob);
+
+    *whichMode =
+        Blast_ChooseCompoAdjustMode(queryLength, subjectLength,
+                                    permutedQueryProbs,
+                                    permutedMatchProbs,
+                                    matrixInfo->matrixName,
+                                    RE_rule-1);
+    if (eCompoKeepOldMatrix == *whichMode) {
+        /* The selection chose composition-based statistics */
+        return Blast_CompositionBasedStats(matrix, &LambdaRatio,
+                                           matrixInfo,
+                                           query_composition->prob,
+                                           subject_composition->prob,
+                                           calc_lambda);
+    }
+
+    /* Use compositionally adjusted scoring matrices */
+    REscoreMatrix = Nlm_DenseMatrixNew(COMPO_PROTEIN_ALPHABET,
+                                       COMPO_PROTEIN_ALPHABET);
+    if (REscoreMatrix == NULL) {
+        /* Report the failed allocation; returning 0 here would tell
+         * the caller that matrix had been adjusted when it had not. */
+        return -1;
+    }
+    NRrecord->flag = *whichMode;
+    status =
+        Blast_CompositionMatrixAdj(query_composition->numTrueAminoAcids,
+                                   subject_composition->numTrueAminoAcids,
+                                   permutedQueryProbs,
+                                   permutedMatchProbs,
+                                   RE_pseudocounts,
+                                   kFixedReBlosum62,
+                                   NRrecord,
+                                   &correctUngappedLambda);
+    if (status == 0) {
+        LambdaRatio = correctUngappedLambda / matrixInfo->ungappedLambda;
+        if (LambdaRatio <= 0) {
+            status = 1;
+        }
+    }
+    if (status == 0) {
+        s_ScatterScores(REscoreMatrix, LambdaRatio, NRrecord->score_final);
+        /* scale matrix in floating point, then round into matrix */
+        Blast_SetNonstandardAaScores(REscoreMatrix,
+                                     matrixInfo->startMatrix);
+        s_RoundScoreMatrix(matrix, REscoreMatrix, COMPO_PROTEIN_ALPHABET);
+    }
+    Nlm_DenseMatrixFree(&REscoreMatrix);
+
+    return status;
+}
diff --git a/algo/blast/composition_adjustment/composition_adjustment.h b/algo/blast/composition_adjustment/composition_adjustment.h
new file mode 100644
index 00000000..1d38d0f3
--- /dev/null
+++ b/algo/blast/composition_adjustment/composition_adjustment.h
@@ -0,0 +1,168 @@
+/* $Id: composition_adjustment.h,v 1.6 2005/12/01 13:54:04 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file composition_adjustment.h
+ * @author E. Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu
+ *
+ * Definitions used in compositional score matrix adjustment
+ */
+
+#ifndef __COMPOSITION_ADJUSTMENT__
+#define __COMPOSITION_ADJUSTMENT__
+
+#include <algo/blast/core/blast_export.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/compo_mode_condition.h>
+
+/* Number of standard amino acids */
+#define COMPO_NUM_TRUE_AA 20
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Some characters in the 26 letter NCBIstdaa alphabet, including
+ ambiguity characters, selenocysteine and the stop character.
+ Each value is a residue code that is compared against the bytes
+ of a sequence: eXchar is the residue that Blast_ReadAaComposition
+ leaves out of a composition, and eStopChar is the residue that
+ Blast_GetCompositionRange searches for on either side of an
+ alignment. */
+enum { eGapChar = 0, eBchar = 2, eDchar = 4, eEchar = 5, eNchar = 13,
+ eQchar = 15, eXchar = 21, eZchar = 23, eSelenocysteine = 24,
+ eStopChar = 25};
+
+/**
+ * Represents the composition of an amino-acid sequence, as filled in
+ * by Blast_ReadAaComposition */
+struct Blast_AminoAcidComposition {
+ double prob[26]; /**< probabilities of each amino acid, including
+ nonstandard amino acids, indexed by residue
+ code. Blast_ReadAaComposition writes
+ COMPO_PROTEIN_ALPHABET entries, so the
+ size 26 must stay equal to that constant. */
+ int numTrueAminoAcids; /**< number of true amino acids in the sequence,
+ omitting X characters; this is also the
+ divisor used to compute prob */
+};
+typedef struct Blast_AminoAcidComposition Blast_AminoAcidComposition;
+
+NCBI_XBLAST_EXPORT
+void
+Blast_ReadAaComposition(Blast_AminoAcidComposition * composition,
+ const Uint1 * sequence, int length);
+
+struct Blast_MatrixInfo {
+ char * matrixName; /**< name of the matrix; Blast_AdjustScores
+ passes it to Blast_ChooseCompoAdjustMode */
+ Int4 **startMatrix; /**< Rescaled values of the original matrix */
+ double **startFreqRatios; /**< frequency ratios used to calculate matrix
+ scores */
+ int rows; /**< the number of rows in the scoring
+ matrix; used by
+ Blast_CompositionBasedStats when
+ positionBased is nonzero */
+ int positionBased; /**< is the matrix position-based; if nonzero,
+ Blast_AdjustScores always uses
+ Blast_CompositionBasedStats */
+ double ungappedLambda; /**< ungapped Lambda value for this matrix
+ in standard context; the divisor of the
+ Lambda ratios computed when adjusting
+ scores */
+};
+typedef struct Blast_MatrixInfo Blast_MatrixInfo;
+
+NCBI_XBLAST_EXPORT
+Blast_MatrixInfo * Blast_MatrixInfoNew(int rows, int positionBased);
+
+NCBI_XBLAST_EXPORT
+void Blast_MatrixInfoFree(Blast_MatrixInfo ** ss);
+
+/** Work arrays used to perform composition-based matrix adjustment.
+ * Blast_CompositionWorkspaceNew allocates every vector with
+ * COMPO_NUM_TRUE_AA entries and every matrix as COMPO_NUM_TRUE_AA x
+ * COMPO_NUM_TRUE_AA. */
+typedef struct Blast_CompositionWorkspace {
+ int flag; /**< determines which of the optimization
+ problems are solved; holds an
+ ECompoAdjustModes value, assigned by
+ Blast_AdjustScores and possibly reset to
+ eUnconstrainedRelEntropy while Lambda is
+ being computed */
+ double ** mat_b; /**< joint probabilities for the matrix in
+ standard context */
+ double ** score_old; /**< score of the matrix in standard context
+ with scale Lambda == 1 */
+ double ** mat_final; /**< optimized target frequencies */
+ double ** score_final; /**< optimized score matrix */
+
+ double RE_final; /**< the relative entropy passed to the
+ optimization: re_o_newcontext,
+ RE_o_implicit or a caller-specified
+ value, depending on flag; set to 0.0 as
+ a placeholder in unconstrained mode */
+ double RE_o_implicit; /**< used for eRelEntropyOldMatrixOldContext
+ mode; computed by
+ Blast_CompositionWorkspaceInit */
+
+ double * first_seq_freq; /**< freq vector of first seq */
+ double * second_seq_freq; /**< freq. vector for the second. */
+ double * first_standard_freq; /**< background freq vector of first
+ seq using matrix */
+ double * second_standard_freq; /**< background freq vector for
+ the second. */
+ double * first_seq_freq_wpseudo; /**< freq vector of first seq
+ w/pseudocounts */
+ double * second_seq_freq_wpseudo; /**< freq. vector for the
+ second seq w/pseudocounts */
+} Blast_CompositionWorkspace;
+
+NCBI_XBLAST_EXPORT
+Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew();
+
+NCBI_XBLAST_EXPORT
+int Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace * NRrecord,
+ const char *matrixName);
+
+NCBI_XBLAST_EXPORT
+void Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace ** NRrecord);
+
+NCBI_XBLAST_EXPORT
+void Blast_GetCompositionRange(int * pleft, int * pright,
+ const Uint1 * subject_data, int length,
+ int start, int finish);
+NCBI_XBLAST_EXPORT
+int
+Blast_CompositionBasedStats(Int4 ** matrix, double * LambdaRatio,
+ const Blast_MatrixInfo * ss,
+ const double queryProb[], const double resProb[],
+ double (*calc_lambda)(double*,int,int,double));
+
+NCBI_XBLAST_EXPORT
+int Blast_CompositionMatrixAdj(int length1, int length2,
+ const double *probArray1,
+ const double *probArray2,
+ int pseudocounts, double specifiedRE,
+ Blast_CompositionWorkspace * NRrecord,
+ double * lambdaComputed);
+
+NCBI_XBLAST_EXPORT
+int
+Blast_AdjustScores(Int4 ** matrix,
+ const Blast_AminoAcidComposition * query_composition,
+ int queryLength,
+ const Blast_AminoAcidComposition * subject_composition,
+ int subjectLength,
+ const Blast_MatrixInfo * matrixInfo,
+ int RE_rule,
+ int RE_pseudocounts,
+ Blast_CompositionWorkspace *NRrecord,
+ ECompoAdjustModes *whichMode,
+ double calc_lambda(double *,int,int,double));
+
+NCBI_XBLAST_EXPORT
+void Blast_Int4MatrixFromFreq(Int4 **matrix, int alphsize,
+ double ** freq, double Lambda);
+
+NCBI_XBLAST_EXPORT
+double Blast_GetRelativeEntropy(const double A[], const double B[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/algo/blast/composition_adjustment/composition_constants.h b/algo/blast/composition_adjustment/composition_constants.h
new file mode 100644
index 00000000..48e8152b
--- /dev/null
+++ b/algo/blast/composition_adjustment/composition_constants.h
@@ -0,0 +1,60 @@
+/* $Id: composition_constants.h,v 1.1 2005/12/01 13:52:20 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file composition_constants.h
+ * @author E. Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu
+ *
+ * Constants used in compositional score matrix adjustment
+ */
+
+
+#ifndef __COMPOSITION_CONSTANTS__
+#define __COMPOSITION_CONSTANTS__
+
+#include <algo/blast/core/ncbi_std.h>
+
+/** Number of standard amino acids */
+#define COMPO_NUM_TRUE_AA 20
+
+/** Number of amino acids, including nonstandard ones */
+#define COMPO_PROTEIN_ALPHABET 26
+
+/** Minimum score in a matrix */
+#define COMPO_SCORE_MIN INT2_MIN
+
+/* A collection of constants that specify all permissible
+ * modes of composition adjustment. The four relative-entropy modes
+ * differ in the value that Blast_CompositionMatrixAdj uses as the
+ * relative-entropy constraint. */
+enum ECompoAdjustModes {
+ eNoCompositionAdjustment = (-1),
+ eCompoKeepOldMatrix = 0, /**< rescale the existing matrix with Blast_CompositionBasedStats */
+ eUnconstrainedRelEntropy = 1, /**< optimize without a relative-entropy constraint */
+ eRelEntropyOldMatrixNewContext = 2, /**< constrain to the entropy computed from the old scores and the new sequence frequencies */
+ eRelEntropyOldMatrixOldContext = 3, /**< constrain to the workspace's RE_o_implicit */
+ eUserSpecifiedRelEntropy = 4, /**< constrain to the specifiedRE argument */
+ eNumCompoAdjustModes /**< one more than the last mode value */
+};
+typedef enum ECompoAdjustModes ECompoAdjustModes;
+
+#endif
diff --git a/algo/blast/composition_adjustment/matrix_frequency_data.c b/algo/blast/composition_adjustment/matrix_frequency_data.c
new file mode 100644
index 00000000..176d7f31
--- /dev/null
+++ b/algo/blast/composition_adjustment/matrix_frequency_data.c
@@ -0,0 +1,230 @@
+/* ===========================================================================
+*
+* PUBLIC DOMAIN NOTICE
+* National Center for Biotechnology Information
+*
+* This software/database is a "United States Government Work" under the
+* terms of the United States Copyright Act. It was written as part of
+* the author's official duties as a United States Government employee and
+* thus cannot be copyrighted. This software/database is freely available
+* to the public for use. The National Library of Medicine and the U.S.
+* Government have not placed any restriction on its use or reproduction.
+*
+* Although all reasonable efforts have been taken to ensure the accuracy
+* and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that
+* may be obtained by using this software or data. The NLM and the U.S.
+* Government disclaim all warranties, express or implied, including
+* warranties of performance, merchantability or fitness for any particular
+* purpose.
+*
+* Please cite the author in any work or product based on this material.
+*
+* ===========================================================================*/
+
+/** @file matrix_frequency_data.c
+ *
+ * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz
+ *
+ * Joint probabilities for specific matrices.
+ */
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: matrix_frequency_data.c,v 1.1 2005/12/01 13:48:09 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <stdlib.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/composition_constants.h>
+#include <algo/blast/composition_adjustment/matrix_frequency_data.h>
+
+/* bound on error for sum of probabilities*/
+static const double kProbSumTolerance = 0.000000001;
+
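+/* Joint probabilities for BLOSUM62: a COMPO_NUM_TRUE_AA x COMPO_NUM_TRUE_AA
+ * table whose entries should sum to one; Blast_GetJointProbsForMatrix
+ * asserts this to within kProbSumTolerance. */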
+static double
+BLOSUM62_JOINT_PROBS[COMPO_NUM_TRUE_AA][COMPO_NUM_TRUE_AA]
+= {
+ {0.021516461557, 0.002341028532, 0.001941062549, 0.002160193055,
+ 0.001595828537, 0.001934059173, 0.002990874959, 0.005831307116,
+ 0.001108651421, 0.003181451207, 0.004450432543, 0.003350994862,
+ 0.001330482798, 0.001634084433, 0.002159278003, 0.006261897426,
+ 0.003735752688, 0.000404784037, 0.001298558985, 0.005124343367},
+ {0.002341028532, 0.017737158563, 0.001969132731, 0.001581985934,
+ 0.000393496788, 0.002483620870, 0.002678135197, 0.001721914295,
+ 0.001230766890, 0.001239704106, 0.002418976127, 0.006214150782,
+ 0.000796884039, 0.000932356719, 0.000959872904, 0.002260870847,
+ 0.001779897849, 0.000265310579, 0.000918577576, 0.001588408095},
+ {0.001941062549, 0.001969132731, 0.014105369019, 0.003711182199,
+ 0.000436559586, 0.001528401416, 0.002205231268, 0.002856026580,
+ 0.001423459827, 0.000986015608, 0.001369776043, 0.002436729322,
+ 0.000521972796, 0.000746722150, 0.000858953243, 0.003131380307,
+ 0.002237168191, 0.000161021675, 0.000695990541, 0.001203509685},
+ {0.002160193055, 0.001581985934, 0.003711182199, 0.021213070328,
+ 0.000397349231, 0.001642988683, 0.004909362115, 0.002510933422,
+ 0.000948355160, 0.001226071189, 0.001524412852, 0.002443951825,
+ 0.000458902921, 0.000759393269, 0.001235481304, 0.002791458183,
+ 0.001886707235, 0.000161498946, 0.000595157039, 0.001320931409},
+ {0.001595828537, 0.000393496788, 0.000436559586, 0.000397349231,
+ 0.011902428201, 0.000309689150, 0.000380965445, 0.000768969543,
+ 0.000229437747, 0.001092222651, 0.001570843250, 0.000500631539,
+ 0.000373569136, 0.000512643056, 0.000360439075, 0.001038049531,
+ 0.000932287369, 0.000144869300, 0.000344932387, 0.001370634611},
+ {0.001934059173, 0.002483620870, 0.001528401416, 0.001642988683,
+ 0.000309689150, 0.007348611171, 0.003545322222, 0.001374101100,
+ 0.001045402587, 0.000891574240, 0.001623152279, 0.003116305001,
+ 0.000735592074, 0.000544610751, 0.000849940593, 0.001893917959,
+ 0.001381521088, 0.000228499204, 0.000674510708, 0.001174481769},
+ {0.002990874959, 0.002678135197, 0.002205231268, 0.004909362115,
+ 0.000380965445, 0.003545322222, 0.016058942448, 0.001941788215,
+ 0.001359354087, 0.001208575016, 0.002010620195, 0.004137352463,
+ 0.000671608129, 0.000848058651, 0.001418534945, 0.002949177015,
+ 0.002049363253, 0.000264084965, 0.000864998825, 0.001706373779},
+ {0.005831307116, 0.001721914295, 0.002856026580, 0.002510933422,
+ 0.000768969543, 0.001374101100, 0.001941788215, 0.037833882792,
+ 0.000956438296, 0.001381594180, 0.002100349645, 0.002551728599,
+ 0.000726329019, 0.001201930393, 0.001363538639, 0.003819521365,
+ 0.002185818204, 0.000406753457, 0.000831463001, 0.001832653843},
+ {0.001108651421, 0.001230766890, 0.001423459827, 0.000948355160,
+ 0.000229437747, 0.001045402587, 0.001359354087, 0.000956438296,
+ 0.009268821027, 0.000575006579, 0.000990341860, 0.001186603601,
+ 0.000377383962, 0.000807129053, 0.000477177871, 0.001100800912,
+ 0.000744015818, 0.000151511190, 0.001515361861, 0.000650302833},
+ {0.003181451207, 0.001239704106, 0.000986015608, 0.001226071189,
+ 0.001092222651, 0.000891574240, 0.001208575016, 0.001381594180,
+ 0.000575006579, 0.018297094930, 0.011372374833, 0.001566332194,
+ 0.002471405322, 0.003035353009, 0.001002322534, 0.001716150165,
+ 0.002683992649, 0.000360556333, 0.001366091300, 0.011965802769},
+ {0.004450432543, 0.002418976127, 0.001369776043, 0.001524412852,
+ 0.001570843250, 0.001623152279, 0.002010620195, 0.002100349645,
+ 0.000990341860, 0.011372374833, 0.037325284430, 0.002482344486,
+ 0.004923694031, 0.005449900864, 0.001421696216, 0.002434190706,
+ 0.003337092433, 0.000733421681, 0.002210504676, 0.009545821406},
+ {0.003350994862, 0.006214150782, 0.002436729322, 0.002443951825,
+ 0.000500631539, 0.003116305001, 0.004137352463, 0.002551728599,
+ 0.001186603601, 0.001566332194, 0.002482344486, 0.016147683460,
+ 0.000901118905, 0.000950170174, 0.001578353818, 0.003104386139,
+ 0.002360691115, 0.000272260749, 0.000996404634, 0.001952015271},
+ {0.001330482798, 0.000796884039, 0.000521972796, 0.000458902921,
+ 0.000373569136, 0.000735592074, 0.000671608129, 0.000726329019,
+ 0.000377383962, 0.002471405322, 0.004923694031, 0.000901118905,
+ 0.003994917914, 0.001184353682, 0.000404888644, 0.000847632455,
+ 0.001004584462, 0.000197602804, 0.000563431813, 0.002301832938},
+ {0.001634084433, 0.000932356719, 0.000746722150, 0.000759393269,
+ 0.000512643056, 0.000544610751, 0.000848058651, 0.001201930393,
+ 0.000807129053, 0.003035353009, 0.005449900864, 0.000950170174,
+ 0.001184353682, 0.018273718971, 0.000525642239, 0.001195904180,
+ 0.001167245623, 0.000851298193, 0.004226922511, 0.002601386501},
+ {0.002159278003, 0.000959872904, 0.000858953243, 0.001235481304,
+ 0.000360439075, 0.000849940593, 0.001418534945, 0.001363538639,
+ 0.000477177871, 0.001002322534, 0.001421696216, 0.001578353818,
+ 0.000404888644, 0.000525642239, 0.019101516083, 0.001670397698,
+ 0.001352022511, 0.000141505490, 0.000450817134, 0.001257818591},
+ {0.006261897426, 0.002260870847, 0.003131380307, 0.002791458183,
+ 0.001038049531, 0.001893917959, 0.002949177015, 0.003819521365,
+ 0.001100800912, 0.001716150165, 0.002434190706, 0.003104386139,
+ 0.000847632455, 0.001195904180, 0.001670397698, 0.012524165008,
+ 0.004695393160, 0.000286147117, 0.001025667373, 0.002373134246},
+ {0.003735752688, 0.001779897849, 0.002237168191, 0.001886707235,
+ 0.000932287369, 0.001381521088, 0.002049363253, 0.002185818204,
+ 0.000744015818, 0.002683992649, 0.003337092433, 0.002360691115,
+ 0.001004584462, 0.001167245623, 0.001352022511, 0.004695393160,
+ 0.012524453183, 0.000287144142, 0.000940528155, 0.003660378402},
+ {0.000404784037, 0.000265310579, 0.000161021675, 0.000161498946,
+ 0.000144869300, 0.000228499204, 0.000264084965, 0.000406753457,
+ 0.000151511190, 0.000360556333, 0.000733421681, 0.000272260749,
+ 0.000197602804, 0.000851298193, 0.000141505490, 0.000286147117,
+ 0.000287144142, 0.006479671265, 0.000886553355, 0.000357440337},
+ {0.001298558985, 0.000918577576, 0.000695990541, 0.000595157039,
+ 0.000344932387, 0.000674510708, 0.000864998825, 0.000831463001,
+ 0.001515361861, 0.001366091300, 0.002210504676, 0.000996404634,
+ 0.000563431813, 0.004226922511, 0.000450817134, 0.001025667373,
+ 0.000940528155, 0.000886553355, 0.010185916203, 0.001555728244},
+ {0.005124343367, 0.001588408095, 0.001203509685, 0.001320931409,
+ 0.001370634611, 0.001174481769, 0.001706373779, 0.001832653843,
+ 0.000650302833, 0.011965802769, 0.009545821406, 0.001952015271,
+ 0.002301832938, 0.002601386501, 0.001257818591, 0.002373134246,
+ 0.003660378402, 0.000357440337, 0.001555728244, 0.019815247974}
+};
+
+
+
+/* Background frequencies for BLOSUM62, one entry per standard amino
+ * acid (COMPO_NUM_TRUE_AA entries); this is the array handed out by
+ * Blast_GetMatrixBackgroundFreq. */
+static double BLOSUM62_bg[COMPO_NUM_TRUE_AA] =
+ { 0.0742356686, 0.0515874541, 0.0446395713, 0.0536092024, 0.0246865086,
+ 0.0342500470, 0.0543174458, 0.0741431988, 0.0262119099, 0.0679331197,
+ 0.0989057232, 0.0581774322, 0.0249972837, 0.0473970070, 0.0385382904,
+ 0.0572279733, 0.0508996546, 0.0130298868, 0.0322925130, 0.0729201182
+ };
+
+
+/**
+ * Report whether frequency data is available for a score matrix.
+ *
+ * @param matrix_name   the name of the matrix sought [in]
+ * @returns nonzero if Blast_GetMatrixBackgroundFreq yields a non-NULL
+ *          array for matrix_name; zero otherwise.
+ */
+int Blast_FrequencyDataIsAvailable(const char *matrix_name)
+{
+    const double * background = Blast_GetMatrixBackgroundFreq(matrix_name);
+
+    return background != NULL;
+}
+
+
+/**
+ * Retrieve the background letter probabilities implicitly used in
+ * constructing the score matrix matrix_name.
+ *
+ * @param matrix_name   the name of the matrix sought; NULL is accepted
+ *                      and treated as an unsupported matrix [in]
+ * @returns the background frequencies (COMPO_NUM_TRUE_AA entries), or
+ *          NULL, after writing a diagnostic to stderr, if the matrix
+ *          is not supported.
+ */
+const double *
+Blast_GetMatrixBackgroundFreq(const char *matrix_name)
+{
+    /* Test for NULL first: handing strcmp a null pointer is undefined */
+    if (matrix_name != NULL && 0 == strcmp(matrix_name, "BLOSUM62")) {
+        return BLOSUM62_bg;
+    }
+    /* This routine only reports the failure and returns; the caller
+     * decides what happens next, so the message names the matrix and
+     * makes no claim that the program exits. */
+    fprintf(stderr, "matrix %s is not supported\n",
+            matrix_name != NULL ? matrix_name : "(null)");
+    return NULL;
+}
+
+
+/**
+ * Look up the stored joint probabilities for a named score matrix and
+ * report them, divided by their total, together with the sums of the
+ * rows and of the columns of the result.
+ *
+ * @param probs       receives the joint probabilities [out]
+ * @param row_sums    receives the sum of each row of probs [out]
+ * @param col_sums    receives the sum of each column of probs [out]
+ * @param matrixName  the name of the matrix sought [in]
+ * @returns 0 if successful; -1, after writing a message to stderr,
+ *          if the named matrix is not known.
+ */
+int
+Blast_GetJointProbsForMatrix(double ** probs, double row_sums[],
+                             double col_sums[], const char *matrixName)
+{
+    /* The stored table for the selected matrix */
+    double (*table)[COMPO_NUM_TRUE_AA];
+    double total = 0.0;      /* sum of every entry of the table */
+    int row, col;            /* indices into the table */
+
+    if (0 != strcmp("BLOSUM62", matrixName)) {
+        fprintf(stderr, "matrix %s is not supported "
+                "for RE based adjustment\n", matrixName);
+        return -1;
+    }
+    table = BLOSUM62_JOINT_PROBS;
+
+    for (row = 0; row < COMPO_NUM_TRUE_AA; row++) {
+        for (col = 0; col < COMPO_NUM_TRUE_AA; col++) {
+            total += table[row][col];
+        }
+    }
+    /* The stored data is expected to sum to one already */
+    assert(fabs(total - 1.0) < kProbSumTolerance);
+
+    /* Column sums accumulate across rows, so clear them all up front */
+    for (col = 0; col < COMPO_NUM_TRUE_AA; col++) {
+        col_sums[col] = 0.0;
+    }
+    for (row = 0; row < COMPO_NUM_TRUE_AA; row++) {
+        row_sums[row] = 0.0;
+        for (col = 0; col < COMPO_NUM_TRUE_AA; col++) {
+            double scaled = table[row][col] / total;
+
+            probs[row][col] = scaled;
+            row_sums[row] += scaled;
+            col_sums[col] += scaled;
+        }
+    }
+    return 0;
+}
diff --git a/algo/blast/composition_adjustment/matrix_frequency_data.h b/algo/blast/composition_adjustment/matrix_frequency_data.h
new file mode 100644
index 00000000..cd275c78
--- /dev/null
+++ b/algo/blast/composition_adjustment/matrix_frequency_data.h
@@ -0,0 +1,54 @@
+/* $Id: matrix_frequency_data.h,v 1.1 2005/12/01 13:52:20 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file matrix_frequency_data.h
+ * @author Alejandro Schaffer, E. Michael Gertz
+ *
+ * Definitions used to get joint probabilities for a scoring matrix
+ */
+#ifndef __BLAST_JOINT_PROBS__
+#define __BLAST_JOINT_PROBS__
+
+#include <algo/blast/core/blast_export.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get joint probabilities for the named matrix.
+ *
+ * @param probs the joint probabilities [out]
+ * @param row_sums sum of the values in each row of probs [out]
+ * @param col_sums sum of the values in each column of probs [out]
+ * @param matrixName the name of the matrix sought [in]
+ * @returns 0 if successful; -1 if the named matrix is not known.
+ */
+NCBI_XBLAST_EXPORT
+int Blast_GetJointProbsForMatrix(double ** probs, double row_sums[],
+ double col_sums[], const char *matrixName);
+
+/** Retrieve the background letter probabilities implicitly used in
+ * constructing the score matrix matrix_name; the implementation in
+ * matrix_frequency_data.c returns NULL for a name it does not recognize. */
+NCBI_XBLAST_EXPORT
+const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name);
+
+/** Test whether frequency data exists for matrix_name; the implementation
+ * returns nonzero exactly when Blast_GetMatrixBackgroundFreq(matrix_name)
+ * is not NULL. */
+NCBI_XBLAST_EXPORT
+int Blast_FrequencyDataIsAvailable(const char *matrix_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/algo/blast/composition_adjustment/nlm_linear_algebra.c b/algo/blast/composition_adjustment/nlm_linear_algebra.c
index 13d89db5..ff98794d 100644
--- a/algo/blast/composition_adjustment/nlm_linear_algebra.c
+++ b/algo/blast/composition_adjustment/nlm_linear_algebra.c
@@ -1,5 +1,3 @@
-static char const rcsid[] = "$Id: nlm_numerics.c,v 1.1 2005/05/16 16:11:41 papadopo Exp $";
-
/* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -24,25 +22,21 @@ static char const rcsid[] = "$Id: nlm_numerics.c,v 1.1 2005/05/16 16:11:41 papad
*
* ===========================================================================*/
-/*****************************************************************************
-
-File name: nlm_numerics.c
-
-Author: E. Michael Gertz
-
-Contents: Basic matrix and vector operations for use in conjunction
- with higher-level procedures in re_newton.c
-
-******************************************************************************/
-/*
- * $Log: nlm_numerics.c,v $
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
+/** @file nlm_linear_algebra.c
+ *
+ * @author E. Michael Gertz
*
+ * Basic matrix and vector operations
*/
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: nlm_linear_algebra.c,v 1.5 2005/12/01 13:49:43 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
-#include <ncbi.h>
-#include <nlm_numerics.h>
+#include <math.h>
+#include <stdlib.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
/**
* Create and return a new, dense matrix. Elements of the matrix A
@@ -51,22 +45,26 @@ Contents: Basic matrix and vector operations for use in conjunction
* @param nrows the number of rows for the new matrix.
* @param ncols the number of columns for the new matrix.
*/
-Nlm_FloatHiPtr PNTR
-Nlm_DenseMatrixNew(Int4 nrows,
- Int4 ncols)
+double **
+Nlm_DenseMatrixNew(int nrows,
+ int ncols)
{
- Int4 i; /* iteration index */
- Nlm_FloatHiPtr PNTR mat; /* the new matrix */
-
- mat = (Nlm_FloatHiPtr PNTR) Nlm_Calloc(nrows, sizeof(Nlm_FloatHiPtr));
-
- mat[0] =
- (Nlm_FloatHiPtr) Nlm_MemNew((size_t) nrows *
- (size_t) ncols * sizeof(Nlm_FloatHi));
- for(i = 1; i < nrows; i++) {
- mat[i] = &mat[0][i * ncols];
+ int i; /* iteration index */
+ double ** mat; /* the new matrix */
+
+ /* Array of row pointers; NULL is returned if either allocation fails */
+ mat = (double **) calloc(nrows, sizeof(double *));
+ if (mat != NULL) {
+ /* One block holds all nrows * ncols elements; malloc leaves them
+ * uninitialized.
+ * NOTE(review): mat[0] is written even when nrows == 0 -- confirm
+ * that callers never request a matrix with no rows. */
+ mat[0] = (double *) malloc((size_t) nrows *
+ (size_t) ncols * sizeof(double));
+ if (mat[0] != NULL) {
+ for (i = 1; i < nrows; i++) {
+ mat[i] = &mat[0][i * ncols]; /* row i begins i * ncols elements into the block */
+ }
+ } else {
+ free(mat); /* do not leak the row pointers */
+ mat = NULL;
+ }
}
-
return mat;
}
@@ -77,23 +75,27 @@ Nlm_DenseMatrixNew(Int4 nrows,
*
* @param n the dimension of the matrix.
*/
-Nlm_FloatHiPtr PNTR
-Nlm_LtriangMatrixNew(Int4 n)
+double **
+Nlm_LtriangMatrixNew(int n)
{
- Int4 i; /* iteration index */
- Nlm_FloatHiPtr PNTR L; /* the new, lower triangular matrix */
+ int i; /* iteration index */
+ double ** L; /* the new, lower triangular matrix */
size_t nelts; /* the number of elements in
the matrix */
-
nelts = ((size_t) n * (n + 1))/2;
- L = (Nlm_FloatHiPtr PNTR) Nlm_Calloc(n, sizeof(Nlm_FloatHi *));
- L[0] = (Nlm_FloatHiPtr) Nlm_MemNew(nelts * sizeof(Nlm_FloatHi) );
-
- for( i = 1; i < n; i++ ) {
- L[i] = L[i - 1] + i;
+ /* Array of row pointers; NULL is returned if either allocation fails */
+ L = (double**) calloc(n, sizeof(double *));
+ if (L != NULL) {
+ /* One block holds all nelts elements; malloc leaves them
+ * uninitialized.
+ * NOTE(review): L[0] is written even when n == 0 -- confirm that
+ * callers never request an empty matrix. */
+ L[0] = (double*) malloc(nelts * sizeof(double));
+ if (L[0] != NULL) {
+ for (i = 1; i < n; i++) {
+ L[i] = L[i - 1] + i; /* row i - 1 occupies i elements, so row i starts right after it */
+ }
+ } else {
+ free(L); /* do not leak the row pointers */
+ L = NULL;
+ }
}
-
return L;
}
@@ -105,17 +107,64 @@ Nlm_LtriangMatrixNew(Int4 n)
* @param mat the matrix to be freed
* @return always NULL
*/
-Nlm_FloatHiPtr PNTR
-Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat)
+void
+Nlm_DenseMatrixFree(double *** mat)
{
- mat[0] = (Nlm_FloatHiPtr) Nlm_MemFree(mat[0]);
- mat = (Nlm_FloatHiPtr PNTR) Nlm_MemFree(mat);
+ if(*mat != NULL) {
+ free((*mat)[0]); /* the block of matrix elements */
+ free(*mat); /* the array of row pointers */
+ }
+ *mat = NULL; /* nothing is returned; the caller's pointer is cleared instead */
+}
- return NULL;
+
+/**
+ * Create and return a new Int4 matrix.  Elements of the matrix A
+ * may be accessed as A[i][j].  All elements live in one contiguous
+ * block, which is not initialized.
+ *
+ * @param nrows     the number of rows for the new matrix.
+ * @param ncols     the number of columns for the new matrix.
+ * @return the new matrix, or NULL if nrows is not positive, ncols is
+ *         negative, the total size cannot be represented in a size_t,
+ *         or memory could not be allocated.  When ncols is zero the
+ *         result depends on whether malloc(0) returns NULL.
+ */
+Int4 ** Nlm_Int4MatrixNew(int nrows, int ncols)
+{
+    int i;             /* iteration index */
+    Int4 ** mat;       /* the new matrix */
+    size_t nelts;      /* total number of elements in the matrix */
+
+    /* With no rows there is no mat[0] in which to keep the element
+     * block, and a negative count would wrap when converted to size_t. */
+    if (nrows <= 0 || ncols < 0) {
+        return NULL;
+    }
+    /* Refuse sizes for which nelts * sizeof(Int4) would wrap around */
+    if ((size_t) ncols > ((size_t) -1) / sizeof(Int4) / (size_t) nrows) {
+        return NULL;
+    }
+    nelts = (size_t) nrows * (size_t) ncols;
+
+    mat = (Int4 **) calloc((size_t) nrows, sizeof(Int4 *));
+    if (mat != NULL) {
+        mat[0] = (Int4 *) malloc(nelts * sizeof(Int4));
+        if (mat[0] != NULL) {
+            for (i = 1; i < nrows; i++) {
+                /* size_t arithmetic: i * ncols need not fit in an int */
+                mat[i] = mat[0] + (size_t) i * (size_t) ncols;
+            }
+        } else {
+            free(mat);  /* do not leak the row pointers */
+            mat = NULL;
+        }
+    }
+    return mat;
}
/**
+ * Free a matrix created by Nlm_Int4MatrixNew and reset the caller's
+ * pointer to it.
+ *
+ * @param mat   address of the matrix to be freed; *mat may be NULL,
+ *              and is always NULL on return
+ */
+void
+Nlm_Int4MatrixFree(Int4 *** mat)
+{
+    Int4 ** rows = *mat;    /* the matrix being released, if any */
+
+    if (rows != NULL) {
+        free(rows[0]);      /* the block of matrix elements */
+        free(rows);         /* the array of row pointers */
+    }
+    *mat = NULL;
+}
+
+/**
* Accessing only the lower triangular elements of the symmetric,
* positive definite matrix A, compute a lower triangular matrix L
* such that A = L L^T (Cholesky factorization.) Overwrite the lower
@@ -129,22 +178,22 @@ Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat)
* @param n the size of A
*/
void
-Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n)
+Nlm_FactorLtriangPosDef(double ** A, int n)
{
- Int4 i, j, k; /* iteration indices */
- Nlm_FloatHi temp; /* temporary variable for intermediate
+ int i, j, k; /* iteration indices */
+ double temp; /* temporary variable for intermediate
values in a computation */
- for( i = 0; i < n; i++ ) {
- for( j = 0; j < i; j++ ) {
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < i; j++) {
temp = A[i][j];
- for( k = 0; k < j; k++ ) {
+ for (k = 0; k < j; k++) {
temp -= A[i][k] * A[j][k];
}
A[i][j] = temp/A[j][j];
}
temp = A[i][i];
- for(k = 0; k < i; k++ ) {
+ for (k = 0; k < i; k++) {
temp -= A[i][k] * A[i][k];
}
A[i][i] = sqrt(temp);
@@ -162,29 +211,27 @@ Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n)
* @param n the size of x
* @param L a non-singular lower triangular matrix
*/
-void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n,
- Nlm_FloatHiPtr PNTR L )
+void Nlm_SolveLtriangPosDef(double * x, int n,
+ double ** L )
{
- Int4 i, j; /* iteration indices */
- Nlm_FloatHi temp; /* temporary variable for intermediate
+ int i, j; /* iteration indices */
+ double temp; /* temporary variable for intermediate
values in a computation */
/* At point x = b in the equation L L\T y = b */
/* Forward solve; L z = b */
- for( i = 0; i < n; i++ ) {
+ for (i = 0; i < n; i++) {
temp = x[i];
- for( j = 0; j < i; j++ ) {
+ for (j = 0; j < i; j++) {
temp -= L[i][j] * x[j];
}
x[i] = temp/L[i][i];
}
- /* Now x = z */
-
- /* Back solve; L\T y = z */
- for( j = n - 1; j >= 0; j-- ) {
+ /* Now x = z. Back solve the system L\T y = z */
+ for (j = n - 1; j >= 0; j--) {
x[j] /= L[j][j];
- for( i = 0; i < j; i++ ) {
+ for (i = 0; i < j; i++) {
x[i] -= L[j][i] * x[j];
}
}
@@ -201,17 +248,17 @@ void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n,
* @param v a vector
* @param n the length of v
*/
-Nlm_FloatHi
-Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n)
+double
+Nlm_EuclideanNorm(const double * v, int n)
{
- Nlm_FloatHi sum = 1.0; /* sum of squares of elements in v */
- Nlm_FloatHi scale = 0.0; /* a scale factor for the elements in v */
- Int4 i; /* iteration index */
-
- for( i = 0; i < n; i++ ) {
- if( v[i] != 0.0 ) {
- Nlm_FloatHi absvi = ABS(v[i]);
- if( scale < absvi ) {
+ double sum = 1.0; /* sum of squares of elements in v */
+ double scale = 0.0; /* a scale factor for the elements in v */
+ int i; /* iteration index */
+
+ for (i = 0; i < n; i++) {
+ if (v[i] != 0.0) {
+ double absvi = fabs(v[i]);
+ if (scale < absvi) {
sum = 1.0 + sum * (scale/absvi) * (scale/absvi);
scale = absvi;
} else {
@@ -219,7 +266,6 @@ Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n)
}
}
}
-
return scale * sqrt(sum);
}
@@ -231,12 +277,13 @@ Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n)
* @param x another vector
* @param n the length of x and y
*/
-void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha,
- const Nlm_FloatHi PNTR x )
+void Nlm_AddVectors(double * y, int n, double alpha, const double * x)
{
- Int4 i; /* iteration index */
+ int i; /* iteration index */
- for( i = 0; i < n; i++ ) y[i] += alpha * x[i];
+ for (i = 0; i < n; i++) {
+ y[i] += alpha * x[i]; /* y is updated in place; x is only read */
+ }
}
@@ -249,24 +296,19 @@ void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha,
* @param n the size of x and step_x
* @param max a nonnegative scalar
*/
-Nlm_FloatHi
-Nlm_StepBound(const Nlm_FloatHi PNTR x, Int4 n,
- const Nlm_FloatHi PNTR step_x, Nlm_FloatHi max )
+double
+Nlm_StepBound(const double * x, int n, const double * step_x, double max)
{
- Int4 i; /* iteration index */
- Nlm_FloatHi alpha; /* current largest permitted step */
-
- alpha = max;
+ int i; /* iteration index */
+ double alpha = max; /* current largest permitted step */
- for( i = 0; i < n; i++ ) {
- Nlm_FloatHi alpha_i; /* a step to the boundary for the
- current i */
+ for (i = 0; i < n; i++) {
+ double alpha_i; /* a step to the boundary for the current i */
+ /* NOTE(review): step_x[i] is not checked for zero; under IEEE 754
+ * arithmetic the quotient is then infinite or NaN and would not
+ * lower alpha below a finite max -- confirm this is intended. */
alpha_i = -x[i] / step_x[i];
- if( alpha_i >= 0 && alpha_i < alpha ) {
+ /* only a nonnegative step shorter than the current bound lowers it */
+ if (alpha_i >= 0 && alpha_i < alpha) {
alpha = alpha_i;
}
}
-
return alpha;
}
diff --git a/algo/blast/composition_adjustment/nlm_linear_algebra.h b/algo/blast/composition_adjustment/nlm_linear_algebra.h
index 637b913d..e706bfb1 100644
--- a/algo/blast/composition_adjustment/nlm_linear_algebra.h
+++ b/algo/blast/composition_adjustment/nlm_linear_algebra.h
@@ -1,61 +1,80 @@
-/* ===========================================================================
-*
-* PUBLIC DOMAIN NOTICE
-* National Center for Biotechnology Information
-*
-* This software/database is a "United States Government Work" under the
-* terms of the United States Copyright Act. It was written as part of
-* the author's official duties as a United States Government employee and
-* thus cannot be copyrighted. This software/database is freely available
-* to the public for use. The National Library of Medicine and the U.S.
-* Government have not placed any restriction on its use or reproduction.
-*
-* Although all reasonable efforts have been taken to ensure the accuracy
-* and reliability of the software and data, the NLM and the U.S.
-* Government do not and cannot warrant the performance or results that
-* may be obtained by using this software or data. The NLM and the U.S.
-* Government disclaim all warranties, express or implied, including
-* warranties of performance, merchantability or fitness for any particular
-* purpose.
-*
-* Please cite the author in any work or product based on this material.
-*
-* ===========================================================================*/
-
-/*****************************************************************************
-
-File name: nlm_numerics.h
-
-Author: E. Michael Gertz
-
-Contents: Definitions used in nlm_numerics.c
-
-******************************************************************************/
-/*
- * $Log: nlm_numerics.h,v $
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
+/* $Id: nlm_linear_algebra.h,v 1.6 2005/12/01 13:54:04 gertz Exp $
+ * ===========================================================================
*
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+
+/**
+ * @file nlm_linear_algebra.h
+ *
+ * @author E. Michael Gertz
+ *
+ * Declarations for several linear algebra routines
*/
-#ifndef NLMNUMERICS
-#define NLMNUMERICS
-#include <ncbistd.h>
+#ifndef __NLM_LINEAR_ALGEBRA__
+#define __NLM_LINEAR_ALGEBRA__
-Nlm_FloatHiPtr PNTR Nlm_DenseMatrixNew(Int4 nrows, Int4 ncols);
-Nlm_FloatHiPtr PNTR Nlm_LtriangMatrixNew(Int4 n);
-Nlm_FloatHiPtr PNTR Nlm_DenseMatrixFree(Nlm_FloatHiPtr PNTR mat);
+#include <algo/blast/core/blast_export.h>
+#include <algo/blast/core/ncbi_std.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
-void Nlm_FactorLtriangPosDef(Nlm_FloatHiPtr PNTR A, Int4 n);
-void Nlm_SolveLtriangPosDef(Nlm_FloatHiPtr x, Int4 n,
- Nlm_FloatHiPtr PNTR L );
+/** Create a new matrix of doubles with nrows rows and ncols columns,
+ * addressable as A[i][j]; returns NULL if memory cannot be allocated. */
+NCBI_XBLAST_EXPORT
+double ** Nlm_DenseMatrixNew(int nrows, int ncols);
-Nlm_FloatHi Nlm_EuclideanNorm(const Nlm_FloatHi PNTR v, Int4 n);
+/** Create a new lower triangular matrix of dimension n, in which row i
+ * has i + 1 elements; returns NULL if memory cannot be allocated. */
+NCBI_XBLAST_EXPORT
+double ** Nlm_LtriangMatrixNew(int n);
-void Nlm_AddVectors(Nlm_FloatHiPtr y, Int4 n, Nlm_FloatHi alpha,
- const Nlm_FloatHi PNTR x);
+/** Free the matrix *mat (which may be NULL) and set *mat to NULL. */
+NCBI_XBLAST_EXPORT
+void Nlm_DenseMatrixFree(double *** mat);
-Nlm_FloatHi Nlm_StepBound(const Nlm_FloatHi PNTR x, Int4 n,
- const Nlm_FloatHi PNTR step_x, Nlm_FloatHi max);
+/** Create a new matrix of Int4 with nrows rows and ncols columns,
+ * addressable as A[i][j]; returns NULL if memory cannot be allocated. */
+NCBI_XBLAST_EXPORT
+Int4 ** Nlm_Int4MatrixNew(int nrows, int ncols);
+
+/** Free the Int4 matrix *mat (which may be NULL) and set *mat to NULL. */
+NCBI_XBLAST_EXPORT
+void Nlm_Int4MatrixFree(Int4 *** mat);
+
+/** Cholesky factorization: overwrite the lower triangle of the symmetric,
+ * positive definite n-by-n matrix A with L such that A = L L^T. */
+NCBI_XBLAST_EXPORT
+void Nlm_FactorLtriangPosDef(double ** A, int n);
+
+/** Solve L L^T y = b in place, where L is lower triangular: x holds b on
+ * entry and is overwritten by the forward and back solves. */
+NCBI_XBLAST_EXPORT
+void Nlm_SolveLtriangPosDef(double x[], int n, double ** L);
+
+/** Return the Euclidean norm of the n-element vector v. */
+NCBI_XBLAST_EXPORT
+double Nlm_EuclideanNorm(const double v[], int n);
+
+/** Add alpha * x to y in place; x and y both have n elements. */
+NCBI_XBLAST_EXPORT
+void Nlm_AddVectors(double y[], int n, double alpha,
+ const double x[]);
+
+/** Return max, lowered to the smallest nonnegative -x[i] / step_x[i]
+ * over the n elements when that value is less than max. */
+NCBI_XBLAST_EXPORT
+double Nlm_StepBound(const double x[], int n,
+ const double step_x[], double max);
+
+#ifdef __cplusplus
+}
+#endif
#endif
diff --git a/algo/blast/composition_adjustment/optimize_target_freq.c b/algo/blast/composition_adjustment/optimize_target_freq.c
index a8a331cd..120d9801 100644
--- a/algo/blast/composition_adjustment/optimize_target_freq.c
+++ b/algo/blast/composition_adjustment/optimize_target_freq.c
@@ -1,5 +1,3 @@
-static char const rcsid[] = "$Id: re_newton.c,v 1.3 2005/07/25 12:48:39 camacho Exp $";
-
/* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -24,39 +22,11 @@ static char const rcsid[] = "$Id: re_newton.c,v 1.3 2005/07/25 12:48:39 camacho
*
* ===========================================================================*/
-/*****************************************************************************
-
-File name: re_newton.c
-
-Authors: E. Michael Gertz, Alejandro Schaffer
-
-Contents: Mid-level functions that directly solve the optimization
- problem for compositional score matrix adjustment.
- Used in conjunction with Newton_procedures.c and nlm_numerics
-
-******************************************************************************/
-/*
- * $Log: re_newton.c,v $
- * Revision 1.3 2005/07/25 12:48:39 camacho
- * Updated reference for compositional adjustment
- *
- * Revision 1.2 2005/07/14 20:19:58 coulouri
- * - In OptimizeTargetFrequencies, change the convergence tests to robustly
- * handle NaN (floating point not a number)
- *
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
- *
- */
-#include <ncbi.h>
-#include <nlm_numerics.h>
-#include <re_newton.h>
-
/**
- * @file re_newton.c
+ * @file optimize_target_freq.c
*
* Author E. Michael Gertz
- *
+ *
* Routines for finding an optimal set of target frequencies for the
* purpose of generating a compositionally adjusted score matrix. The
* function for performing this optimization is named
@@ -111,8 +81,20 @@ Contents: Mid-level functions that directly solve the optimization
* Stephen F. Altschul, John C. Wootton, E. Michael Gertz, Richa
* Agarwala, Aleksandr Morgulis, Alejandro Schaffer and Yi-Kuo Yu
* (2005) Protein Database Searches Using Compositionally Adjusted
- * Substitution Matrices. FEBS Journal, in press.
+ * Substitution Matrices. FEBS Journal, 272,5101-9.
*/
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: optimize_target_freq.c,v 1.6 2005/12/01 13:49:43 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
+#include <algo/blast/composition_adjustment/optimize_target_freq.h>
/**
* Compute the symmetric product A D A^T, where A is the matrix of
@@ -133,29 +115,27 @@ Contents: Mid-level functions that directly solve the optimization
* length alphsize * alphsize
*/
static void
-ScaledSymmetricProductA(Nlm_FloatHiPtr PNTR W,
- Nlm_FloatHiPtr diagonal,
- Int4 alphsize)
+ScaledSymmetricProductA(double ** W, const double diagonal[], int alphsize)
{
- Int4 rowW, colW; /* iteration indices over the rows and columns of W */
- Int4 i, j; /* iteration indices over characters in the alphabet */
- Int4 m; /* The number of rows in A; also the size of W */
-
+ int rowW, colW; /* iteration indices over the rows and columns of W */
+ int i, j; /* iteration indices over characters in the alphabet */
+ int m; /* The number of rows in A; also the size of W */
+
m = 2 * alphsize - 1;
- for(rowW = 0; rowW < m; rowW++) {
- for(colW = 0; colW <= rowW; colW++) {
+ for (rowW = 0; rowW < m; rowW++) {
+ for (colW = 0; colW <= rowW; colW++) {
W[rowW][colW] = 0.0;
}
}
- for(i = 0; i < alphsize; i++) {
- for(j = 0; j < alphsize; j++) {
- Nlm_FloatHi dd; /* an individual diagonal element */
+ for (i = 0; i < alphsize; i++) {
+ for (j = 0; j < alphsize; j++) {
+ double dd; /* an individual diagonal element */
dd = diagonal[i * alphsize + j];
W[j][j] += dd;
- if(i > 0) {
+ if (i > 0) {
W[i + alphsize - 1][j] += dd;
W[i + alphsize - 1][i + alphsize - 1] += dd;
}
@@ -177,31 +157,28 @@ ScaledSymmetricProductA(Nlm_FloatHiPtr PNTR W,
* @param x a vector of size alphsize * alphsize
*/
static void
-MultiplyByA(Nlm_FloatHi beta,
- Nlm_FloatHiPtr y,
- Int4 alphsize,
- Nlm_FloatHi alpha,
- const Nlm_FloatHi PNTR x)
+MultiplyByA(double beta, double y[], int alphsize,
+ double alpha, const double x[])
{
- Int4 i, j; /* iteration indices over characters in the alphabet */
- if(beta == 0.0) {
+ int i, j; /* iteration indices over characters in the alphabet */
+ if (beta == 0.0) {
/* Initialize y to zero, without reading any elements from y */
- for(i = 0; i < 2 * alphsize - 1; i++) {
+ for (i = 0; i < 2 * alphsize - 1; i++) {
y[i] = 0.0;
}
- } else if(beta != 1.0) {
+ } else if (beta != 1.0) {
/* rescale y */
- for(i = 0; i < 2 * alphsize - 1; i++) {
+ for (i = 0; i < 2 * alphsize - 1; i++) {
y[i] *= beta;
}
}
- for(i = 0; i < alphsize; i++) {
- for(j = 0; j < alphsize; j++) {
+ for (i = 0; i < alphsize; i++) {
+ for (j = 0; j < alphsize; j++) {
y[j] += alpha * x[i * alphsize + j];
}
}
- for(i = 1; i < alphsize; i++) {
- for(j = 0; j < alphsize; j++) {
+ for (i = 1; i < alphsize; i++) {
+ for (j = 0; j < alphsize; j++) {
y[i + alphsize - 1] += alpha * x[i * alphsize + j];
}
}
@@ -221,33 +198,30 @@ MultiplyByA(Nlm_FloatHi beta,
* @param x a vector of size 2 * alphsize - 1
*/
static void
-MultiplyByAtranspose(Nlm_FloatHi beta,
- Nlm_FloatHiPtr y,
- Int4 alphsize,
- Nlm_FloatHi alpha,
- const Nlm_FloatHi PNTR x)
+MultiplyByAtranspose(double beta, double y[], int alphsize,
+ double alpha, const double x[])
{
- Int4 i, j; /* iteration indices over characters in the alphabet */
- Int4 k; /* index of a row of A transpose (a column of A); also
+ int i, j; /* iteration indices over characters in the alphabet */
+ int k; /* index of a row of A transpose (a column of A); also
an index into y */
- if(beta == 0.0) {
+ if (beta == 0.0) {
/* Initialize y to zero, without reading any elements from y */
- for(k = 0; k < alphsize * alphsize; k++) {
+ for (k = 0; k < alphsize * alphsize; k++) {
y[k] = 0.0;
}
- } else if(beta != 1.0) {
+ } else if (beta != 1.0) {
/* rescale y */
- for(k = 0; k < alphsize * alphsize; k++) {
+ for (k = 0; k < alphsize * alphsize; k++) {
y[k] *= beta;
}
}
- for(i = 0; i < alphsize; i++) {
- for(j = 0; j < alphsize; j++) {
+ for (i = 0; i < alphsize; i++) {
+ for (j = 0; j < alphsize; j++) {
k = i * alphsize + j;
y[k] += alpha * x[j];
- if(i > 0) {
+ if (i > 0) {
y[k] += alpha * x[i + alphsize - 1];
}
}
@@ -266,18 +240,15 @@ MultiplyByAtranspose(Nlm_FloatHi beta,
* @param col_sums column sums of the substitution probabilities
*/
static void
-ResidualsLinearConstraints(Nlm_FloatHiPtr rA,
- Int4 alphsize,
- const Nlm_FloatHi PNTR x,
- const Nlm_FloatHi PNTR row_sums,
- const Nlm_FloatHi PNTR col_sums)
+ResidualsLinearConstraints(double rA[], int alphsize, const double x[],
+ const double row_sums[], const double col_sums[])
{
- Int4 i; /* iteration index */
+ int i; /* iteration index */
- for(i = 0; i < alphsize; i++) {
+ for (i = 0; i < alphsize; i++) {
rA[i] = col_sums[i];
}
- for(i = 1; i < alphsize; i++) {
+ for (i = 1; i < alphsize; i++) {
rA[i + alphsize - 1] = row_sums[i];
}
MultiplyByA(1.0, rA, alphsize, -1.0, x);
@@ -298,24 +269,21 @@ ResidualsLinearConstraints(Nlm_FloatHiPtr rA,
* problem.
*/
static void
-DualResiduals(Nlm_FloatHiPtr resids_x,
- Int4 alphsize,
- Nlm_FloatHiPtr PNTR grads,
- const Nlm_FloatHi PNTR z,
- Int4 constrain_rel_entropy)
+DualResiduals(double resids_x[], int alphsize, double ** grads,
+ const double z[], int constrain_rel_entropy)
{
- Int4 i; /* iteration index */
- Int4 n = alphsize * alphsize; /* size of resids_x */
+ int i; /* iteration index */
+ int n = alphsize * alphsize; /* size of resids_x */
- if(constrain_rel_entropy) {
- Nlm_FloatHi eta; /* dual variable for the relative
+ if (constrain_rel_entropy) {
+ double eta; /* dual variable for the relative
entropy constraint */
eta = z[2 * alphsize - 1];
- for(i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
resids_x[i] = -grads[0][i] + eta * grads[1][i];
}
} else {
- for(i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
resids_x[i] = -grads[0][i];
}
}
@@ -348,28 +316,28 @@ DualResiduals(Nlm_FloatHiPtr resids_x,
*
*/
static void
-CalculateResiduals(Nlm_FloatHiPtr rnorm,
- Nlm_FloatHiPtr resids_x,
- Int4 alphsize,
- Nlm_FloatHiPtr resids_z,
- const Nlm_FloatHi PNTR values,
- Nlm_FloatHiPtr PNTR grads,
- const Nlm_FloatHi PNTR row_sums,
- const Nlm_FloatHi PNTR col_sums,
- const Nlm_FloatHi PNTR x,
- const Nlm_FloatHi PNTR z,
- Int4 constrain_rel_entropy,
- Nlm_FloatHi relative_entropy)
+CalculateResiduals(double * rnorm,
+ double resids_x[],
+ int alphsize,
+ double resids_z[],
+ const double values[],
+ double ** grads,
+ const double row_sums[],
+ const double col_sums[],
+ const double x[],
+ const double z[],
+ int constrain_rel_entropy,
+ double relative_entropy)
{
/* Euclidean norms of the primal and dual residuals */
- Nlm_FloatHi norm_resids_z, norm_resids_x;
+ double norm_resids_z, norm_resids_x;
DualResiduals(resids_x, alphsize, grads, z, constrain_rel_entropy);
norm_resids_x = Nlm_EuclideanNorm(resids_x, alphsize * alphsize);
ResidualsLinearConstraints(resids_z, alphsize, x, row_sums, col_sums);
- if(constrain_rel_entropy) {
+ if (constrain_rel_entropy) {
resids_z[2 * alphsize - 1] = relative_entropy - values[1];
norm_resids_z = Nlm_EuclideanNorm(resids_z, 2 * alphsize);
@@ -400,72 +368,89 @@ CalculateResiduals(Nlm_FloatHiPtr rnorm,
* backsolve using this factorization are stored.
*/
struct ReNewtonSystem {
- Int4 alphsize; /*< the size of the alphabet */
- Int4 constrain_rel_entropy; /*< if true, use the relative entropy
+ int alphsize; /*< the size of the alphabet */
+ int constrain_rel_entropy; /*< if true, use the relative entropy
constraint for this optimization
problem */
- Nlm_FloatHiPtr PNTR W; /*< A lower-triangular matrix
+ double ** W; /*< A lower-triangular matrix
representing a factorization of
the (2,2) block, -J D^{-1} J^T, of
the condensed linear system */
- Nlm_FloatHiPtr Dinv; /*< The diagonal elements of the
+ double * Dinv; /*< The diagonal elements of the
inverse of the necessarily
diagonal (1,1) block of the linear
system */
- Nlm_FloatHiPtr grad_re; /*< the gradient of the
+ double * grad_re; /*< the gradient of the
relative-entropy constraint, if
this constraint is used. */
};
typedef struct ReNewtonSystem ReNewtonSystem;
-typedef ReNewtonSystem PNTR ReNewtonSystemPtr;
/**
- * Create a new uninitialized ReNewtonSystem; the fields are
- * initialized by the FactorReNewtonSystem procedure.
- * ReNewtonSystemNew and FactorReNewtonSystem are called from only the
- * newt procedure.
+ * Free the memory associated with a ReNewtonSystem.
*
- * @param alphsize the size of the alphabet for this optimization
- * problem.
+ * @param newton_system on entry *newton_system points to the
+ * system to be freed. On exit, *newton_system
+ * is set to NULL.
*/
-static ReNewtonSystemPtr
-ReNewtonSystemNew(Int4 alphsize)
+static void
+ReNewtonSystemFree(ReNewtonSystem ** newton_system)
{
- ReNewtonSystemPtr newton_system; /* the new ReNewtonSystem */
-
- newton_system = (ReNewtonSystem *) Nlm_MemNew(sizeof(ReNewtonSystem));
+ if (*newton_system != NULL) {
+ Nlm_DenseMatrixFree(&(*newton_system)->W);
- newton_system->alphsize = alphsize;
- newton_system->constrain_rel_entropy = 1;
- newton_system->W = Nlm_LtriangMatrixNew(2 * alphsize);
+ free((*newton_system)->Dinv);
+ (*newton_system)->Dinv = NULL;
- newton_system->Dinv =
- (Nlm_FloatHiPtr) Nlm_MemNew(alphsize * alphsize * sizeof(Nlm_FloatHi));
- newton_system->grad_re =
- (Nlm_FloatHiPtr) Nlm_MemNew(alphsize * alphsize * sizeof(Nlm_FloatHi));
+ free((*newton_system)->grad_re);
+ (*newton_system)->grad_re = NULL;
- return newton_system;
+ free(*newton_system);
+ *newton_system = NULL;
+ }
}
/**
- * Free the memory associated with a ReNewtonSystem.
+ * Create a new uninitialized ReNewtonSystem; the fields are
+ * initialized by the FactorReNewtonSystem procedure.
+ * ReNewtonSystemNew and FactorReNewtonSystem are called from only the
+ * newt procedure.
*
- * @param newton_system on entry *newton_system points to the
- * system to be freed. On exit, *newton_system
- * is set to NULL.
+ * @param alphsize the size of the alphabet for this optimization
+ * problem.
*/
-static void
-ReNewtonSystemFree(ReNewtonSystemPtr PNTR newton_system)
+static ReNewtonSystem * ReNewtonSystemNew(int alphsize)
{
- (*newton_system)->W = Nlm_DenseMatrixFree((*newton_system)->W);
- (*newton_system)->Dinv =
- (Nlm_FloatHiPtr) Nlm_MemFree((*newton_system)->Dinv);
- (*newton_system)->grad_re =
- (Nlm_FloatHiPtr) Nlm_MemFree((*newton_system)->grad_re);
+ ReNewtonSystem * newton_system; /* the new ReNewtonSystem */
+
+ newton_system = (ReNewtonSystem *) malloc(sizeof(ReNewtonSystem));
+ if (newton_system != NULL) {
+ newton_system->alphsize = alphsize;
+ newton_system->constrain_rel_entropy = 1;
+ newton_system->W = NULL;
+ newton_system->Dinv = NULL;
+ newton_system->grad_re = NULL;
+
+ newton_system->W = Nlm_LtriangMatrixNew(2 * alphsize);
+ if (newton_system->W == NULL)
+ goto error_return;
+ newton_system->Dinv =
+ (double *) malloc(alphsize * alphsize * sizeof(double));
+ if (newton_system->Dinv == NULL)
+ goto error_return;
+ newton_system->grad_re =
+ (double *) malloc(alphsize * alphsize * sizeof(double));
+ if (newton_system->grad_re == NULL)
+ goto error_return;
+ }
+ goto normal_return;
+error_return:
+ ReNewtonSystemFree(&newton_system);
+normal_return:
- *newton_system = (ReNewtonSystemPtr) Nlm_MemFree(*newton_system);
+ return newton_system;
}
@@ -484,22 +469,23 @@ ReNewtonSystemFree(ReNewtonSystemPtr PNTR newton_system)
* problem.
*/
static void
-FactorReNewtonSystem(ReNewtonSystemPtr newton_system,
- const Nlm_FloatHi PNTR x,
- const Nlm_FloatHi PNTR z,
- Nlm_FloatHiPtr PNTR grads,
- Int4 constrain_rel_entropy)
+FactorReNewtonSystem(ReNewtonSystem * newton_system,
+ const double x[],
+ const double z[],
+ double ** grads,
+ int constrain_rel_entropy,
+ double * workspace)
{
- Int4 i; /* iteration index */
- Int4 n; /* the length of x */
- Int4 m; /* the length of z */
+ int i; /* iteration index */
+ int n; /* the length of x */
+ int m; /* the length of z */
/* Pointers to fields in newton_systems; the names of the local
* variables match the names of the fields. */
- Nlm_FloatHiPtr PNTR W = newton_system->W;
- Int4 alphsize = newton_system->alphsize;
- Nlm_FloatHiPtr Dinv = newton_system->Dinv;
- Nlm_FloatHiPtr grad_re = newton_system->grad_re;
+ double ** W = newton_system->W;
+ int alphsize = newton_system->alphsize;
+ double * Dinv = newton_system->Dinv;
+ double * grad_re = newton_system->grad_re;
n = alphsize * alphsize;
m = constrain_rel_entropy ? 2 * alphsize : 2 * alphsize - 1;
@@ -507,51 +493,45 @@ FactorReNewtonSystem(ReNewtonSystemPtr newton_system,
newton_system->constrain_rel_entropy = constrain_rel_entropy;
/* The original system has the form
- *
+ *
* (D J^T)
* (J 0 ).
*
- * We block reduce the system to
+ * We block reduce the system to
*
* (D J^T )
* (0 -J D^{-1} J^T).
*
* First we find the inverse of the diagonal matrix D. */
-
- if(constrain_rel_entropy) {
- Nlm_FloatHi eta; /* dual variable for the relative
+
+ if (constrain_rel_entropy) {
+ double eta; /* dual variable for the relative
entropy constraint */
eta = z[m - 1];
- for(i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
Dinv[i] = x[i] / (1 - eta);
}
} else {
- Nlm_MemCpy(Dinv, x, n * sizeof(Nlm_FloatHi));
+ memcpy(Dinv, x, n * sizeof(double));
}
/* Then we compute J D^{-1} J^T; First fill in the part that corresponds
* to the linear constraints */
ScaledSymmetricProductA(W, Dinv, alphsize);
- if(constrain_rel_entropy) {
- Nlm_FloatHiPtr work; /* a vector for intermediate computations */
-
+ if (constrain_rel_entropy) {
/* Save the gradient of the relative entropy constraint. */
- Nlm_MemCpy(grad_re, grads[1], n * sizeof(Nlm_FloatHi));
+ memcpy(grad_re, grads[1], n * sizeof(double));
/* Fill in the part of J D^{-1} J^T that corresponds to the relative
* entropy constraint. */
- work = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi));
-
W[m - 1][m - 1] = 0.0;
- for(i = 0; i < n; i++) {
- work[i] = Dinv[i] * grad_re[i];
+ for (i = 0; i < n; i++) {
+ workspace[i] = Dinv[i] * grad_re[i];
- W[m - 1][m - 1] += grad_re[i] * work[i];
+ W[m - 1][m - 1] += grad_re[i] * workspace[i];
}
- MultiplyByA(0.0, &W[m - 1][0], alphsize, 1.0, work);
-
- work = (Nlm_FloatHiPtr) Nlm_MemFree(work);
+ MultiplyByA(0.0, &W[m - 1][0], alphsize, 1.0, workspace);
}
/* Factor J D^{-1} J^T and save the result in W. */
Nlm_FactorLtriangPosDef(W, m);
@@ -569,42 +549,38 @@ FactorReNewtonSystem(ReNewtonSystemPtr newton_system,
* @param newton_system the factored matrix for the Newton system.
*/
static void
-SolveReNewtonSystem(Nlm_FloatHiPtr x,
- Nlm_FloatHiPtr z,
- const ReNewtonSystem PNTR newton_system)
+SolveReNewtonSystem(double x[], double z[],
+ const ReNewtonSystem * newton_system, double workspace[])
{
- Int4 i; /* iteration index */
- Int4 n; /* the size of x */
- Int4 mA; /* the number of linear constraints */
- Int4 m; /* the size of z */
- Nlm_FloatHiPtr work; /* vector for intermediate calculations */
+ int i; /* iteration index */
+ int n; /* the size of x */
+ int mA; /* the number of linear constraints */
+ int m; /* the size of z */
/* Local variables that represent fields of newton_system */
- Nlm_FloatHiPtr PNTR W = newton_system->W;
- Nlm_FloatHiPtr Dinv = newton_system->Dinv;
- Nlm_FloatHiPtr grad_re = newton_system->grad_re;
- Int4 alphsize = newton_system->alphsize;
- Int4 constrain_rel_entropy = newton_system->constrain_rel_entropy;
+ double ** W = newton_system->W;
+ double * Dinv = newton_system->Dinv;
+ double * grad_re = newton_system->grad_re;
+ int alphsize = newton_system->alphsize;
+ int constrain_rel_entropy = newton_system->constrain_rel_entropy;
n = alphsize * alphsize;
mA = 2 * alphsize - 1;
m = constrain_rel_entropy ? mA + 1 : mA;
- work = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi));
-
/* Apply the same block reduction to the right-hand side as was
* applied to the matrix:
*
* rzhat = rz - J D^{-1} rx
*/
- for(i = 0; i < n; i++) {
- work[i] = x[i] * Dinv[i];
+ for (i = 0; i < n; i++) {
+ workspace[i] = x[i] * Dinv[i];
}
- MultiplyByA(1.0, z, alphsize, -1.0, work);
+ MultiplyByA(1.0, z, alphsize, -1.0, workspace);
- if(constrain_rel_entropy) {
- for(i = 0; i < n; i++) {
- z[m - 1] -= grad_re[i] * work[i];
+ if (constrain_rel_entropy) {
+ for (i = 0; i < n; i++) {
+ z[m - 1] -= grad_re[i] * workspace[i];
}
}
@@ -615,17 +591,16 @@ SolveReNewtonSystem(Nlm_FloatHiPtr x,
*
* x = D^{-1} (rx + J^T z)
*/
- if(constrain_rel_entropy) {
+ if (constrain_rel_entropy) {
for(i = 0; i < n; i++) {
x[i] += grad_re[i] * z[m - 1];
}
}
MultiplyByAtranspose(1.0, x, alphsize, 1.0, z);
- for(i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
x[i] *= Dinv[i];
}
- work = (Nlm_FloatHiPtr) Nlm_MemFree(work);
}
@@ -648,25 +623,22 @@ SolveReNewtonSystem(Nlm_FloatHiPtr x,
* is used in this optimization problem
*/
static void
-EvaluateReFunctions(Nlm_FloatHiPtr values,
- Nlm_FloatHiPtr PNTR grads,
- Int4 alphsize,
- const Nlm_FloatHi PNTR x,
- const Nlm_FloatHi PNTR q,
- const Nlm_FloatHi PNTR scores,
- Int4 constrain_rel_entropy)
+EvaluateReFunctions(double values[], double ** grads, int alphsize,
+ const double x[], const double q[],
+ const double scores[],
+ int constrain_rel_entropy)
{
- Int4 k; /* iteration index over elements of x, q and scores */
- Nlm_FloatHi temp; /* holds intermediate values in a computation */
+ int k; /* iteration index over elements of x, q and scores */
+ double temp; /* holds intermediate values in a computation */
values[0] = 0.0; values[1] = 0.0;
- for(k = 0; k < alphsize * alphsize; k++) {
+ for (k = 0; k < alphsize * alphsize; k++) {
temp = log(x[k] / q[k]);
values[0] += x[k] * temp;
grads[0][k] = temp + 1;
- if(constrain_rel_entropy) {
+ if (constrain_rel_entropy) {
temp += scores[k];
values[1] += x[k] * temp;
@@ -691,17 +663,17 @@ EvaluateReFunctions(Nlm_FloatHiPtr values,
* @param col_freqs background frequencies of the other sequence
*/
static void
-ComputeScoresFromProbs(Nlm_FloatHiPtr scores,
- Int4 alphsize,
- const Nlm_FloatHi PNTR target_freqs,
- const Nlm_FloatHi PNTR row_freqs,
- const Nlm_FloatHi PNTR col_freqs)
+ComputeScoresFromProbs(double scores[],
+ int alphsize,
+ const double target_freqs[],
+ const double row_freqs[],
+ const double col_freqs[])
{
- Int4 i, j; /* iteration indices over characters in the alphabet */
- Int4 k; /* index into scores and target_freqs */
+ int i, j; /* iteration indices over characters in the alphabet */
+ int k; /* index into scores and target_freqs */
- for(i = 0; i < alphsize; i++) {
- for(j = 0; j < alphsize; j++) {
+ for (i = 0; i < alphsize; i++) {
+ for (j = 0; j < alphsize; j++) {
k = i * alphsize + j;
scores[k] = log(target_freqs[k] / (row_freqs[i] * col_freqs[j]));
@@ -736,71 +708,81 @@ ComputeScoresFromProbs(Nlm_FloatHiPtr scores,
* this argument is ignored.
* @param maxits the maximum number of iterations permitted for the
* optimization algorithm; a good value is 2000.
- * @param tol the solution tolerance; the residuals of the optimization
+ * @param tol the solution tolerance; the residuals of the optimization
* program must have Euclidean norm <= tol for the
* algorithm to terminate.
*
* @returns if an optimal set of target frequencies is
- * found, then the number of iterations used by the
- * optimization algorithm; otherwise maxits + 1.
+ * found, then 0; if the iteration failed to
+ * converge, then 1; if there was some error, then -1.
*/
-Int4
-OptimizeTargetFrequencies(Nlm_FloatHiPtr x,
- Int4 alphsize,
- const Nlm_FloatHi PNTR q,
- const Nlm_FloatHi PNTR row_sums,
- const Nlm_FloatHi PNTR col_sums,
- Int4 constrain_rel_entropy,
- Nlm_FloatHi relative_entropy,
- Nlm_FloatHi tol,
- Int4 maxits)
+int
+Blast_OptimizeTargetFrequencies(double x[],
+ int alphsize,
+ int *iterations,
+ const double q[],
+ const double row_sums[],
+ const double col_sums[],
+ int constrain_rel_entropy,
+ double relative_entropy,
+ double tol,
+ int maxits)
{
- Int4 its; /* number of iterations that have been performed */
- Int4 n; /* number of target frequencies; the size of x */
- Int4 mA; /* number of linear constraints */
- Int4 m; /* total number of constraints */
+ int its; /* number of iterations that have been performed */
+ int n; /* number of target frequencies; the size of x */
+ int mA; /* number of linear constraints */
+ int m; /* total number of constraints */
- Nlm_FloatHi values[2]; /* values of the nonlinear functions
+ double values[2]; /* values of the nonlinear functions
at this iterate */
- Nlm_FloatHiPtr PNTR grads; /* gradients of the nonlinear
+ double ** grads = NULL; /* gradients of the nonlinear
functions at this iterate */
- ReNewtonSystemPtr newton_system; /* factored matrix of the
- linear system to be solved
- at this iteration */
- Nlm_FloatHiPtr z; /* dual variables (Lagrange multipliers) */
- Nlm_FloatHiPtr resids_x; /* dual residuals (gradient of Lagrangian) */
- Nlm_FloatHiPtr resids_z; /* primal (constraint) residuals */
- Nlm_FloatHi rnorm; /* norm of the residuals for the
+ ReNewtonSystem *
+ newton_system = NULL; /* factored matrix of the linear
+ system to be solved at this
+ iteration */
+ double * z = NULL; /* dual variables (Lagrange multipliers) */
+ double * resids_x = NULL; /* dual residuals (gradient of Lagrangian) */
+ double * resids_z = NULL; /* primal (constraint) residuals */
+ double rnorm; /* norm of the residuals for the
current iterate */
- Nlm_FloatHiPtr old_scores; /* a scoring matrix, with lambda = 1,
+ double * old_scores = NULL; /* a scoring matrix, with lambda = 1,
generated from q, row_sums and
col_sums */
- Int4 converged; /* true if Newton's method converged
+ double * workspace = NULL; /* A vector for intermediate computations */
+ int converged; /* true if Newton's method converged
to a *minimizer* (strong
second-order point) */
+ int status; /* the return status */
n = alphsize * alphsize;
mA = 2 * alphsize - 1;
m = constrain_rel_entropy ? mA + 1 : mA;
newton_system = ReNewtonSystemNew(alphsize);
-
- resids_x = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi));
- resids_z = (Nlm_FloatHiPtr) Nlm_MemNew((mA + 1) * sizeof(Nlm_FloatHi));
+ if (newton_system == NULL) goto error_return;
+ resids_x = (double *) malloc(n * sizeof(double));
+ if (resids_x == NULL) goto error_return;
+ resids_z = (double *) malloc((mA + 1) * sizeof(double));
+ if (resids_z == NULL) goto error_return;
/* z must be initialized to zero */
- z = (Nlm_FloatHiPtr) Nlm_Calloc( mA + 1, sizeof(Nlm_FloatHi));
+ z = (double *) calloc( mA + 1, sizeof(double));
+ if (z == NULL) goto error_return;
+ old_scores = (double *) malloc(n * sizeof(double));
+ if (old_scores == NULL) goto error_return;
+ workspace = (double *) malloc(n * sizeof(double));
+ if (workspace == NULL) goto error_return;
+ grads = Nlm_DenseMatrixNew(2, n);
+ if (grads == NULL) goto error_return;
- old_scores = (Nlm_FloatHiPtr) Nlm_MemNew(n * sizeof(Nlm_FloatHi));
ComputeScoresFromProbs(old_scores, alphsize, q, row_sums, col_sums);
- grads = Nlm_DenseMatrixNew(2, n);
-
/* Use q as the initial value for x */
- Nlm_MemCpy(x, q, n * sizeof(Nlm_FloatHi));
+ memcpy(x, q, n * sizeof(double));
its = 0; /* Initialize the iteration count. Note that we may
- converge in zero iterations if the initial x is
+ converge in zero iterations if the initial x is
optimal. */
- while(its <= maxits) {
+ while (its <= maxits) {
/* Compute the residuals */
EvaluateReFunctions(values, grads, alphsize, x, q, old_scores,
constrain_rel_entropy);
@@ -810,21 +792,22 @@ OptimizeTargetFrequencies(Nlm_FloatHiPtr x,
/* and check convergence; the test correctly handles the case
in which rnorm is NaN (not a number). */
- if(!(rnorm > tol)) {
+ if ( !(rnorm > tol) ) {
/* We converged at the current iterate */
break;
} else {
/* we did not converge, so increment the iteration counter
and start a new iteration */
- if(++its <= maxits) {
+ if (++its <= maxits) {
/* We have not exceeded the maximum number of iterations;
take a Newton step. */
- Nlm_FloatHi alpha; /* a positive number used to scale the
+ double alpha; /* a positive number used to scale the
Newton step. */
FactorReNewtonSystem(newton_system, x, z, grads,
- constrain_rel_entropy);
- SolveReNewtonSystem(resids_x, resids_z, newton_system);
+ constrain_rel_entropy, workspace);
+ SolveReNewtonSystem(resids_x, resids_z, newton_system,
+ workspace);
/* Calculate a value of alpha that ensures that x is
positive */
@@ -836,23 +819,30 @@ OptimizeTargetFrequencies(Nlm_FloatHiPtr x,
}
}
}
-
converged = 0;
- if( its <= maxits && rnorm <= tol ) {
+ if (its <= maxits && rnorm <= tol) {
/* Newton's iteration converged */
- if( !constrain_rel_entropy || z[m - 1] < 1 ) {
+ if ( !constrain_rel_entropy || z[m - 1] < 1 ) {
/* and the final iterate is a minimizer */
converged = 1;
}
}
-
- grads = Nlm_DenseMatrixFree(grads);
- old_scores = (Nlm_FloatHiPtr) Nlm_MemFree(old_scores);
- z = (Nlm_FloatHiPtr) Nlm_MemFree(z);
- resids_z = (Nlm_FloatHiPtr) Nlm_MemFree(resids_z);
- resids_x = (Nlm_FloatHiPtr) Nlm_MemFree(resids_x);
-
+ status = converged ? 0 : 1;
+ *iterations = its;
+ goto normal_return;
+
+error_return:
+ status = -1;
+ *iterations = 0;
+normal_return:
+
+ Nlm_DenseMatrixFree(&grads);
+ free(workspace);
+ free(old_scores);
+ free(z);
+ free(resids_z);
+ free(resids_x);
ReNewtonSystemFree(&newton_system);
- return converged ? its : maxits + 1;
+ return status;
}
diff --git a/algo/blast/composition_adjustment/optimize_target_freq.h b/algo/blast/composition_adjustment/optimize_target_freq.h
index f9684418..5b56b000 100644
--- a/algo/blast/composition_adjustment/optimize_target_freq.h
+++ b/algo/blast/composition_adjustment/optimize_target_freq.h
@@ -1,58 +1,59 @@
-/* ===========================================================================
-*
-* PUBLIC DOMAIN NOTICE
-* National Center for Biotechnology Information
-*
-* This software/database is a "United States Government Work" under the
-* terms of the United States Copyright Act. It was written as part of
-* the author's official duties as a United States Government employee and
-* thus cannot be copyrighted. This software/database is freely available
-* to the public for use. The National Library of Medicine and the U.S.
-* Government have not placed any restriction on its use or reproduction.
-*
-* Although all reasonable efforts have been taken to ensure the accuracy
-* and reliability of the software and data, the NLM and the U.S.
-* Government do not and cannot warrant the performance or results that
-* may be obtained by using this software or data. The NLM and the U.S.
-* Government disclaim all warranties, express or implied, including
-* warranties of performance, merchantability or fitness for any particular
-* purpose.
-*
-* Please cite the author in any work or product based on this material.
-*
-* ===========================================================================*/
+/* $Id: optimize_target_freq.h,v 1.6 2005/12/01 13:54:04 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
-/*****************************************************************************
+/**
+ * @file optimize_target_freq.h
+ * @author E. Michael Gertz
+ *
+ * Exports for optimize_target_freq.c
+ */
-File name: re_newton.h
+#ifndef __OPTIMIZE_TARGET_FREQ__
+#define __OPTIMIZE_TARGET_FREQ__
-Author: E. Michael Gertz
+#include <algo/blast/core/blast_export.h>
-Contents: Exports for re_newton.c
- Mid-level functions that directly solve the optimization
- problem for compositional score matrix adjustment.
- Used in conjunction with Newton_procedures.c and nlm_numerics
+#ifdef __cplusplus
+extern "C" {
+#endif
-******************************************************************************/
-/*
- * $Log: re_newton.h,v $
- * Revision 1.1 2005/05/16 16:11:41 papadopo
- * Initial revision
- *
- */
+NCBI_XBLAST_EXPORT
+int
+Blast_OptimizeTargetFrequencies(double x[],
+ int alphsize,
+ int * iterations,
+ const double q[],
+ const double row_sums[],
+ const double col_sums[],
+ int constrain_rel_entropy,
+ double relative_entropy,
+ double tol,
+ int maxits);
-#ifndef RE_NEWTON
-#define RE_NEWTON
+#ifdef __cplusplus
+}
+#endif
-Int4
-OptimizeTargetFrequencies(Nlm_FloatHiPtr x,
- Int4 alphsize,
- const Nlm_FloatHi PNTR q,
- const Nlm_FloatHi PNTR row_sums,
- const Nlm_FloatHi PNTR col_sums,
- Int4 constrain_rel_entropy,
- Nlm_FloatHi relative_entropy,
- Nlm_FloatHi tol,
- Int4 maxits);
-
#endif
diff --git a/algo/blast/composition_adjustment/redo_alignment.c b/algo/blast/composition_adjustment/redo_alignment.c
new file mode 100644
index 00000000..dd71e677
--- /dev/null
+++ b/algo/blast/composition_adjustment/redo_alignment.c
@@ -0,0 +1,1367 @@
+/* ===========================================================================
+*
+* PUBLIC DOMAIN NOTICE
+* National Center for Biotechnology Information
+*
+* This software/database is a "United States Government Work" under the
+* terms of the United States Copyright Act. It was written as part of
+* the author's official duties as a United States Government employee and
+* thus cannot be copyrighted. This software/database is freely available
+* to the public for use. The National Library of Medicine and the U.S.
+* Government have not placed any restriction on its use or reproduction.
+*
+* Although all reasonable efforts have been taken to ensure the accuracy
+* and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that
+* may be obtained by using this software or data. The NLM and the U.S.
+* Government disclaim all warranties, express or implied, including
+* warranties of performance, merchantability or fitness for any particular
+* purpose.
+*
+* Please cite the author in any work or product based on this material.
+*
+* ===========================================================================*/
+
+/** @file redo_alignment.c
+ *
+ * @author Alejandro Schaffer, E. Michael Gertz
+ *
+ * Routines for redoing a set of alignments, using either
+ * composition matrix adjustment or the Smith-Waterman algorithm (or
+ * both.)
+ */
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: redo_alignment.c,v 1.2 2005/12/01 15:41:42 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/redo_alignment.h>
+#include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
+#include <algo/blast/composition_adjustment/composition_adjustment.h>
+#include <algo/blast/composition_adjustment/composition_constants.h>
+#include <algo/blast/composition_adjustment/smith_waterman.h>
+#include <algo/blast/composition_adjustment/compo_heap.h>
+
+/* The natural log of 2, defined in newer systems as M_LN2 in math.h, but
+ missing in older systems. */
+#define LOCAL_LN2 0.69314718055994530941723212145818
+
+/** Define COMPO_INTENSE_DEBUG to be true to turn on rigorous but
+ * expensive consistency tests in the composition_adjustment
+ * module.
+ *
+ * This macro is usually used as part of a C-conditional
+ * if (COMPO_INTENSE_DEBUG) {
+ * perform expensive tests
+ * }
+ * The C compiler will then validate the code to perform the tests, but
+ * will almost always strip the code if COMPO_INTENSE_DEBUG is false.
+ */
+#ifndef COMPO_INTENSE_DEBUG
+#define COMPO_INTENSE_DEBUG 0
+#endif
+
+/** by what factor might initially reported E-value exceed true Evalue */
+#define EVALUE_STRETCH 5
+
+/** -1/0/1 if a is less than/equal to/greater than b */
+#ifndef CMP
+#define CMP(a,b) ((a)>(b) ? 1 : ((a)<(b) ? -1 : 0))
+#endif
+
+/** For translated subject sequences, the number of amino acids to
+ include before and after the existing aligned segment when
+ generating a composition-based scoring system. */
+static const int kWindowBorder = 200;
+
+/** pseudocounts for relative-entropy-based score matrix adjustment */
+static const int kReMatrixAdjustmentPseudocounts = 20;
+
+/**
+ * s_WindowInfo - a struct whose instances represent a range
+ * of data in a sequence, together with the list of alignments
+ * that fall in that range. */
+typedef struct s_WindowInfo
+{
+ BlastCompo_SequenceRange query_range; /**< range of the query
+ included in this window */
+ BlastCompo_SequenceRange subject_range; /**< range of the subject
+ included in this window */
+ BlastCompo_Alignment * align; /**< list of existing alignments
+ contained in this window */
+ int hspcnt; /**< number of alignments in
+ the align list of this window */
+} s_WindowInfo;
+
+
+/**
+ * Allocate a BlastCompo_Alignment and fill it in from the arguments;
+ * each argument is stored in the field of the same name.  The next
+ * field of the new alignment is set to NULL.
+ *
+ * @return the new alignment, or NULL if it could not be allocated */
+BlastCompo_Alignment *
+BlastCompo_AlignmentNew(int score,
+                        ECompoAdjustModes comp_adjustment_mode,
+                        int queryStart, int queryEnd, int queryIndex,
+                        int matchStart, int matchEnd, int frame,
+                        void * context)
+{
+    BlastCompo_Alignment * new_align = malloc(sizeof(BlastCompo_Alignment));
+
+    if (new_align == NULL) {
+        return NULL;
+    }
+    new_align->next                 = NULL;
+    new_align->context              = context;
+    new_align->frame                = frame;
+    new_align->matchEnd             = matchEnd;
+    new_align->matchStart           = matchStart;
+    new_align->queryEnd             = queryEnd;
+    new_align->queryStart           = queryStart;
+    new_align->queryIndex           = queryIndex;
+    new_align->comp_adjustment_mode = comp_adjustment_mode;
+    new_align->score                = score;
+
+    return new_align;
+}
+
+
+/**
+ * Free every alignment in the singly linked list whose head is
+ * *palign, walking the list iteratively, and set *palign to NULL.
+ *
+ * @param palign        pointer to the head of a singly linked list
+ *                      of alignments.
+ * @param free_context  if not NULL, called on the context field of
+ *                      each alignment whose context is not NULL,
+ *                      just before that alignment is freed
+ */
+void
+BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign,
+                          void (*free_context)(void*))
+{
+    BlastCompo_Alignment * node = *palign;   /* alignment being freed */
+
+    *palign = NULL;
+    while (node != NULL) {
+        /* Fetch the successor first; node is about to be released. */
+        BlastCompo_Alignment * successor = node->next;
+
+        if (free_context != NULL && node->context != NULL) {
+            free_context(node->context);
+        }
+        free(node);
+        node = successor;
+    }
+}
+
+
+/**
+ * Reverse, in place, the singly linked list of BlastCompo_Alignments
+ * whose head is *plist; on exit *plist is the head of the reversed
+ * list. */
+static void
+s_AlignmentsRev(BlastCompo_Alignment ** plist)
+{
+    BlastCompo_Alignment * remaining = *plist;  /* not yet reversed */
+    BlastCompo_Alignment * reversed = NULL;     /* already reversed */
+
+    while (remaining != NULL) {
+        /* Detach the head of the remaining list and push it onto the
+           front of the reversed list. */
+        BlastCompo_Alignment * node = remaining;
+        remaining = node->next;
+        node->next = reversed;
+        reversed = node;
+    }
+    *plist = reversed;
+}
+
+
+/**
+ * Ordering function for BlastCompo_Alignments.  Alignments are ranked
+ * by descending score, then by ascending matchStart, descending
+ * matchEnd, ascending queryStart and finally descending queryEnd.
+ *
+ * @return -1, 0 or 1 if a sorts before, the same as, or after b */
+static int
+s_AlignmentCmp(const BlastCompo_Alignment * a,
+               const BlastCompo_Alignment * b)
+{
+    int order;    /* outcome of the most recent field comparison */
+
+    order = CMP(b->score, a->score);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(a->matchStart, b->matchStart);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(b->matchEnd, a->matchEnd);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(a->queryStart, b->queryStart);
+    if (order != 0) {
+        return order;
+    }
+    /* every other field ties, so the last one decides */
+    return CMP(b->queryEnd, a->queryEnd);
+}
+
+/** Temporary function to determine whether alignments are sorted;
+ * only the scores are examined, and the list counts as sorted when no
+ * alignment is followed by one with a strictly higher score.
+ *
+ * @return 1 if the list is sorted in this sense, 0 otherwise */
+static int
+s_AlignmentsAreSorted(BlastCompo_Alignment * alignments)
+{
+    BlastCompo_Alignment * cur;
+
+    for (cur = alignments;
+         cur != NULL && cur->next != NULL;
+         cur = cur->next) {
+        if (cur->score < cur->next->score) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+
+/** Count the elements of a singly linked list of alignments.
+ *
+ * @param list   head of the list; may be NULL
+ * @return the number of elements reachable from list */
+static int
+s_DistinctAlignmentsLength(BlastCompo_Alignment * list)
+{
+    BlastCompo_Alignment * node = list;
+    int count = 0;
+
+    while (node != NULL) {
+        count++;
+        node = node->next;
+    }
+    return count;
+}
+
+
+/**
+ * Sort a singly linked list of alignments in place with a recursive
+ * merge sort, ordering the elements with s_AlignmentCmp.
+ *
+ * @param plist on entry, the head of the list to be sorted; on exit,
+ * the head of the sorted list
+ * @param hspcnt the number of elements in *plist; a list with fewer
+ * than two elements is left untouched
+ */
+static void
+s_DistinctAlignmentsSort(BlastCompo_Alignment ** plist, int hspcnt)
+{
+ /* mergesort */
+
+ if (COMPO_INTENSE_DEBUG) {
+ assert(s_DistinctAlignmentsLength(*plist) == hspcnt);
+ }
+ if(hspcnt > 1) {
+ BlastCompo_Alignment * list = *plist;
+ BlastCompo_Alignment *leftlist, *rightlist, **tail;
+ int i, leftcnt, rightcnt;
+
+ /* Split the list in half */
+ leftcnt = hspcnt/2;
+ rightcnt = hspcnt - leftcnt;
+
+ leftlist = list;
+ /* Find the point to split the list; this loop splits lists
+ correctly only when list != NULL and leftcnt > 0, which is
+ necessarily the case because hspcnt > 1 */
+ assert(list != NULL && leftcnt > 0);
+ for (i = 0; i < leftcnt - 1 && list->next != NULL; i++) {
+ list = list->next;
+ }
+ /* list is now the last element of the left half; cut the list
+ after it */
+ rightlist = list->next;
+ list->next = NULL;
+
+ if (COMPO_INTENSE_DEBUG) {
+ assert(s_DistinctAlignmentsLength(rightlist) == rightcnt);
+ assert(s_DistinctAlignmentsLength(leftlist) == leftcnt);
+ }
+ /* Sort the two lists */
+ if (leftcnt > 1)
+ s_DistinctAlignmentsSort(&leftlist, leftcnt);
+ if (rightcnt > 1)
+ s_DistinctAlignmentsSort(&rightlist, rightcnt);
+
+ /* And then merge them; tail always points at the link through
+ which the next element is to be attached */
+ list = NULL;
+ tail = &list;
+ while (leftlist != NULL || rightlist != NULL) {
+ if (leftlist == NULL) {
+ /* the left half is used up; attach the rest of the right
+ half in one step */
+ *tail = rightlist;
+ rightlist = NULL;
+ } else if (rightlist == NULL) {
+ /* the right half is used up; attach the rest of the left
+ half in one step */
+ *tail = leftlist;
+ leftlist = NULL;
+ } else {
+ BlastCompo_Alignment * elt;
+ /* the head of the left half is taken only when it compares
+ strictly less; on a tie the head of the right half is
+ taken */
+ if (s_AlignmentCmp(leftlist, rightlist) < 0) {
+ elt = leftlist;
+ leftlist = leftlist->next;
+ } else {
+ elt = rightlist;
+ rightlist = rightlist->next;
+ }
+ *tail = elt;
+ tail = &elt->next;
+ }
+ }
+ *plist = list;
+ if (COMPO_INTENSE_DEBUG) {
+ assert(s_DistinctAlignmentsLength(list) == hspcnt);
+ assert(s_AlignmentsAreSorted(list));
+ }
+ }
+}
+
+
+/**
+ * Make a copy of a BlastCompo_Alignment with its next field set to
+ * NULL.  The context pointer itself is copied, so the copy refers to
+ * the same context as the original.
+ *
+ * @return the copy, or NULL if it could not be allocated
+ */
+static BlastCompo_Alignment *
+s_AlignmentCopy(const BlastCompo_Alignment * align)
+{
+    BlastCompo_Alignment * duplicate;
+
+    duplicate =
+        BlastCompo_AlignmentNew(align->score, align->comp_adjustment_mode,
+                                align->queryStart, align->queryEnd,
+                                align->queryIndex,
+                                align->matchStart, align->matchEnd,
+                                align->frame, align->context);
+    return duplicate;
+}
+
+
+/**
+ * Given a list of alignments and a new alignment, create a new list
+ * of alignments that conditionally includes the new alignment.
+ *
+ * If there is an equal or higher-scoring alignment in the preexisting
+ * list of alignments that shares an endpoint with the new alignment,
+ * then the preexisting list is kept and the new alignment is freed.
+ * Otherwise, the new list has the new alignment as its head and the
+ * elements of the preexisting list that do not share an endpoint with
+ * the new alignment as its tail; the elements that do share an
+ * endpoint are freed. The order of elements is preserved.
+ *
+ * Typically, a list of alignments is built one alignment at a time
+ * through a call to s_WithDistinctEnds. All alignments in the resulting
+ * list have distinct endpoints. Which items are retained in the list
+ * depends on the order in which they were added.
+ *
+ * Note that an endpoint is a triple, specifying a frame, a location
+ * in the query and a location in the subject. In other words,
+ * alignments that are not in the same frame never share endpoints.
+ *
+ * @param p_newAlign on input the alignment that may be added to
+ * the list; on output NULL
+ * @param p_oldAlignments on input the existing list of alignments;
+ * on output the new list
+ * @param free_align_tracebacks passed to BlastCompo_AlignmentsFree as
+ * the routine that frees the context of
+ * each alignment that is discarded
+ */
+static void
+s_WithDistinctEnds(BlastCompo_Alignment **p_newAlign,
+ BlastCompo_Alignment **p_oldAlignments,
+ void free_align_tracebacks(void *))
+{
+ /* Dereference the input parameters. */
+ BlastCompo_Alignment * newAlign = *p_newAlign;
+ BlastCompo_Alignment * oldAlignments = *p_oldAlignments;
+ BlastCompo_Alignment * align; /* represents the current
+ alignment in loops */
+ int include_new_align; /* true if the new alignment
+ may be added to the list */
+ *p_newAlign = NULL;
+ include_new_align = 1;
+
+ /* First pass: look for an existing alignment that shares an endpoint
+ with newAlign and scores at least as high. */
+ for (align = oldAlignments; align != NULL; align = align->next) {
+ if (align->frame == newAlign->frame &&
+ ((align->queryStart == newAlign->queryStart &&
+ align->matchStart == newAlign->matchStart)
+ || (align->queryEnd == newAlign->queryEnd &&
+ align->matchEnd == newAlign->matchEnd))) {
+ /* At least one of the endpoints of newAlign matches an endpoint
+ of align. */
+ if (newAlign->score <= align->score) {
+ /* newAlign cannot be added to the list. */
+ include_new_align = 0;
+ break;
+ }
+ }
+ }
+ if (include_new_align) {
+ /* tail of the list being created */
+ BlastCompo_Alignment **tail;
+
+ /* Second pass: rebuild the list behind newAlign, dropping every
+ existing alignment that shares an endpoint with it. */
+ tail = &newAlign->next;
+ align = oldAlignments;
+ while (align != NULL) {
+ /* Save align->next because align may be deleted. */
+ BlastCompo_Alignment * align_next = align->next;
+ /* Unlink align so that freeing it releases only this element. */
+ align->next = NULL;
+ if (align->frame == newAlign->frame &&
+ ((align->queryStart == newAlign->queryStart &&
+ align->matchStart == newAlign->matchStart)
+ || (align->queryEnd == newAlign->queryEnd &&
+ align->matchEnd == newAlign->matchEnd))) {
+ /* The alignment shares an end with newAlign; */
+ /* delete it. */
+ BlastCompo_AlignmentsFree(&align, free_align_tracebacks);
+ } else { /* The alignment does not share an end with newAlign; */
+ /* add it to the output list. */
+ *tail = align;
+ tail = &align->next;
+ }
+ align = align_next;
+ } /* end while align != NULL */
+ *p_oldAlignments = newAlign;
+ } else { /* do not include_new_align */
+ BlastCompo_AlignmentsFree(&newAlign, free_align_tracebacks);
+ } /* end else do not include newAlign */
+}
+
+
+/** Release the storage held by a BlastCompo_SequenceData: free its
+ * buffer and reset both the data and buffer pointers to NULL. */
+static void s_SequenceDataRelease(BlastCompo_SequenceData * self)
+{
+    /* free(NULL) does nothing, so no test of the pointer is needed */
+    free(self->buffer);
+    self->buffer = NULL;
+    self->data = NULL;
+}
+
+
+
+/**
+ * Allocate and fill in a new s_WindowInfo.
+ *
+ * @param begin         start of the subject range
+ * @param end           end of the subject range
+ * @param context       context of the subject range
+ * @param queryOrigin   start of the query range
+ * @param queryLength   length of the query range; the range ends at
+ *                      queryOrigin + queryLength
+ * @param query_index   context of the query range
+ * @param align         list of alignments stored in the window; the
+ *                      hspcnt field is set to the length of this list
+ *
+ * @return the new window, or NULL if it could not be allocated
+ */
+static s_WindowInfo *
+s_WindowInfoNew(int begin, int end, int context,
+                int queryOrigin, int queryLength, int query_index,
+                BlastCompo_Alignment * align)
+{
+    BlastCompo_Alignment * node;     /* iterates over the alignments */
+    int num_aligns = 0;              /* length of the list align */
+    s_WindowInfo * new_window = malloc(sizeof(s_WindowInfo));
+
+    if (new_window == NULL) {
+        return NULL;
+    }
+    for (node = align; node != NULL; node = node->next) {
+        num_aligns++;
+    }
+    new_window->query_range.begin     = queryOrigin;
+    new_window->query_range.end       = queryOrigin + queryLength;
+    new_window->query_range.context   = query_index;
+    new_window->subject_range.begin   = begin;
+    new_window->subject_range.end     = end;
+    new_window->subject_range.context = context;
+    new_window->align                 = align;
+    new_window->hspcnt                = num_aligns;
+
+    return new_window;
+}
+
+
+/**
+ * Free an instance of s_WindowInfo along with the alignments in its
+ * list.  The alignments are freed without a context-freeing routine,
+ * so their context fields are left alone.
+ *
+ * @param *window   on entry the window to be freed (may be NULL); on
+ *                  exit NULL
+ */
+static void
+s_WindowInfoFree(s_WindowInfo ** window)
+{
+    s_WindowInfo * doomed = *window;
+
+    if (doomed != NULL) {
+        BlastCompo_AlignmentsFree(&doomed->align, NULL);
+        free(doomed);
+    }
+    *window = NULL;
+}
+
+
+/**
+ * Join two instances of s_WindowInfo into a single window.  The
+ * subject range of the result spans both subject ranges, and the
+ * alignments of the second window are appended to those of the
+ * first.  Both windows must have the same subject context and the
+ * same query context.
+ *
+ * @param win1      on entry, one of the two windows to be joined; on
+ *                  exit the combined window
+ * @param *pwin2    on entry, the other window to be joined; on exit
+ *                  NULL, the window itself having been freed
+ */
+static void
+s_WindowInfoJoin(s_WindowInfo * win1, s_WindowInfo ** pwin2)
+{
+    s_WindowInfo * absorbed = *pwin2;   /* window merged into win1 */
+    BlastCompo_Alignment ** link;       /* finds the end of win1's list */
+
+    assert(win1->subject_range.context == absorbed->subject_range.context);
+    assert(win1->query_range.context == absorbed->query_range.context);
+
+    win1->subject_range.begin =
+        MIN(win1->subject_range.begin, absorbed->subject_range.begin);
+    win1->subject_range.end =
+        MAX(win1->subject_range.end, absorbed->subject_range.end);
+    win1->hspcnt += absorbed->hspcnt;
+
+    /* Advance to the NULL link that terminates win1's list and hang
+       the absorbed window's alignments there. */
+    link = &win1->align;
+    while (*link != NULL) {
+        link = &(*link)->next;
+    }
+    *link = absorbed->align;
+
+    /* The alignments now belong to win1; detach them so that they
+       survive the release of the absorbed window. */
+    absorbed->align = NULL;
+    s_WindowInfoFree(pwin2);
+}
+
+
+/**
+ * A comparison routine, usable with qsort, for an array of pointers
+ * to s_WindowInfo.  Windows are ordered by query context, then by
+ * subject context, then by the begin and end of the subject range,
+ * and finally by the begin and end of the query range.
+ */
+static int
+s_LocationCompareWindows(const void * vp1, const void *vp2)
+{
+    /* the two windows being compared */
+    const s_WindowInfo * left  = *(s_WindowInfo * const *) vp1;
+    const s_WindowInfo * right = *(s_WindowInfo * const *) vp2;
+    int order;                       /* outcome of one comparison */
+
+    order = CMP(left->query_range.context, right->query_range.context);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->subject_range.context, right->subject_range.context);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->subject_range.begin, right->subject_range.begin);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->subject_range.end, right->subject_range.end);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->query_range.begin, right->query_range.begin);
+    if (order != 0) {
+        return order;
+    }
+    return CMP(left->query_range.end, right->query_range.end);
+}
+
+
+/**
+ * A comparison routine, usable with qsort, for an array of pointers
+ * to s_WindowInfo.  Windows are ordered by the begin and then the end
+ * of the subject range.  Ties are broken deterministically: by
+ * subject context, then by the begin and end of the query range, and
+ * finally by query context.
+ */
+static int
+s_SubjectCompareWindows(const void * vp1, const void *vp2)
+{
+    /* the two windows being compared */
+    const s_WindowInfo * left  = *(s_WindowInfo * const *) vp1;
+    const s_WindowInfo * right = *(s_WindowInfo * const *) vp2;
+    int order;                       /* outcome of one comparison */
+
+    order = CMP(left->subject_range.begin, right->subject_range.begin);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->subject_range.end, right->subject_range.end);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->subject_range.context, right->subject_range.context);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->query_range.begin, right->query_range.begin);
+    if (order != 0) {
+        return order;
+    }
+    order = CMP(left->query_range.end, right->query_range.end);
+    if (order != 0) {
+        return order;
+    }
+    return CMP(left->query_range.context, right->query_range.context);
+}
+
+
+
+/**
+ * Read a list of alignments from a translated search and create a
+ * new array of pointers to s_WindowInfo so that each alignment is
+ * contained in exactly one window.  See s_WindowsFromAligns for the
+ * meaning of the parameters. (@sa s_WindowsFromAligns).
+ *
+ * One window is first created for each alignment, covering the
+ * subject range of the alignment widened by border on either side and
+ * clipped to the translated sequence.  Windows that belong to the
+ * same query and the same frame, and whose subject ranges touch or
+ * overlap, are then joined.
+ *
+ * On failure no windows are returned: *pwindows is set to NULL and
+ * *nWindows to 0.
+ *
+ * @return 0 on success, -1 on out-of-memory
+ */
+static int
+s_WindowsFromTranslatedAligns(BlastCompo_Alignment * alignments,
+                              BlastCompo_QueryInfo * query_info,
+                              int hspcnt, int border, int sequence_length,
+                              s_WindowInfo ***pwindows, int * nWindows)
+{
+    int k;                            /* iteration index */
+    s_WindowInfo ** windows;          /* the output list of windows */
+    int length_joined;                /* the current length of the
+                                         list of joined windows */
+    BlastCompo_Alignment * align;     /* represents the current
+                                         alignment in the main loop */
+    *nWindows = 0;
+    windows = *pwindows = calloc(hspcnt, sizeof(s_WindowInfo*));
+    if (windows == NULL) {
+        /* Nothing has been allocated, so there is nothing to clean
+           up.  The cleanup code at error_return must not be reached
+           from here, because it indexes the array. */
+        return -1;
+    }
+    *nWindows = hspcnt;
+
+    for (align = alignments, k = 0;
+         align != NULL;
+         align = align->next, k++) {
+        int frame;             /* translation frame */
+        int query_index;       /* index of the query contained in the
+                                  current HSP */
+        int query_origin;      /* start of the current query in the
+                                  concatenated query */
+        int query_length;      /* length of the current query */
+        int translated_length; /* length of the translation of the entire
+                                  nucleotide sequence in this frame */
+        int begin, end;        /* interval in amino acid coordinates of
+                                  the translated window */
+        /* copy of the current alignment to add to the window */
+        BlastCompo_Alignment * align_copy;
+        frame = align->frame;
+        query_index = align->queryIndex;
+        query_origin = query_info[query_index].origin;
+        query_length = query_info[query_index].seq.length;
+        translated_length = (sequence_length - ABS(frame) + 1)/3;
+
+        begin = MAX(0, align->matchStart - border);
+        end = MIN(translated_length, align->matchEnd + border);
+        align_copy = s_AlignmentCopy(align);
+        if (align_copy == NULL)
+            goto error_return;
+        windows[k] =
+            s_WindowInfoNew(begin, end, frame, query_origin, query_length,
+                            query_index, align_copy);
+        if (windows[k] == NULL) {
+            /* The copy never became part of a window, so it has to be
+               released here.  Its context is shared with the caller's
+               alignment and is therefore left alone. */
+            BlastCompo_AlignmentsFree(&align_copy, NULL);
+            goto error_return;
+        }
+    }
+    qsort(windows, hspcnt, sizeof(s_WindowInfo*),
+          s_LocationCompareWindows);
+
+    /* Join windows that overlap or are too close together. */
+    length_joined = 0;
+    for (k = 0; k < hspcnt; k++) {       /* for all windows in the
+                                            original list */
+        s_WindowInfo * window;           /* window at this value of k */
+        s_WindowInfo * nextWindow;       /* window at the next
+                                            value of k, or NULL if
+                                            no such window
+                                            exists */
+        window = windows[k];
+        nextWindow = ( k + 1 < hspcnt ) ? windows[k+1] : NULL;
+
+        if(nextWindow != NULL &&
+           window->subject_range.context ==
+           nextWindow->subject_range.context &&
+           window->query_range.context == nextWindow->query_range.context &&
+           window->subject_range.end >= nextWindow->subject_range.begin) {
+            /* Join the current window with the next window.  Do not add the
+               current window to the output list. */
+            s_WindowInfoJoin(nextWindow, &windows[k]);
+        } else {
+            /* Don't join the current window with the next window.  Add the
+               current window to the output list instead */
+            windows[length_joined] = window;
+            length_joined++;
+        } /* end else don't join the current window with the next window */
+    } /* end for all windows in the original list */
+    *nWindows = length_joined;
+
+    /* The slots past the joined windows hold stale pointers; clear them */
+    for (k = length_joined; k < hspcnt; k++) {
+        windows[k] = NULL;
+    }
+    for (k = 0; k < length_joined; k++) {
+        s_DistinctAlignmentsSort(&windows[k]->align, windows[k]->hspcnt);
+    }
+    qsort(windows, *nWindows, sizeof(s_WindowInfo*),
+          s_SubjectCompareWindows);
+    return 0; /* normal return */
+
+error_return:
+    /* Reached only while the array still holds hspcnt slots, each of
+       which is either NULL or a window owned by the array. */
+    for (k = 0; k < hspcnt; k++) {
+        if (windows[k] != NULL)
+            s_WindowInfoFree(&windows[k]);
+    }
+    free(windows);
+    *pwindows = NULL;
+    *nWindows = 0;
+    return -1;
+}
+
+
+/**
+ * Read a list of alignments from a protein search and create a
+ * new array of pointers to s_WindowInfo so that each alignment is
+ * contained in exactly one window.  See s_WindowsFromAligns for the
+ * meaning of the parameters. (@sa s_WindowsFromAligns).
+ *
+ * One window, spanning the whole subject sequence, is created for
+ * each query that has at least one alignment; the alignments of a
+ * window keep the relative order they have in the input list.
+ *
+ * On failure no windows are returned: *pwindows is set to NULL and
+ * *nWindows to 0.
+ *
+ * @return 0 on success, -1 on out-of-memory
+ */
+static int
+s_WindowsFromProteinAligns(BlastCompo_Alignment * alignments,
+                           BlastCompo_QueryInfo * query_info,
+                           int numQueries,
+                           int sequence_length,
+                           s_WindowInfo ***pwindows,
+                           int * nWindows)
+{
+    BlastCompo_Alignment * align;
+    int query_index;   /* index of the query */
+    int query_origin;  /* start of an individual query in the
+                          concatenated query */
+    int query_length;  /* length of an individual query */
+    int window_index;  /* index of a window in the window list */
+
+    /* new list of windows */
+    s_WindowInfo ** windows =
+        calloc(numQueries, sizeof(s_WindowInfo*));
+    *nWindows = 0;
+    *pwindows = NULL;
+    if (windows == NULL)
+        return -1;
+
+    for (align = alignments; align != NULL; align = align->next) {
+        BlastCompo_Alignment * copiedAlign;
+
+        query_index = align->queryIndex;
+        query_origin = query_info[query_index].origin;
+        query_length = query_info[query_index].seq.length;
+
+        if (windows[query_index] == NULL) {
+            windows[query_index] =
+                s_WindowInfoNew(0, sequence_length, 0, query_origin,
+                                query_length, query_index, NULL);
+            if (windows[query_index] == NULL)
+                goto error_return;
+        }
+        copiedAlign = s_AlignmentCopy(align);
+        if (copiedAlign == NULL)
+            goto error_return;
+        /* Alignments are pushed on the front of the list, which
+           reverses their order; the order is restored below. */
+        copiedAlign->next = windows[query_index]->align;
+        windows[query_index]->align = copiedAlign;
+        windows[query_index]->hspcnt++;
+    }
+    /* Pack the windows that exist into the front of the array.  Each
+       slot that is vacated is cleared, so that no window is ever
+       referred to by two slots. */
+    window_index = 0;
+    for (query_index = 0; query_index < numQueries; query_index++) {
+        s_WindowInfo * window = windows[query_index];
+        if (window != NULL) {
+            windows[query_index] = NULL;
+            windows[window_index] = window;
+            s_AlignmentsRev(&window->align);
+            window_index++;
+        }
+    }
+    /* Shrink to fit.  This merely returns unused memory: if realloc
+       fails the original array is still valid and is used as it is.
+       No request for zero bytes is ever made, because the effect of
+       such a request is not portable. */
+    if (window_index > 0 && window_index < numQueries) {
+        s_WindowInfo ** new_windows =
+            realloc(windows, window_index * sizeof(s_WindowInfo*));
+        if (new_windows != NULL) {
+            windows = new_windows;
+        }
+    }
+    *nWindows = window_index;
+    qsort(windows, *nWindows, sizeof(s_WindowInfo*),
+          s_SubjectCompareWindows);
+    *pwindows = windows;
+    /* Normal return */
+    return 0;
+
+error_return:
+    /* Reached only before the windows are packed, while the array
+       still holds numQueries slots, each NULL or a distinct window. */
+    for (window_index = 0; window_index < numQueries; window_index++) {
+        s_WindowInfoFree(&windows[window_index]);
+    }
+    free(windows);
+    return -1;
+}
+
+
+/**
+ * Read a list of alignments from a search (protein or translated) and
+ * create a new array of pointers to s_WindowInfo so that each
+ * alignment is contained in exactly one window.  The work is handed
+ * to s_WindowsFromTranslatedAligns or to s_WindowsFromProteinAligns,
+ * depending on subject_is_translated.
+ *
+ * @param alignments       a list of alignments from a protein or
+ *                         translated search
+ * @param query_info       information about the query/queries used
+ *                         in the search
+ * @param hspcnt           number of alignments (used only for
+ *                         translated searches)
+ * @param numQueries       number of queries (used only for protein
+ *                         searches)
+ * @param border           border around windows; windows with
+ *                         overlapping borders will be joined (used
+ *                         only for translated searches)
+ * @param sequence_length  length of the subject sequence, in
+ *                         nucleotides for translated searches or
+ *                         in amino acids for protein searches
+ * @param *pwindows        the new array of windows
+ * @param nWindows         the length of *pwindows
+ * @param subject_is_translated    is the subject sequence translated?
+ *
+ * @return 0 on success, -1 on out-of-memory
+ */
+static int
+s_WindowsFromAligns(BlastCompo_Alignment * alignments,
+                    BlastCompo_QueryInfo * query_info, int hspcnt,
+                    int numQueries, int border, int sequence_length,
+                    s_WindowInfo ***pwindows, int * nWindows,
+                    int subject_is_translated)
+{
+    int status;     /* status reported by the routine that is called */
+
+    if (!subject_is_translated) {
+        status = s_WindowsFromProteinAligns(alignments, query_info,
+                                            numQueries, sequence_length,
+                                            pwindows, nWindows);
+    } else {
+        status = s_WindowsFromTranslatedAligns(alignments, query_info,
+                                               hspcnt, border,
+                                               sequence_length,
+                                               pwindows, nWindows);
+    }
+    return status;
+}
+
+
+/**
+ * Compute the amino acid composition of the subject region.
+ *
+ * If the context of subject_range is zero, the composition of the
+ * whole range is computed.  Otherwise Blast_GetCompositionRange is
+ * asked to choose the part of the range to use, given where the
+ * alignment lies within the range.
+ *
+ * @param subject_composition  the computed composition.
+ * @param subject              subject sequence data
+ * @param subject_range        the range of the given subject data in
+ *                             the complete subject sequence
+ * @param align                an alignment of the query to the
+ *                             subject range
+ */
+static void
+s_GetSubjectComposition(Blast_AminoAcidComposition * subject_composition,
+                        BlastCompo_SequenceData * subject,
+                        BlastCompo_SequenceRange * subject_range,
+                        BlastCompo_Alignment * align)
+{
+    Uint1 * residues = subject->data;   /* sequence data for the subject */
+    /* length of the subject range */
+    int range_length = subject_range->end - subject_range->begin;
+    /* [left, right) is the interval of the subject to use when
+     * computing composition.  The endpoints are offsets into the
+     * subject_range. */
+    int left, right;
+
+    if (subject_range->context != 0) {
+        /* This is a tblastn search; use only part of the subject.
+         * The ends of the alignment are expressed relative to the
+         * start of the range. */
+        int align_start = align->matchStart - subject_range->begin;
+        int align_finish = align->matchEnd - subject_range->begin;
+
+        Blast_GetCompositionRange(&left, &right, residues, range_length,
+                                  align_start, align_finish);
+    } else {
+        /* This is not a tblastn search; use the whole subject when
+         * computing the composition */
+        left = 0;
+        right = range_length;
+    }
+    Blast_ReadAaComposition(subject_composition, &residues[left],
+                            right - left);
+}
+
+
+/**
+ * Compute an evalue from a score and a set of statistical parameters,
+ * as searchsp * exp(logK - Lambda * score).
+ */
+static double
+s_EvalueFromScore(int score, double Lambda, double logK, double searchsp)
+{
+    double exponent = -(Lambda * score) + logK;
+
+    return searchsp * exp(exponent);
+}
+
+
+/**
+ * The number of bits by which the score of a previously computed
+ * alignment must exceed the score of the HSP under consideration for
+ * a containment relationship to be reported by the s_IsContained
+ * routine. */
+#define KAPPA_BIT_TOL 2.0
+
+
+/** True if c lies in the closed interval [a, b] and f lies in the
+ * closed interval [d, e].  Every argument is parenthesized in the
+ * expansion, so that any expression may be passed; c and f are each
+ * evaluated twice. */
+#define KAPPA_CONTAINED_IN_HSP(a,b,c,d,e,f) \
+((((a) <= (c)) && ((b) >= (c))) && (((d) <= (f)) && ((e) >= (f))))
+/** The sign of a: 1 if a is positive, -1 if a is negative and 0
+ * otherwise.  The argument may be evaluated twice. */
+#define KAPPA_SIGN(a) (((a) > 0) ? 1 : (((a) < 0) ? -1 : 0))
+/**
+ * Return true if an alignment is contained in a previously-computed
+ * alignment of sufficiently high score.  An alignment of the list
+ * qualifies if its frame has the same sign as the frame of in_align,
+ * if both endpoints of in_align lie within its query and subject
+ * ranges, and if its score is at least the score of in_align plus
+ * KAPPA_BIT_TOL bits.
+ *
+ * @param in_align    the alignment to be tested
+ * @param alignments  list of alignments
+ * @param lambda      Karlin-Altschul statistical parameter
+ */
+static Boolean
+s_IsContained(BlastCompo_Alignment * in_align,
+              BlastCompo_Alignment * alignments,
+              double lambda)
+{
+    BlastCompo_Alignment * other;    /* the element of alignments that
+                                        is being examined */
+    /* Endpoints of in_align */
+    int q_start = in_align->queryStart;
+    int q_end   = in_align->queryEnd;
+    int s_start = in_align->matchStart;
+    int s_end   = in_align->matchEnd;
+    int in_sign = KAPPA_SIGN(in_align->frame);
+    /* the lowest score at which an element of alignments can be said
+       to contain in_align */
+    double in_score = in_align->score;
+    double min_score = in_score + KAPPA_BIT_TOL * LOCAL_LN2/lambda;
+
+    for (other = alignments; other != NULL; other = other->next) {
+        int start_inside;   /* does other cover the start of in_align? */
+        int end_inside;     /* does other cover the end of in_align? */
+
+        if (KAPPA_SIGN(other->frame) != in_sign) {
+            /* the two alignments are on different strands */
+            continue;
+        }
+        start_inside =
+            KAPPA_CONTAINED_IN_HSP(other->queryStart, other->queryEnd,
+                                   q_start,
+                                   other->matchStart, other->matchEnd,
+                                   s_start);
+        end_inside =
+            KAPPA_CONTAINED_IN_HSP(other->queryStart, other->queryEnd,
+                                   q_end,
+                                   other->matchStart, other->matchEnd,
+                                   s_end);
+        if (start_inside && end_inside && min_score <= other->score) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+/** Free a set of Blast_RedoAlignParams, together with the matrix
+ * info and the gapping parameters that it owns, and set *pparams to
+ * NULL.  Nothing is done if *pparams is already NULL. */
+void
+Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams)
+{
+    Blast_RedoAlignParams * params = *pparams;
+
+    if (params == NULL) {
+        return;
+    }
+    Blast_MatrixInfoFree(&params->matrix_info);
+    free(params->gapping_params);
+    free(params);
+    *pparams = NULL;
+}
+
+/** Create new Blast_RedoAlignParams object.  The parameters of this
+ * function correspond directly to the fields of
+ * Blast_RedoAlignParams.  The new Blast_RedoAlignParams object takes
+ * possession of *pmatrix_info and *pgapping_params, so these values
+ * are set to NULL on exit.
+ *
+ * If the object cannot be allocated, *pmatrix_info and
+ * *pgapping_params are released in the same way that
+ * Blast_RedoAlignParamsFree would release them, and are likewise set
+ * to NULL.
+ *
+ * @return the new object, or NULL if it could not be allocated */
+Blast_RedoAlignParams *
+Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info,
+                         BlastCompo_GappingParams ** pgapping_params,
+                         int adjustParameters, int positionBased,
+                         int subject_is_translated,
+                         int ccat_query_length, int cutoff_s,
+                         double cutoff_e, int do_link_hsps, double Lambda,
+                         double logK,
+                         const Blast_RedoAlignCallbacks * callbacks)
+{
+    Blast_RedoAlignParams * params = malloc(sizeof(Blast_RedoAlignParams));
+    if (params) {
+        params->matrix_info = *pmatrix_info;
+        *pmatrix_info = NULL;
+        params->gapping_params = *pgapping_params;
+        *pgapping_params = NULL;
+
+        params->adjustParameters = adjustParameters;
+        params->positionBased = positionBased;
+        params->RE_pseudocounts = kReMatrixAdjustmentPseudocounts;
+        params->subject_is_translated = subject_is_translated;
+        params->ccat_query_length = ccat_query_length;
+        params->cutoff_s = cutoff_s;
+        params->cutoff_e = cutoff_e;
+        params->do_link_hsps = do_link_hsps;
+        params->Lambda = Lambda;
+        params->logK = logK;
+        params->callbacks = callbacks;
+    } else {
+        /* The matrix info is released with Blast_MatrixInfoFree, the
+           routine that Blast_RedoAlignParamsFree uses for it, rather
+           than with a bare free. */
+        if (*pmatrix_info != NULL) {
+            Blast_MatrixInfoFree(pmatrix_info);
+        }
+        *pmatrix_info = NULL;
+        free(*pgapping_params); *pgapping_params = NULL;
+    }
+    return params;
+}
+
+
+/**
+ * Recompute all alignments for one query/subject pair using
+ * composition-based statistics or composition-based matrix adjustment.
+ *
+ * @param alignments an array of lists containing the newly
+ * computed alignments. There is one array
+ * element for each query in the original
+ * search
+ * @param params parameters used to redo the alignments
+ * @param incoming_aligns a list of existing alignments
+ * @param hspcnt length of incoming_aligns
+ * @param matchingSeq the database sequence
+ * @param ccat_query_length the length of the concatenated query
+ * @param query information about all queries
+ * @param numQueries the number of queries
+ * @param matrix the scoring matrix
+ * @param NRrecord a workspace used to adjust the composition.
+ *
+ * @return 0 on success, -1 on out-of-memory
+ */
+int
+Blast_RedoOneMatch(BlastCompo_Alignment ** alignments,
+                   Blast_RedoAlignParams * params,
+                   BlastCompo_Alignment * incoming_aligns, int hspcnt,
+                   BlastCompo_MatchingSequence * matchingSeq,
+                   int ccat_query_length, BlastCompo_QueryInfo query_info[],
+                   int numQueries, int ** matrix,
+                   Blast_CompositionWorkspace * NRrecord)
+{
+    int status = 0;                  /* return status */
+    /* windows and nWindows are read by the code at
+     * function_level_cleanup even when s_WindowsFromAligns fails, so
+     * they start out with values that make that cleanup a no-op. */
+    s_WindowInfo **windows = NULL;   /* array of windows */
+    int nWindows = 0;                /* length of windows */
+    int window_index;                /* loop index */
+    int query_index;                 /* index of the current query */
+    /* which mode of composition adjustment is actually used? */
+    ECompoAdjustModes whichMode = eNoCompositionAdjustment;
+
+    /* fields of params, as local variables */
+    Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info;
+    int adjustParameters = params->adjustParameters;
+    int positionBased = params->positionBased;
+    int RE_rule = params->adjustParameters - 1;
+    int RE_pseudocounts = params->RE_pseudocounts;
+    int subject_is_translated = params->subject_is_translated;
+    double Lambda = params->Lambda;
+    BlastCompo_GappingParams * gapping_params = params->gapping_params;
+    const Blast_RedoAlignCallbacks * callbacks = params->callbacks;
+
+    assert(adjustParameters < 2 || !positionBased);
+    /* every query starts with an empty list of redone alignments */
+    for (query_index = 0;  query_index < numQueries;  query_index++) {
+        alignments[query_index] = NULL;
+    }
+    status =
+        s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries,
+                            kWindowBorder, matchingSeq->length, &windows,
+                            &nWindows, subject_is_translated);
+    if (status != 0) {
+        goto function_level_cleanup;
+    }
+    /* for all windows */
+    for (window_index = 0;  window_index < nWindows;  window_index++) {
+        s_WindowInfo * window;           /* the current window */
+        BlastCompo_Alignment * in_align; /* the current alignment */
+        int hsp_index;                   /* index of the current alignment */
+        /* data for the current window */
+        BlastCompo_SequenceData subject = {0,};
+        BlastCompo_SequenceData * query; /* query data for this window */
+        /* the composition of this query */
+        Blast_AminoAcidComposition * query_composition;
+
+        window = windows[window_index];
+        status =
+            callbacks->get_range(matchingSeq, &window->subject_range,
+                                 &subject);
+        if (status != 0) {
+            goto window_index_loop_cleanup;
+        }
+        /* for all alignments in this window */
+        for (in_align = window->align, hsp_index = 0;
+             in_align != NULL;
+             in_align = in_align->next, hsp_index++) {
+            query_index = in_align->queryIndex;
+            query = &query_info[query_index].seq;
+            query_composition = &query_info[query_index].composition;
+            /* if in_align is not contained in a higher-scoring
+             * alignment */
+            if ( !s_IsContained(in_align, alignments[query_index], Lambda) ) {
+                BlastCompo_Alignment * newAlign;   /* the new alignment */
+                /* adjust_search_failed is true only if Blast_AdjustScores
+                 * is called and returns a nonzero value */
+                int adjust_search_failed = 0;
+                /* The scores are adjusted for every alignment when the
+                 * subject is translated, but only for the first
+                 * alignment of the window otherwise; later alignments
+                 * then reuse the matrix and whichMode left behind by
+                 * that first call. */
+                if (adjustParameters &&
+                    (subject_is_translated || hsp_index == 0)) {
+                    Blast_AminoAcidComposition subject_composition;
+                    s_GetSubjectComposition(&subject_composition,
+                                            &subject,
+                                            &window->subject_range,
+                                            in_align);
+                    adjust_search_failed =
+                        Blast_AdjustScores(matrix, query_composition,
+                                           query->length,
+                                           &subject_composition,
+                                           subject.length,
+                                           scaledMatrixInfo, RE_rule,
+                                           RE_pseudocounts, NRrecord,
+                                           &whichMode,
+                                           callbacks->calc_lambda);
+                    if (adjust_search_failed < 0) { /* fatal error */
+                        status = adjust_search_failed;
+                        goto window_index_loop_cleanup;
+                    }
+                }
+                /* a positive value from Blast_AdjustScores is not
+                 * fatal; the alignment is simply not redone */
+                if ( !adjust_search_failed ) {
+                    newAlign =
+                        callbacks->
+                        redo_one_alignment(in_align, whichMode,
+                                           query, &window->query_range,
+                                           ccat_query_length,
+                                           &subject, &window->subject_range,
+                                           matchingSeq->length,
+                                           gapping_params);
+                    s_WithDistinctEnds(&newAlign, &alignments[query_index],
+                                       callbacks->free_align_traceback);
+                }
+            } /* end if in_align is not contained...*/
+        } /* end for all alignments in this window */
+window_index_loop_cleanup:
+        /* release the subject data on both the normal and error paths */
+        if (subject.data != NULL)
+            s_SequenceDataRelease(&subject);
+        if (status != 0)
+            goto function_level_cleanup;
+    } /* end for all windows */
+function_level_cleanup:
+    if (status != 0) {
+        /* on failure, no partial results are handed back */
+        for (query_index = 0;  query_index < numQueries;  query_index++) {
+            BlastCompo_AlignmentsFree(&alignments[query_index],
+                                      callbacks->free_align_traceback);
+        }
+    }
+    for (window_index = 0;  window_index < nWindows;  window_index++) {
+        s_WindowInfoFree(&windows[window_index]);
+    }
+    free(windows);
+
+    return status;
+}
+
+
+/**
+ * Recompute all alignments for one query/subject pair using the
+ * Smith-Waterman algorithm and possibly also composition-based
+ * statistics or composition-based matrix adjustment.
+ *
+ * @param alignments       an array of lists containing the newly
+ *                         computed alignments.  There is one array
+ *                         element for each query in the original
+ *                         search.  Every element is reset to NULL on
+ *                         entry, and every list is freed again if this
+ *                         routine fails.
+ * @param params           parameters used to redo the alignments
+ * @param incoming_aligns  a list of existing alignments
+ * @param hspcnt           length of incoming_aligns
+ * @param matchingSeq      the database sequence
+ * @param query_info       information about all queries
+ * @param numQueries       the number of queries
+ * @param matrix           the scoring matrix
+ * @param NRrecord         a workspace used to adjust the composition.
+ * @param forbidden        a workspace used to hold forbidden ranges
+ *                         for the Smith-Waterman algorithm.
+ * @param significantMatches   an array of heaps of alignments for
+ *                         query-subject pairs that have already
+ *                         been redone; used to terminate the
+ *                         Smith-Waterman algorithm early if it is
+ *                         clear that the current match is not
+ *                         significant enough to be saved.
+ *
+ * @return 0 on success; otherwise the nonzero status of the step that
+ *         failed (-1 on out-of-memory)
+ */
+int
+Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments,
+                                Blast_RedoAlignParams * params,
+                                BlastCompo_Alignment * incoming_aligns,
+                                int hspcnt,
+                                BlastCompo_MatchingSequence * matchingSeq,
+                                BlastCompo_QueryInfo query_info[],
+                                int numQueries,
+                                int ** matrix,
+                                Blast_CompositionWorkspace * NRrecord,
+                                Blast_ForbiddenRanges * forbidden,
+                                BlastCompo_Heap * significantMatches)
+{
+    int status = 0;                     /* status return value */
+    /* windows and nWindows are read by the code at
+     * function_level_cleanup even when s_WindowsFromAligns fails, so
+     * both start out with values that make that cleanup a no-op. */
+    s_WindowInfo **windows = NULL;      /* array of windows */
+    int nWindows = 0;                   /* length of windows */
+    int window_index;                   /* loop index */
+    int query_index;                    /* index of the current query */
+    /* which mode of composition adjustment is actually used? */
+    ECompoAdjustModes whichMode = eNoCompositionAdjustment;
+
+    /* fields of params, as local variables */
+    Blast_MatrixInfo * scaledMatrixInfo = params->matrix_info;
+    int adjustParameters = params->adjustParameters;
+    int positionBased = params->positionBased;
+    int RE_rule = params->adjustParameters - 1;
+    int RE_pseudocounts = params->RE_pseudocounts;
+    int subject_is_translated = params->subject_is_translated;
+    int do_link_hsps = params->do_link_hsps;
+    int ccat_query_length = params->ccat_query_length;
+    BlastCompo_GappingParams * gapping_params = params->gapping_params;
+    double Lambda = params->Lambda;
+    double logK = params->logK;
+    const Blast_RedoAlignCallbacks * callbacks = params->callbacks;
+
+    int gap_open = gapping_params->gap_open;
+    int gap_extend = gapping_params->gap_extend;
+
+    assert(adjustParameters < 2 || !positionBased);
+    /* every query starts with an empty list of redone alignments */
+    for (query_index = 0;  query_index < numQueries;  query_index++) {
+        alignments[query_index] = NULL;
+    }
+    /* Find the multiple translation windows used by tblastn queries. */
+    status =
+        s_WindowsFromAligns(incoming_aligns, query_info, hspcnt, numQueries,
+                            kWindowBorder, matchingSeq->length, &windows,
+                            &nWindows, subject_is_translated);
+    if (status != 0)
+        goto function_level_cleanup;
+    /* We are performing a Smith-Waterman alignment */
+    for (window_index = 0;  window_index < nWindows;  window_index++) {
+        /* for all windows */
+        s_WindowInfo * window = NULL;    /* the current window */
+        BlastCompo_SequenceData subject = {0,};
+        /* subject data for this window */
+        BlastCompo_SequenceData * query; /* query data for this window */
+        /* the composition of this query */
+        Blast_AminoAcidComposition * query_composition;
+        double searchsp;                 /* effective search space */
+
+        /* adjust_search_failed is true only if Blast_AdjustScores
+         * is called and returns a nonzero value */
+        int adjust_search_failed = FALSE;
+
+        window = windows[window_index];
+        /* the context of a window's query range is the query index */
+        query_index = window->query_range.context;
+        query = &query_info[query_index].seq;
+        query_composition = &query_info[query_index].composition;
+        searchsp = query_info[query_index].eff_search_space;
+
+        status = callbacks->get_range(matchingSeq, &window->subject_range,
+                                      &subject);
+        if (status != 0)
+            goto window_index_loop_cleanup;
+
+        /* For Smith-Waterman alignments, adjust the search using the
+         * composition of the highest scoring alignment in window */
+        if (adjustParameters) {
+            Blast_AminoAcidComposition subject_composition;
+            s_GetSubjectComposition(&subject_composition,
+                                    &subject, &window->subject_range,
+                                    window->align);
+            adjust_search_failed =
+                Blast_AdjustScores(matrix,
+                                   query_composition, query->length,
+                                   &subject_composition, subject.length,
+                                   scaledMatrixInfo,
+                                   RE_rule, RE_pseudocounts, NRrecord,
+                                   &whichMode, callbacks->calc_lambda);
+            if (adjust_search_failed < 0) { /* fatal error */
+                status = adjust_search_failed;
+                goto window_index_loop_cleanup;
+            }
+        }
+        if ( !adjust_search_failed ) {
+            /* Blast_AdjustScores ran without error (or was not
+             * needed); compute the new alignments. */
+            int aSwScore;                 /* score computed by the
+                                           * Smith-Waterman algorithm. */
+            int alignment_is_significant; /* True if the score/evalue of
+                                           * the Smith-Waterman alignment
+                                           * is significant. */
+            Blast_ForbiddenRangesClear(forbidden);
+            do {
+                int matchEnd, queryEnd;   /* end points of the alignments
+                                           * computed by the Smith-Waterman
+                                           * algorithm. */
+                status =
+                    Blast_SmithWatermanScoreOnly(&aSwScore, &matchEnd,
+                                                 &queryEnd,
+                                                 subject.data,
+                                                 subject.length,
+                                                 query->data,
+                                                 query->length, matrix,
+                                                 gap_open, gap_extend,
+                                                 positionBased,
+                                                 forbidden);
+                if (status != 0)
+                    goto window_index_loop_cleanup;
+
+                if (do_link_hsps) {
+                    /* with HSP linking, significance is judged by raw
+                     * score alone */
+                    alignment_is_significant = aSwScore >= params->cutoff_s;
+                } else {
+                    double newSwEvalue; /* evalue as computed by the
+                                         * Smith-Waterman algorithm */
+                    newSwEvalue =
+                        s_EvalueFromScore(aSwScore, Lambda, logK, searchsp);
+
+                    alignment_is_significant = newSwEvalue < params->cutoff_e;
+                    if (alignments[query_index] == NULL) {
+                        /* this is the most significant alignment; if
+                         * it will not be accepted, no alignments from
+                         * this match will */
+                        alignment_is_significant =
+                            alignment_is_significant &&
+                            BlastCompo_HeapWouldInsert(
+                                &significantMatches[query_index],
+                                newSwEvalue, aSwScore, matchingSeq->index);
+                    }
+                }
+                if (alignment_is_significant) {
+                    /* the redone alignment */
+                    BlastCompo_Alignment * newAlign;
+                    int matchStart, queryStart;  /* the start of the
+                                                  * alignment in the
+                                                  * match/query sequence */
+                    int updatedScore;            /* score found by the SW
+                                                    algorithm run in reverse */
+                    status =
+                        Blast_SmithWatermanFindStart(&updatedScore,
+                                                     &matchStart,
+                                                     &queryStart,
+                                                     subject.data,
+                                                     subject.length,
+                                                     query->data,
+                                                     matrix, gap_open,
+                                                     gap_extend,
+                                                     matchEnd,
+                                                     queryEnd,
+                                                     aSwScore,
+                                                     positionBased,
+                                                     forbidden);
+                    if (status != 0) {
+                        goto window_index_loop_cleanup;
+                    }
+                    status =
+                        callbacks->
+                        new_xdrop_align(&newAlign, &queryEnd, &matchEnd,
+                                        queryStart, matchStart, aSwScore,
+                                        query, &window->query_range,
+                                        ccat_query_length,
+                                        &subject, &window->subject_range,
+                                        matchingSeq->length,
+                                        gapping_params, whichMode);
+                    if (status != 0) {
+                        goto window_index_loop_cleanup;
+                    }
+                    /* push onto the front of the list, so the list is
+                     * in the reverse of the order of computation */
+                    newAlign->next = alignments[query_index];
+                    alignments[query_index] = newAlign;
+
+                    if (window->hspcnt > 1) {
+                        /* We may compute more alignments; make the range
+                           of the current alignment forbidden */
+                        status =
+                            Blast_ForbiddenRangesPush(forbidden,
+                                                      queryStart, queryEnd,
+                                                      matchStart, matchEnd);
+                    }
+                    if (status != 0) {
+                        goto window_index_loop_cleanup;
+                    }
+                }
+                /* end if the next local alignment is significant */
+            } while (alignment_is_significant && window->hspcnt > 1);
+            /* end do..while the next local alignment is significant, and
+             * the original blast search found more than one alignment. */
+        } /* end if Blast_AdjustScores ran without error. */
+window_index_loop_cleanup:
+        /* release the subject data on both the normal and error paths */
+        if (subject.data != NULL)
+            s_SequenceDataRelease(&subject);
+        if (status != 0)
+            goto function_level_cleanup;
+    } /* end for all windows */
+
+function_level_cleanup:
+    if (status != 0) {
+        /* on failure, no partial results are handed back */
+        for (query_index = 0;  query_index < numQueries;  query_index++) {
+            BlastCompo_AlignmentsFree(&alignments[query_index],
+                                      callbacks->free_align_traceback);
+        }
+    }
+    for (window_index = 0;  window_index < nWindows;  window_index++) {
+        s_WindowInfoFree(&windows[window_index]);
+    }
+    free(windows);
+
+    return status;
+}
+
+
+/** Heuristic used to stop the main loop that redoes all alignments:
+ * return true when redoing a query-subject pair with the given evalue
+ * is unlikely to be worthwhile for any of the queries.
+ *
+ * @param evalue               evalue of the candidate query-subject pair
+ * @param significantMatches   one heap of already-redone matches per
+ *                             query
+ * @param numQueries           length of significantMatches
+ *
+ * @return TRUE if the pair may be skipped, FALSE if it should be redone
+ */
+int
+BlastCompo_EarlyTermination(double evalue,
+                            BlastCompo_Heap significantMatches[],
+                            int numQueries)
+{
+    int query_index;
+    for (query_index = 0;  query_index < numQueries;  query_index++) {
+        double ecutoff;
+        if ( !BlastCompo_HeapFilledToCutoff(&significantMatches[query_index]) ) {
+            /* This heap has not yet been filled to its cutoff, so the
+             * pair must still be redone. */
+            return FALSE;
+        }
+        ecutoff = significantMatches[query_index].ecutoff;
+        if (evalue <= EVALUE_STRETCH * ecutoff) {
+            /* The evalue of this match is small enough that redoing
+             * it might yield an alignment with evalue smaller than
+             * ecutoff. */
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
diff --git a/algo/blast/composition_adjustment/redo_alignment.h b/algo/blast/composition_adjustment/redo_alignment.h
new file mode 100644
index 00000000..948fac72
--- /dev/null
+++ b/algo/blast/composition_adjustment/redo_alignment.h
@@ -0,0 +1,333 @@
+/* $Id: redo_alignment.h,v 1.1 2005/12/01 13:52:42 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file redo_alignment.h
+ * @author Alejandro Schaffer, E. Michael Gertz
+ *
+ * Definitions used to redo a set of alignments, using either
+ * composition matrix adjustment or the Smith-Waterman algorithm (or
+ * both.)
+ *
+ * Definitions with the prefix 'BlastCompo_' are primarily intended for use
+ * by glue code that interfaces with this module, i.e. the definitions
+ * need to be externally available so that glue code may be written, but
+ * are not intended for general use.
+ */
+#ifndef __REDO_ALIGNMENT__
+#define __REDO_ALIGNMENT__
+
+#include <algo/blast/composition_adjustment/composition_adjustment.h>
+#include <algo/blast/composition_adjustment/smith_waterman.h>
+#include <algo/blast/composition_adjustment/compo_heap.h>
+
+
+/**
+ * A BlastCompo_Alignment describes one distinct alignment of the query
+ * sequence to the current subject sequence, as seen by the composition
+ * adjustment module.  These objects are usually chained through the
+ * next field into a singly linked list, held in the reverse of the
+ * order in which the alignments were computed.
+ */
+struct BlastCompo_Alignment {
+    /** the score of this alignment */
+    int score;
+    /** how the score was computed */
+    ECompoAdjustModes comp_adjustment_mode;
+    /** index of the query in a concatenated query */
+    int queryIndex;
+    /** the start of the alignment in the query */
+    int queryStart;
+    /** one past the end of the alignment in the query */
+    int queryEnd;
+    /** the start of the alignment in the subject */
+    int matchStart;
+    /** one past the end of the alignment in the subject */
+    int matchEnd;
+    /** the subject frame */
+    int frame;
+    /** traceback info for a gapped alignment */
+    void * context;
+    /** the next alignment in the list */
+    struct BlastCompo_Alignment * next;
+};
+typedef struct BlastCompo_Alignment BlastCompo_Alignment;
+
+/**
+ * Create a BlastCompo_Alignment.  Each parameter has the same name as
+ * a field of BlastCompo_Alignment.
+ * NOTE(review): the definition is not in this header; presumably each
+ * argument initializes the field of the same name and the result is
+ * released with BlastCompo_AlignmentsFree -- confirm.
+ */
+NCBI_XBLAST_EXPORT
+BlastCompo_Alignment *
+BlastCompo_AlignmentNew(int score,
+                        ECompoAdjustModes comp_adjustment_mode,
+                        int queryIndex,
+                        int queryStart,
+                        int queryEnd,
+                        int matchStart,
+                        int matchEnd,
+                        int frame,
+                        void * context);
+
+/**
+ * Free the list of alignments whose head is *palign.
+ *
+ * @param palign         address of the head of a list of alignments
+ * @param free_context   function supplied by the caller; Blast_RedoOneMatch
+ *                       passes its free_align_traceback callback here.
+ *                       NOTE(review): presumably applied to the context
+ *                       field of each alignment -- confirm against the
+ *                       definition.
+ */
+NCBI_XBLAST_EXPORT
+void BlastCompo_AlignmentsFree(BlastCompo_Alignment ** palign,
+                               void (*free_context)(void*));
+
+/** The parameters needed to compute a gapped alignment */
+typedef struct BlastCompo_GappingParams {
+    /** penalty for opening a gap */
+    int gap_open;
+    /** penalty for extending a gapped alignment by one residue */
+    int gap_extend;
+    /** penalty for declining to align two characters */
+    int decline_align;
+    /** for x-drop algorithms, once a path falls below the best score
+        by this (positive) amount, the path is no longer searched */
+    int x_dropoff;
+    /** a pointer to any additional gapping parameters that may be
+        needed by the calling routine. */
+    void * context;
+} BlastCompo_GappingParams;
+
+
+/**
+ * BlastCompo_SequenceRange - a struct whose instances represent a
+ * half-open range [begin, end) of data in a sequence. */
+typedef struct BlastCompo_SequenceRange
+{
+ int begin; /**< the starting index of the range */
+ int end; /**< one beyond the last item in the range */
+ int context; /**< integer identifier for this window, can
+ indicate a translation frame or an index into a
+ set of sequences.  Blast_RedoOneMatchSmithWaterman
+ uses the context of a window's query range as the
+ index of the query. */
+} BlastCompo_SequenceRange;
+
+
+/**
+ * BlastCompo_SequenceData - represents a string of amino acids or nucleotides.
+ * Instances are filled in by a get_range_type callback.
+ */
+typedef struct BlastCompo_SequenceData {
+ Uint1 * data; /**< amino acid or nucleotide data; NULL while the
+ instance holds no data */
+ int length; /**< the length of data. For amino acid data
+ &data[-1] is a valid address and
+ data[-1] == 0. */
+ Uint1 * buffer; /**< if non-NULL, points to memory that
+ must be freed when this instance of
+ BlastCompo_SequenceData is deleted. */
+} BlastCompo_SequenceData;
+
+
+/**
+ * A BlastCompo_MatchingSequence represents a subject sequence to be aligned
+ * with the query. This abstract sequence is used to hide the
+ * complexity associated with actually obtaining and releasing the
+ * data for a matching sequence, e.g. reading the sequence from a DB
+ * or translating it from a nucleotide sequence.
+ *
+ * We draw a distinction between a sequence itself, and strings of
+ * data that may be obtained from the sequence. The amino
+ * acid/nucleotide data is represented by an object of type
+ * BlastCompo_SequenceData. There may be more than one instance of
+ * BlastCompo_SequenceData per BlastCompo_MatchingSequence, each representing a
+ * different range in the sequence, or a different translation frame.
+ */
+typedef struct BlastCompo_MatchingSequence {
+ Int4 length; /**< length of this matching sequence */
+ Int4 index; /**< index of this sequence in the database */
+ void * local_data; /**< opaque pointer; presumably holds whatever the
+ glue code's get_range callback needs in order
+ to fetch the sequence data -- TODO confirm */
+} BlastCompo_MatchingSequence;
+
+
+/** Everything this module records about a single query */
+typedef struct BlastCompo_QueryInfo {
+    /** origin of the query in a concatenated query */
+    int origin;
+    /** sequence data for the query */
+    BlastCompo_SequenceData seq;
+    /** the composition of the query */
+    Blast_AminoAcidComposition composition;
+    /** effective search space of searches involving this query */
+    double eff_search_space;
+} BlastCompo_QueryInfo;
+
+
+/** Callbacks **/
+
+/** Function type: calculate the statistical parameter Lambda from a
+ * set of score probabilities.  A function of this type is stored in
+ * the calc_lambda field of Blast_RedoAlignCallbacks and is handed to
+ * Blast_AdjustScores by the Blast_RedoOneMatch routines.
+ *
+ * @param probs an array of score probabilities
+ * @param min_score the score corresponding to probs[0]
+ * @param max_score the largest score in the probs array
+ * @param lambda0 an initial guess for Lambda
+ * @return Lambda
+ */
+typedef double
+calc_lambda_type(double * probs, int min_score, int max_score,
+ double lambda0);
+
+/**
+ * Function type: Get a range of data for a sequence.
+ *
+ * @param sequence a sequence
+ * @param range the range to get
+ * @param data the data obtained
+ * @return 0 on success; the Blast_RedoOneMatch routines treat any
+ * nonzero value as an error and return it to their caller
+ */
+typedef int
+get_range_type(const BlastCompo_MatchingSequence * sequence,
+ const BlastCompo_SequenceRange * range,
+ BlastCompo_SequenceData * data);
+
+/**
+ * Function type: Calculate the traceback for one alignment by
+ * performing an x-drop alignment in both directions
+ *
+ * @param in_align the existing alignment, without traceback
+ * @param whichMode which mode of composition adjustment has
+ * been used to adjust the scoring matrix
+ * @param query_data query sequence data
+ * @param query_range range of this query in the concatenated
+ * query
+ * @param ccat_query_length total length of the concatenated query
+ * @param subject_data subject sequence data
+ * @param subject_range range of subject_data in the translated
+ * query, in amino acid coordinates
+ * @param full_subject_length length of the full subject sequence
+ * @param gapping_params parameters used to compute gapped
+ * alignments
+ * @return the recomputed alignment; Blast_RedoOneMatch
+ * merges it into its list of new alignments
+ */
+typedef BlastCompo_Alignment *
+redo_one_alignment_type(BlastCompo_Alignment * in_align,
+ ECompoAdjustModes whichMode,
+ BlastCompo_SequenceData * query_data,
+ BlastCompo_SequenceRange * query_range,
+ int ccat_query_length,
+ BlastCompo_SequenceData * subject_data,
+ BlastCompo_SequenceRange * subject_range,
+ int full_subject_length,
+ BlastCompo_GappingParams * gapping_params);
+
+/**
+ * Function type: Calculate the traceback for one alignment by
+ * performing an x-drop alignment in the forward direction, possibly
+ * increasing the x-drop parameter until the desired score is
+ * attained.
+ *
+ * The start, end and score of the alignment should be obtained
+ * using the Smith-Waterman algorithm before this routine is called.
+ *
+ * @param *palign the new alignment
+ * @param *pqueryEnd on entry, the end of the alignment in the
+ * query, as computed by the Smith-Waterman
+ * algorithm. On exit, the end as computed by
+ * the x-drop algorithm
+ * @param *pmatchEnd like *pqueryEnd, but for the subject
+ * sequence
+ * @param queryStart the starting point in the query
+ * @param matchStart the starting point in the subject
+ * @param score the score of the alignment, as computed by
+ * the Smith-Waterman algorithm
+ * @param query query sequence data
+ * @param query_range range of this query in the concatenated
+ * query
+ * @param ccat_query_length total length of the concatenated query
+ * @param subject subject sequence data
+ * @param subject_range range of subject_data in the translated
+ * query, in amino acid coordinates
+ * @param full_subject_length length of the full subject sequence
+ * @param gapping_params parameters used to compute gapped
+ * alignments
+ * @param whichMode which mode of composition adjustment has
+ * been used to adjust the scoring matrix
+ * @return 0 on success, -1 for out-of-memory error
+ */
+typedef int
+new_xdrop_align_type(BlastCompo_Alignment **palign,
+ Int4 * pqueryEnd, Int4 * pmatchEnd,
+ Int4 queryStart, Int4 matchStart, Int4 score,
+ BlastCompo_SequenceData * query,
+ BlastCompo_SequenceRange * query_range,
+ Int4 ccat_query_length,
+ BlastCompo_SequenceData * subject,
+ BlastCompo_SequenceRange * subject_range,
+ Int4 full_subject_length,
+ BlastCompo_GappingParams * gapping_params,
+ ECompoAdjustModes whichMode);
+
+/** Callbacks used by Blast_RedoOneMatch and
+ * Blast_RedoOneMatchSmithWaterman routines */
+struct Blast_RedoAlignCallbacks {
+ calc_lambda_type * calc_lambda; /**< passed on to Blast_AdjustScores */
+ get_range_type * get_range; /**< called once per window to obtain the
+ subject data for that window */
+ redo_one_alignment_type * redo_one_alignment; /**< used by
+ Blast_RedoOneMatch to recompute each alignment */
+ new_xdrop_align_type * new_xdrop_align; /**< used by
+ Blast_RedoOneMatchSmithWaterman to build each
+ significant alignment */
+ void (*free_align_traceback)(void*); /**< passed to
+ BlastCompo_AlignmentsFree when alignments are
+ discarded */
+};
+typedef struct Blast_RedoAlignCallbacks Blast_RedoAlignCallbacks;
+
+/** A parameter block for the Blast_RedoOneMatch and
+ * Blast_RedoOneMatchSmithWaterman routines */
+struct Blast_RedoAlignParams {
+ Blast_MatrixInfo * matrix_info; /**< passed to Blast_AdjustScores */
+ BlastCompo_GappingParams * gapping_params; /**< passed to the alignment
+ callbacks; the Smith-Waterman routines use its
+ gap_open and gap_extend fields */
+ int adjustParameters; /**< if nonzero, scores are adjusted with
+ Blast_AdjustScores; this value minus one is
+ passed as its RE_rule argument */
+ int positionBased; /**< passed to the Smith-Waterman routines;
+ asserted to be false whenever
+ adjustParameters is 2 or more */
+ int RE_pseudocounts; /**< passed to Blast_AdjustScores */
+ int subject_is_translated; /**< if true, Blast_RedoOneMatch re-adjusts
+ the scores for every alignment rather than
+ only the first alignment of each window */
+ int ccat_query_length; /**< length of the concatenated query, as read
+ by Blast_RedoOneMatchSmithWaterman */
+ int cutoff_s; /**< when do_link_hsps is true, a Smith-Waterman
+ alignment is significant if its score is at
+ least this value */
+ double cutoff_e; /**< when do_link_hsps is false, a Smith-Waterman
+ alignment is significant if its evalue is
+ below this value */
+ int do_link_hsps; /**< selects whether cutoff_s or cutoff_e is
+ applied */
+ double Lambda; /**< statistical parameter used to convert scores
+ to evalues */
+ double logK; /**< statistical parameter used to convert scores
+ to evalues */
+ const Blast_RedoAlignCallbacks * callbacks; /**< the callbacks invoked
+ while redoing alignments */
+};
+typedef struct Blast_RedoAlignParams Blast_RedoAlignParams;
+
+
+/**
+ * Create a Blast_RedoAlignParams.  Apart from the two pointer-to-pointer
+ * arguments, each parameter has the same name as a field of
+ * Blast_RedoAlignParams.
+ * NOTE(review): matrix_info and gapping_params are passed by address,
+ * which suggests that ownership moves into the new object -- confirm
+ * against the definition.  There is no parameter for the
+ * RE_pseudocounts field.
+ */
+NCBI_XBLAST_EXPORT
+Blast_RedoAlignParams *
+Blast_RedoAlignParamsNew(Blast_MatrixInfo ** pmatrix_info,
+                         BlastCompo_GappingParams ** pgapping_params,
+                         int adjustParameters,
+                         int positionBased,
+                         int subject_is_translated,
+                         int ccat_query_length,
+                         int cutoff_s,
+                         double cutoff_e,
+                         int do_link_hsps,
+                         double Lambda,
+                         double logK,
+                         const Blast_RedoAlignCallbacks * callbacks);
+
+/** Free a Blast_RedoAlignParams.
+ * @param pparams   address of the pointer to the object to be freed
+ */
+NCBI_XBLAST_EXPORT
+void
+Blast_RedoAlignParamsFree(Blast_RedoAlignParams ** pparams);
+
+/**
+ * Recompute all alignments for one query/subject pair using the
+ * Smith-Waterman algorithm and possibly also composition-based
+ * statistics or composition-based matrix adjustment.  The parameters
+ * are described with the definition in redo_alignment.c.
+ *
+ * @return 0 on success, nonzero on failure
+ */
+NCBI_XBLAST_EXPORT
+int Blast_RedoOneMatchSmithWaterman(BlastCompo_Alignment ** alignments,
+                                    Blast_RedoAlignParams * params,
+                                    BlastCompo_Alignment * incoming_aligns,
+                                    int hspcnt,
+                                    BlastCompo_MatchingSequence * matchingSeq,
+                                    BlastCompo_QueryInfo query_info[],
+                                    int numQueries,
+                                    int ** matrix,
+                                    Blast_CompositionWorkspace * NRrecord,
+                                    Blast_ForbiddenRanges * forbidden,
+                                    BlastCompo_Heap * significantMatches);
+
+/**
+ * Recompute all alignments for one query/subject pair using
+ * composition-based statistics or composition-based matrix adjustment.
+ * The parameters are described with the definition in
+ * redo_alignment.c.
+ *
+ * @return 0 on success, nonzero on failure
+ */
+NCBI_XBLAST_EXPORT
+int Blast_RedoOneMatch(BlastCompo_Alignment ** alignments,
+                       Blast_RedoAlignParams * params,
+                       BlastCompo_Alignment * incoming_aligns,
+                       int hspcnt,
+                       BlastCompo_MatchingSequence * matchingSeq,
+                       int ccat_query_length,
+                       BlastCompo_QueryInfo query_info[],
+                       int numQueries,
+                       int ** matrix,
+                       Blast_CompositionWorkspace * NRrecord);
+
+/** Return true if a heuristic determines that it is unlikely to be
+ * worthwhile to redo a query-subject pair with the given evalue.
+ *
+ * @param evalue               evalue of the query-subject pair
+ * @param significantMatches   one heap of redone matches per query
+ * @param numQueries           length of significantMatches
+ */
+NCBI_XBLAST_EXPORT
+int
+BlastCompo_EarlyTermination(double evalue,
+                            BlastCompo_Heap significantMatches[],
+                            int numQueries);
+
+#endif
diff --git a/algo/blast/composition_adjustment/smith_waterman.c b/algo/blast/composition_adjustment/smith_waterman.c
new file mode 100644
index 00000000..66b7f9cf
--- /dev/null
+++ b/algo/blast/composition_adjustment/smith_waterman.c
@@ -0,0 +1,715 @@
+/* ===========================================================================
+*
+* PUBLIC DOMAIN NOTICE
+* National Center for Biotechnology Information
+*
+* This software/database is a "United States Government Work" under the
+* terms of the United States Copyright Act. It was written as part of
+* the author's official duties as a United States Government employee and
+* thus cannot be copyrighted. This software/database is freely available
+* to the public for use. The National Library of Medicine and the U.S.
+* Government have not placed any restriction on its use or reproduction.
+*
+* Although all reasonable efforts have been taken to ensure the accuracy
+* and reliability of the software and data, the NLM and the U.S.
+* Government do not and cannot warrant the performance or results that
+* may be obtained by using this software or data. The NLM and the U.S.
+* Government disclaim all warranties, express or implied, including
+* warranties of performance, merchantability or fitness for any particular
+* purpose.
+*
+* Please cite the author in any work or product based on this material.
+*
+* ===========================================================================*/
+
+/**
+ * @file smith_waterman.c
+ *
+ * Routines for computing rigorous, Smith-Waterman alignments.
+ */
+#ifndef SKIP_DOXYGEN_PROCESSING
+static char const rcsid[] =
+ "$Id: smith_waterman.c,v 1.1 2005/12/01 13:48:09 gertz Exp $";
+#endif /* SKIP_DOXYGEN_PROCESSING */
+
+#include <algo/blast/core/ncbi_std.h>
+#include <algo/blast/composition_adjustment/composition_constants.h>
+#include <algo/blast/composition_adjustment/smith_waterman.h>
+
+/** A structure used internally by the Smith-Waterman algorithm to
+ * represent gaps.  The scoring routines keep one of these for each
+ * position of the database sequence. */
+typedef struct SwGapInfo {
+ int noGap; /* final (best) score stored for the cell */
+ int gapExists; /* gap score carried to the same column of the next row */
+} SwGapInfo;
+
+
+/**
+ * Compute the score and right-hand endpoints of the locally optimal
+ * Smith-Waterman alignment.
+ *
+ * Only one row of the dynamic-programming matrix is kept (one cell per
+ * position of matchSeq); it is overwritten for each query position, so
+ * no traceback is available.  The endpoints reported are the zero-based
+ * positions of the first cell, in row-major order, that attains the
+ * best score.  If no cell scores above zero, the score and both
+ * endpoints are reported as 0.
+ *
+ * @param *score the computed score
+ * @param *matchSeqEnd the right-hand end of the alignment in the
+ * database sequence
+ * @param *queryEnd the right-hand end of the alignment in the
+ * query sequence
+ * @param matchSeq the database sequence data
+ * @param matchSeqLength length of matchSeq
+ * @param query the query sequence data
+ * @param queryLength length of query
+ * @param matrix amino-acid scoring matrix
+ * @param gapOpen penalty for opening a gap
+ * @param gapExtend penalty for extending a gap by one amino acid
+ * @param positionSpecific if nonzero, rows of matrix are indexed by
+ * query position; otherwise they are indexed
+ * by the residue found in query
+ *
+ * @return 0 on success, or -1 if the work array cannot be allocated,
+ * in which case the output parameters are not written
+ */
+static int
+BLbasicSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd,
+ const Uint1 * matchSeq, int matchSeqLength,
+ const Uint1 * query, int queryLength,
+ int **matrix, int gapOpen, int gapExtend,
+ int positionSpecific)
+{
+ int bestScore; /* best score seen so far */
+ int newScore; /* score of next entry */
+ int bestMatchSeqPos, bestQueryPos; /* position ending best score in
+ matchSeq and query sequences */
+ SwGapInfo *scoreVector; /* keeps one row of the
+ Smith-Waterman matrix; the old row
+ is overwritten with the new row */
+ int *matrixRow; /* one row of score matrix */
+ int newGapCost; /* cost to have a gap of one character */
+ int prevScoreNoGapMatchSeq; /* score one row and column up with
+ no gaps */
+ int prevScoreGapMatchSeq; /* score if a gap already started in
+ matchSeq */
+ int continueGapScore; /* score for continuing a gap in query */
+ int matchSeqPos, queryPos; /* positions in matchSeq and query */
+
+ scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
+ if (scoreVector == NULL) {
+ return -1; /* out of memory; outputs are left untouched */
+ }
+ bestMatchSeqPos = 0;
+ bestQueryPos = 0;
+ bestScore = 0;
+ newGapCost = gapOpen + gapExtend;
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ scoreVector[matchSeqPos].noGap = 0;
+ scoreVector[matchSeqPos].gapExists = -gapOpen;
+ }
+ for (queryPos = 0; queryPos < queryLength; queryPos++) {
+ if (positionSpecific)
+ matrixRow = matrix[queryPos];
+ else
+ matrixRow = matrix[query[queryPos]];
+ newScore = 0; /* boundary values used by the first column */
+ prevScoreNoGapMatchSeq = 0;
+ prevScoreGapMatchSeq = -(gapOpen);
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ /* testing scores with a gap in matchSeq, either starting a
+ * new gap or extending an existing gap; at this point newScore
+ * still holds the final score of the previous column in this
+ * row */
+ if ((newScore = newScore - newGapCost) >
+ (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
+ prevScoreGapMatchSeq = newScore;
+ /* testing scores with a gap in query, either starting a
+ * new gap or extending an existing gap; scoreVector still
+ * holds the values of the previous row for this column */
+ if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
+ (continueGapScore =
+ scoreVector[matchSeqPos].gapExists - gapExtend))
+ continueGapScore = newScore;
+ /* compute new score extending one position in matchSeq
+ * and query */
+ newScore =
+ prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
+ if (newScore < 0)
+ newScore = 0; /*Smith-Waterman locality condition*/
+ /*test two alternatives*/
+ if (newScore < prevScoreGapMatchSeq)
+ newScore = prevScoreGapMatchSeq;
+ if (newScore < continueGapScore)
+ newScore = continueGapScore;
+ prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; /* save old-row value before it is overwritten */
+ scoreVector[matchSeqPos].noGap = newScore;
+ scoreVector[matchSeqPos].gapExists = continueGapScore;
+ if (newScore > bestScore) { /* strict test: the first best cell wins */
+ bestScore = newScore;
+ bestQueryPos = queryPos;
+ bestMatchSeqPos = matchSeqPos;
+ }
+ }
+ }
+ free(scoreVector);
+ if (bestScore < 0) /* defensive only: bestScore starts at 0 and never decreases */
+ bestScore = 0;
+ *matchSeqEnd = bestMatchSeqPos;
+ *queryEnd = bestQueryPos;
+ *score = bestScore;
+
+ return 0;
+}
+
+
+/**
+ * Find the left-hand endpoints of the locally optimal Smith-Waterman
+ * alignment given the score and right-hand endpoints computed by
+ * BLbasicSmithWatermanScoreOnly.
+ *
+ * The same recurrence as the score-only routine is run backwards,
+ * starting at (queryEnd, matchSeqEnd) and moving toward position 0 of
+ * both sequences.  The scan stops as soon as the best score seen is
+ * at least score_in; if that never happens, the best cell found is
+ * reported.
+ *
+ * @param *score_out the score of the optimal alignment -- should
+ * equal score_in.
+ * @param *matchSeqStart the left-hand endpoint of the alignment in
+ * the database sequence
+ * @param *queryStart the left-hand endpoint of the alignment
+ * in the query sequence
+ * @param matchSeq the database sequence data
+ * @param matchSeqLength length of matchSeq; used here to size the
+ * work array
+ * @param query the query sequence data
+ * @param matrix amino-acid scoring matrix
+ * @param gapOpen penalty for opening a gap
+ * @param gapExtend penalty for extending a gap by one amino acid
+ * @param matchSeqEnd right-hand endpoint of the alignment in
+ * the database sequence
+ * @param queryEnd right-hand endpoint of the alignment in
+ * the query
+ * @param score_in the score of the alignment
+ * @param positionSpecific if nonzero, rows of matrix are indexed by
+ * query position; otherwise they are indexed
+ * by the residue found in query
+ *
+ * @return 0 on success, or -1 if the work array cannot be allocated,
+ * in which case the output parameters are not written
+ */
+static int
+BLSmithWatermanFindStart(int *score_out,
+ int *matchSeqStart, int *queryStart,
+ const Uint1 * matchSeq, int matchSeqLength,
+ const Uint1 *query,
+ int **matrix, int gapOpen, int gapExtend,
+ int matchSeqEnd, int queryEnd, int score_in,
+ int positionSpecific)
+{
+ int bestScore; /* best score seen so far*/
+ int newScore; /* score of next entry*/
+ int bestMatchSeqPos, bestQueryPos; /*position starting best score in
+ matchSeq and query sequences */
+ SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman
+ matrix; the old row is overwritten with the new row */
+ int *matrixRow; /* one row of score matrix */
+ int newGapCost; /* cost to have a gap of one character */
+ int prevScoreNoGapMatchSeq; /* score of the diagonal neighbour
+ processed in the previous row, with no gaps*/
+ int prevScoreGapMatchSeq; /* score if a gap already started in
+ matchSeq */
+ int continueGapScore; /* score for continuing a gap in query */
+ int matchSeqPos, queryPos; /* positions in matchSeq and query */
+
+ scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
+ if (scoreVector == NULL) {
+ return -1; /* out of memory; outputs are left untouched */
+ }
+ bestMatchSeqPos = 0;
+ bestQueryPos = 0;
+ bestScore = 0;
+ newGapCost = gapOpen + gapExtend;
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ scoreVector[matchSeqPos].noGap = 0;
+ scoreVector[matchSeqPos].gapExists = -(gapOpen);
+ }
+ for (queryPos = queryEnd; queryPos >= 0; queryPos--) { /* walk backwards from the known right end */
+ if (positionSpecific)
+ matrixRow = matrix[queryPos];
+ else
+ matrixRow = matrix[query[queryPos]];
+ newScore = 0; /* boundary values used by the first column visited */
+ prevScoreNoGapMatchSeq = 0;
+ prevScoreGapMatchSeq = -(gapOpen);
+ for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
+ /* testing scores with a gap in matchSeq, either starting
+ * a new gap or extending an existing gap; at this point
+ * newScore still holds the final score of the column visited
+ * just before this one */
+ if ((newScore = newScore - newGapCost) >
+ (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
+ prevScoreGapMatchSeq = newScore;
+ /* testing scores with a gap in query, either starting a
+ * new gap or extending an existing gap; scoreVector still
+ * holds the values of the previously processed row */
+ if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
+ (continueGapScore =
+ scoreVector[matchSeqPos].gapExists - gapExtend))
+ continueGapScore = newScore;
+ /* compute new score extending one position in matchSeq
+ * and query */
+ newScore =
+ prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
+ if (newScore < 0)
+ newScore = 0; /* Smith-Waterman locality condition */
+ /* test two alternatives */
+ if (newScore < prevScoreGapMatchSeq)
+ newScore = prevScoreGapMatchSeq;
+ if (newScore < continueGapScore)
+ newScore = continueGapScore;
+ prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; /* save old-row value before it is overwritten */
+ scoreVector[matchSeqPos].noGap = newScore;
+ scoreVector[matchSeqPos].gapExists = continueGapScore;
+ if (newScore > bestScore) {
+ bestScore = newScore;
+ bestQueryPos = queryPos;
+ bestMatchSeqPos = matchSeqPos;
+ }
+ if (bestScore >= score_in) /* target score reached: this cell is the start */
+ break;
+ }
+ if (bestScore >= score_in) /* propagate the early exit out of the outer loop */
+ break;
+ }
+ free(scoreVector);
+ if (bestScore < 0) /* defensive only: bestScore starts at 0 and never decreases */
+ bestScore = 0;
+ *matchSeqStart = bestMatchSeqPos;
+ *queryStart = bestQueryPos;
+ *score_out = bestScore;
+
+ return 0;
+}
+
+
+/**
+ * Compute the score and right-hand endpoints of the locally optimal
+ * Smith-Waterman alignment, subject to the restriction that some
+ * ranges are forbidden.
+ *
+ * The recurrence is the one used by BLbasicSmithWatermanScoreOnly,
+ * except that a cell whose database position lies in a forbidden
+ * range for the current query position receives COMPO_SCORE_MIN
+ * instead of a match score.
+ *
+ * @param *score the computed score
+ * @param *matchSeqEnd the right-hand end of the alignment in the
+ * database sequence
+ * @param *queryEnd the right-hand end of the alignment in the
+ * query sequence
+ * @param matchSeq the database sequence data
+ * @param matchSeqLength length of matchSeq
+ * @param query the query sequence data
+ * @param queryLength length of query
+ * @param matrix amino-acid scoring matrix
+ * @param gapOpen penalty for opening a gap
+ * @param gapExtend penalty for extending a gap by one amino acid
+ * @param numForbidden for each query position, the number of
+ * forbidden ranges [in]
+ * @param forbiddenRanges lists areas that should not be aligned;
+ * forbiddenRanges[q] holds numForbidden[q]
+ * (start, end) pairs of database positions,
+ * both ends inclusive [in]
+ * @param positionSpecific if nonzero, rows of matrix are indexed by
+ * query position; otherwise they are indexed
+ * by the residue found in query
+ *
+ * @return 0 on success, or -1 if the work array cannot be allocated,
+ * in which case the output parameters are not written
+ */
+static int
+BLspecialSmithWatermanScoreOnly(int *score, int *matchSeqEnd, int *queryEnd,
+ const Uint1 * matchSeq, int matchSeqLength,
+ const Uint1 *query, int queryLength,
+ int **matrix, int gapOpen, int gapExtend,
+ const int *numForbidden,
+ int ** forbiddenRanges,
+ int positionSpecific)
+{
+ int bestScore; /* best score seen so far */
+ int newScore; /* score of next entry*/
+ int bestMatchSeqPos, bestQueryPos; /*position ending best score in
+ matchSeq and query sequences */
+ SwGapInfo *scoreVector; /* keeps one row of the Smith-Waterman
+ matrix; the old row is overwritten with the new row */
+ int *matrixRow; /* one row of score matrix */
+ int newGapCost; /* cost to have a gap of one character */
+ int prevScoreNoGapMatchSeq; /* score one row and column up
+ with no gaps*/
+ int prevScoreGapMatchSeq; /* score if a gap already started in
+ matchSeq */
+ int continueGapScore; /* score for continuing a gap in query */
+ int matchSeqPos, queryPos; /* positions in matchSeq and query */
+ int forbidden; /* is this position forbidden? */
+ int f; /* index over forbidden positions */
+
+ scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
+ if (scoreVector == NULL) {
+ return -1; /* out of memory; outputs are left untouched */
+ }
+ bestMatchSeqPos = 0;
+ bestQueryPos = 0;
+ bestScore = 0;
+ newGapCost = gapOpen + gapExtend;
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ scoreVector[matchSeqPos].noGap = 0;
+ scoreVector[matchSeqPos].gapExists = -(gapOpen);
+ }
+ for (queryPos = 0; queryPos < queryLength; queryPos++) {
+ if (positionSpecific)
+ matrixRow = matrix[queryPos];
+ else
+ matrixRow = matrix[query[queryPos]];
+ newScore = 0; /* boundary values used by the first column */
+ prevScoreNoGapMatchSeq = 0;
+ prevScoreGapMatchSeq = -(gapOpen);
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ /* testing scores with a gap in matchSeq, either starting
+ * a new gap or extending an existing gap; at this point
+ * newScore still holds the final score of the previous column
+ * in this row */
+ if ((newScore = newScore - newGapCost) >
+ (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
+ prevScoreGapMatchSeq = newScore;
+ /* testing scores with a gap in query, either starting a
+ * new gap or extending an existing gap; scoreVector still
+ * holds the values of the previous row for this column */
+ if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
+ (continueGapScore =
+ scoreVector[matchSeqPos].gapExists - gapExtend))
+ continueGapScore = newScore;
+ /* compute new score extending one position in matchSeq
+ * and query, unless this cell falls in one of the forbidden
+ * ranges recorded for the current query position */
+ forbidden = FALSE;
+ for (f = 0; f < numForbidden[queryPos]; f++) {
+ if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
+ (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
+ forbidden = TRUE;
+ break;
+ }
+ }
+ if (forbidden)
+ newScore = COMPO_SCORE_MIN; /* presumably negative, so it is reset to 0 just below -- confirm in composition_constants.h */
+ else
+ newScore =
+ prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
+ if (newScore < 0)
+ newScore = 0; /* Smith-Waterman locality condition */
+ /* test two alternatives */
+ if (newScore < prevScoreGapMatchSeq)
+ newScore = prevScoreGapMatchSeq;
+ if (newScore < continueGapScore)
+ newScore = continueGapScore;
+ prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; /* save old-row value before it is overwritten */
+ scoreVector[matchSeqPos].noGap = newScore;
+ scoreVector[matchSeqPos].gapExists = continueGapScore;
+ if (newScore > bestScore) { /* strict test: the first best cell wins */
+ bestScore = newScore;
+ bestQueryPos = queryPos;
+ bestMatchSeqPos = matchSeqPos;
+ }
+ }
+ }
+ free(scoreVector);
+ if (bestScore < 0) /* defensive only: bestScore starts at 0 and never decreases */
+ bestScore = 0;
+ *matchSeqEnd = bestMatchSeqPos;
+ *queryEnd = bestQueryPos;
+ *score = bestScore;
+
+ return 0;
+}
+
+
+/**
+ * Find the left-hand endpoints of the locally optimal Smith-Waterman
+ * alignment, subject to the restriction that certain ranges may not
+ * be aligned, given the score and right-hand endpoints computed by
+ * BLspecialSmithWatermanScoreOnly.
+ *
+ * The recurrence is run backwards, starting at (queryEnd, matchSeqEnd)
+ * and moving toward position 0 of both sequences.  The scan stops as
+ * soon as the best score seen is at least score_in; if that never
+ * happens, the best cell found is reported.
+ *
+ * @param *score_out the score of the optimal alignment -- should
+ * equal score_in.
+ * @param *matchSeqStart the left-hand endpoint of the alignment in
+ * the database sequence
+ * @param *queryStart the left-hand endpoint of the alignment
+ * in the query sequence
+ * @param matchSeq the database sequence data
+ * @param matchSeqLength length of matchSeq; used here to size the
+ * work array
+ * @param query the query sequence data
+ * @param matrix amino-acid scoring matrix
+ * @param gapOpen penalty for opening a gap
+ * @param gapExtend penalty for extending a gap by one amino acid
+ * @param matchSeqEnd right-hand endpoint of the alignment in
+ * the database sequence
+ * @param queryEnd right-hand endpoint of the alignment in
+ * the query
+ * @param score_in the score of the alignment
+ * @param numForbidden for each query position, the number of
+ * forbidden ranges
+ * @param forbiddenRanges lists areas that should not be aligned;
+ * forbiddenRanges[q] holds numForbidden[q]
+ * (start, end) pairs of database positions,
+ * both ends inclusive
+ * @param positionSpecific if nonzero, rows of matrix are indexed by
+ * query position; otherwise they are indexed
+ * by the residue found in query
+ *
+ * @return 0 on success, or -1 if the work array cannot be allocated,
+ * in which case the output parameters are not written
+ */
+static int
+BLspecialSmithWatermanFindStart(int * score_out,
+ int *matchSeqStart, int *queryStart,
+ const Uint1 * matchSeq, int matchSeqLength,
+ const Uint1 *query, int **matrix,
+ int gapOpen, int gapExtend, int matchSeqEnd,
+ int queryEnd, int score_in,
+ const int *numForbidden,
+ int ** forbiddenRanges,
+ int positionSpecific)
+{
+ int bestScore; /* best score seen so far */
+ int newScore; /* score of next entry */
+ int bestMatchSeqPos, bestQueryPos; /* position starting best score in
+ matchSeq and query sequences */
+ SwGapInfo *scoreVector; /* keeps one row of the
+ Smith-Waterman matrix; overwrite
+ old row with new row*/
+ int *matrixRow; /* one row of score matrix */
+ int newGapCost; /* cost to have a gap of one character */
+ int prevScoreNoGapMatchSeq; /* score of the diagonal neighbour
+ processed in the previous row, with no gaps*/
+ int prevScoreGapMatchSeq; /* score if a gap already started in
+ matchSeq */
+ int continueGapScore; /* score for continuing a gap in query */
+ int matchSeqPos, queryPos; /* positions in matchSeq and query */
+ int forbidden; /* is this position forbidden? */
+ int f; /* index over forbidden positions */
+
+ scoreVector = (SwGapInfo *) malloc(matchSeqLength * sizeof(SwGapInfo));
+ if (scoreVector == NULL) {
+ return -1; /* out of memory; outputs are left untouched */
+ }
+ bestMatchSeqPos = 0;
+ bestQueryPos = 0;
+ bestScore = 0;
+ newGapCost = gapOpen + gapExtend;
+ for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
+ scoreVector[matchSeqPos].noGap = 0;
+ scoreVector[matchSeqPos].gapExists = -(gapOpen);
+ }
+ for (queryPos = queryEnd; queryPos >= 0; queryPos--) { /* walk backwards from the known right end */
+ if (positionSpecific)
+ matrixRow = matrix[queryPos];
+ else
+ matrixRow = matrix[query[queryPos]];
+ newScore = 0; /* boundary values used by the first column visited */
+ prevScoreNoGapMatchSeq = 0;
+ prevScoreGapMatchSeq = -(gapOpen);
+ for (matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
+ /* testing scores with a gap in matchSeq, either starting a
+ * new gap or extending an existing gap; at this point newScore
+ * still holds the final score of the column visited just
+ * before this one */
+ if ((newScore = newScore - newGapCost) >
+ (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
+ prevScoreGapMatchSeq = newScore;
+ /* testing scores with a gap in query, either starting a
+ * new gap or extending an existing gap; scoreVector still
+ * holds the values of the previously processed row */
+ if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
+ (continueGapScore =
+ scoreVector[matchSeqPos].gapExists - gapExtend))
+ continueGapScore = newScore;
+ /* compute new score extending one position in matchSeq
+ * and query, unless this cell falls in one of the forbidden
+ * ranges recorded for the current query position */
+ forbidden = FALSE;
+ for (f = 0; f < numForbidden[queryPos]; f++) {
+ if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
+ (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
+ forbidden = TRUE;
+ break;
+ }
+ }
+ if (forbidden)
+ newScore = COMPO_SCORE_MIN; /* presumably negative, so it is reset to 0 just below -- confirm in composition_constants.h */
+ else
+ newScore =
+ prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
+ if (newScore < 0)
+ newScore = 0; /* Smith-Waterman locality condition */
+ /* test two alternatives */
+ if (newScore < prevScoreGapMatchSeq)
+ newScore = prevScoreGapMatchSeq;
+ if (newScore < continueGapScore)
+ newScore = continueGapScore;
+ prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap; /* save old-row value before it is overwritten */
+ scoreVector[matchSeqPos].noGap = newScore;
+ scoreVector[matchSeqPos].gapExists = continueGapScore;
+ if (newScore > bestScore) {
+ bestScore = newScore;
+ bestQueryPos = queryPos;
+ bestMatchSeqPos = matchSeqPos;
+ }
+ if (bestScore >= score_in) /* target score reached: this cell is the start */
+ break;
+ }
+ if (bestScore >= score_in) /* propagate the early exit out of the outer loop */
+ break;
+ }
+ free(scoreVector);
+ if (bestScore < 0) /* defensive only: bestScore starts at 0 and never decreases */
+ bestScore = 0;
+ *matchSeqStart = bestMatchSeqPos;
+ *queryStart = bestQueryPos;
+ *score_out = bestScore;
+
+ return 0;
+}
+
+
+/**
+ * Free the arrays owned by self and reset the corresponding pointers
+ * to NULL.  The structure self itself is not freed, and its capacity
+ * and isEmpty fields are left as they were.
+ *
+ * @param self an instance of Blast_ForbiddenRanges [in][out]
+ */
+void
+Blast_ForbiddenRangesRelease(Blast_ForbiddenRanges * self)
+{
+    int **rows = self->ranges;
+
+    if (rows != NULL) {
+        /* one list per slot; free(NULL) is harmless for slots that
+         * were never filled in */
+        int idx = 0;
+        while (idx < self->capacity) {
+            free(rows[idx]);
+            idx++;
+        }
+    }
+    free(rows);
+    self->ranges = NULL;
+
+    free(self->numForbidden);
+    self->numForbidden = NULL;
+}
+
+
+/**
+ * Set up self as an empty Blast_ForbiddenRanges with room for one
+ * range at each of capacity positions.
+ *
+ * @param self object to be initialized
+ * @param capacity the number of positions for which ranges may be
+ *                 stored (must be at least as long as the length
+ *                 of the query)
+ *
+ * @return 0 on success; -1 if memory cannot be obtained, in which
+ *         case everything allocated so far has been released
+ */
+int
+Blast_ForbiddenRangesInitialize(Blast_ForbiddenRanges * self,
+                                int capacity)
+{
+    int status = -1;           /* pessimistic until every list exists */
+
+    self->capacity = capacity;
+    self->isEmpty = TRUE;
+    self->ranges = NULL;
+    self->numForbidden = (int *) calloc(capacity, sizeof(int));
+
+    if (self->numForbidden != NULL) {
+        self->ranges = (int **) calloc(capacity, sizeof(int *));
+        if (self->ranges != NULL) {
+            int slot;
+            int complete = 1;
+
+            for (slot = 0; slot < capacity; slot++) {
+                int *pair;
+
+                self->numForbidden[slot] = 0;
+                pair = (int *) malloc(2 * sizeof(int));
+                self->ranges[slot] = pair;
+                if (pair == NULL) {
+                    complete = 0;
+                    break;
+                }
+                pair[0] = 0;
+                pair[1] = 0;
+            }
+            if (complete)
+                status = 0;
+        }
+    }
+    if (status != 0) {
+        /* undo the partial construction */
+        Blast_ForbiddenRangesRelease(self);
+    }
+    return status;
+}
+
+
+/**
+ * Reset self to be empty: every per-position count becomes zero and
+ * the isEmpty flag is raised.  The range lists themselves are kept
+ * so that they can be reused.
+ */
+void
+Blast_ForbiddenRangesClear(Blast_ForbiddenRanges * self)
+{
+    int slot = self->capacity;
+
+    while (slot-- > 0) {
+        self->numForbidden[slot] = 0;
+    }
+    self->isEmpty = TRUE;
+}
+
+
+/**
+ * Add a forbidden range of subject positions to a run of query
+ * positions.
+ *
+ * For every query position f with queryStart <= f < queryEnd, the pair
+ * (matchStart, matchEnd) is appended to the list kept for f and the
+ * count for f is incremented.
+ *
+ * @param self        an instance of Blast_ForbiddenRanges [in][out]
+ * @param queryStart  first query position to which the range applies
+ * @param queryEnd    one past the last query position to which the
+ *                    range applies
+ * @param matchStart  first forbidden position in the subject sequence
+ * @param matchEnd    last forbidden position in the subject sequence
+ *                    (the Smith-Waterman routines in this file test
+ *                    both ends inclusively)
+ *
+ * @return 0 on success.  -1 if a non-empty [queryStart, queryEnd) does
+ *         not lie within the capacity of self, in which case nothing
+ *         is changed; or -1 if memory cannot be obtained, in which
+ *         case the positions before the failing one keep the new range.
+ */
+int
+Blast_ForbiddenRangesPush(Blast_ForbiddenRanges * self,
+                          int queryStart,
+                          int queryEnd,
+                          int matchStart,
+                          int matchEnd)
+{
+    int f;
+
+    /* The per-position arrays have exactly self->capacity entries;
+     * refuse an interval that would index outside of them. */
+    if (queryStart < queryEnd &&
+        (queryStart < 0 || queryEnd > self->capacity)) {
+        return -1;
+    }
+    for (f = queryStart; f < queryEnd; f++) {
+        int last = 2 * self->numForbidden[f];
+        if (0 != last) {
+            /* every list has room for one pair from the start, so it
+             * must be resized only from the second pair on */
+            int * new_ranges =
+                (int *) realloc(self->ranges[f], (last + 2) * sizeof(int));
+            if (new_ranges == NULL)
+                return -1;
+            self->ranges[f] = new_ranges;
+        }
+        self->ranges[f][last]     = matchStart;
+        self->ranges[f][last + 1] = matchEnd;
+
+        self->numForbidden[f]++;
+    }
+    self->isEmpty = FALSE;
+
+    return 0;
+}
+
+
+/**
+ * Score-only Smith-Waterman entry point.  When forbiddenRanges is
+ * flagged as empty the work is done by BLbasicSmithWatermanScoreOnly;
+ * otherwise BLspecialSmithWatermanScoreOnly is used so that the
+ * forbidden ranges are honored.  See BLspecialSmithWatermanScoreOnly
+ * for the meaning of the parameters to this routine; the return value
+ * is that of the routine called.
+ */
+int
+Blast_SmithWatermanScoreOnly(int *score,
+                             int *matchSeqEnd, int *queryEnd,
+                             const Uint1 * subject_data, int subject_length,
+                             const Uint1 * query_data, int query_length,
+                             int **matrix,
+                             int gapOpen,
+                             int gapExtend,
+                             int positionSpecific,
+                             const Blast_ForbiddenRanges * forbiddenRanges )
+{
+    if (!forbiddenRanges->isEmpty) {
+        /* at least one range was pushed: use the restricted scorer */
+        return BLspecialSmithWatermanScoreOnly(
+                   score, matchSeqEnd, queryEnd,
+                   subject_data, subject_length,
+                   query_data, query_length,
+                   matrix, gapOpen, gapExtend,
+                   forbiddenRanges->numForbidden,
+                   forbiddenRanges->ranges,
+                   positionSpecific);
+    }
+    return BLbasicSmithWatermanScoreOnly(
+               score, matchSeqEnd, queryEnd,
+               subject_data, subject_length,
+               query_data, query_length,
+               matrix, gapOpen, gapExtend,
+               positionSpecific);
+}
+
+
+/**
+ * Entry point for locating the start of a Smith-Waterman alignment.
+ * When forbiddenRanges is flagged as empty the work is done by
+ * BLSmithWatermanFindStart; otherwise BLspecialSmithWatermanFindStart
+ * is used so that the forbidden ranges are honored.  See
+ * BLspecialSmithWatermanFindStart for the meaning of the parameters
+ * to this routine; the return value is that of the routine called.
+ */
+int
+Blast_SmithWatermanFindStart(int * score_out,
+                             int *matchSeqStart,
+                             int *queryStart,
+                             const Uint1 * subject_data, int subject_length,
+                             const Uint1 * query_data,
+                             int **matrix,
+                             int gapOpen,
+                             int gapExtend,
+                             int matchSeqEnd,
+                             int queryEnd,
+                             int score_in,
+                             int positionSpecific,
+                             const Blast_ForbiddenRanges * forbiddenRanges)
+{
+    if (!forbiddenRanges->isEmpty) {
+        /* at least one range was pushed: use the restricted search */
+        return BLspecialSmithWatermanFindStart(
+                   score_out, matchSeqStart, queryStart,
+                   subject_data, subject_length,
+                   query_data, matrix,
+                   gapOpen, gapExtend,
+                   matchSeqEnd, queryEnd, score_in,
+                   forbiddenRanges->numForbidden,
+                   forbiddenRanges->ranges,
+                   positionSpecific);
+    }
+    return BLSmithWatermanFindStart(
+               score_out, matchSeqStart, queryStart,
+               subject_data, subject_length,
+               query_data, matrix,
+               gapOpen, gapExtend,
+               matchSeqEnd, queryEnd,
+               score_in, positionSpecific);
+}
diff --git a/algo/blast/composition_adjustment/smith_waterman.h b/algo/blast/composition_adjustment/smith_waterman.h
new file mode 100644
index 00000000..b206c4bd
--- /dev/null
+++ b/algo/blast/composition_adjustment/smith_waterman.h
@@ -0,0 +1,103 @@
+/* $Id: smith_waterman.h,v 1.1 2005/12/01 13:52:20 gertz Exp $
+ * ===========================================================================
+ *
+ * PUBLIC DOMAIN NOTICE
+ * National Center for Biotechnology Information
+ *
+ * This software/database is a "United States Government Work" under the
+ * terms of the United States Copyright Act. It was written as part of
+ * the author's official duties as a United States Government employee and
+ * thus cannot be copyrighted. This software/database is freely available
+ * to the public for use. The National Library of Medicine and the U.S.
+ * Government have not placed any restriction on its use or reproduction.
+ *
+ * Although all reasonable efforts have been taken to ensure the accuracy
+ * and reliability of the software and data, the NLM and the U.S.
+ * Government do not and cannot warrant the performance or results that
+ * may be obtained by using this software or data. The NLM and the U.S.
+ * Government disclaim all warranties, express or implied, including
+ * warranties of performance, merchantability or fitness for any particular
+ * purpose.
+ *
+ * Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================*/
+/**
+ * @file smith_waterman.h
+ * @author Alejandro Schaffer, E. Michael Gertz
+ *
+ * Definitions for computing Smith-Waterman alignments
+ */
+#ifndef __SMITH_WATERMAN__
+#define __SMITH_WATERMAN__
+
+#include <algo/blast/core/blast_export.h>
+#include <algo/blast/core/ncbi_std.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * An instance of Blast_ForbiddenRanges is used by the Smith-Waterman
+ * algorithm to represent ranges in the database that are not to be
+ * aligned.  The two arrays are indexed by query position; the ranges
+ * they describe are positions in the database sequence.
+ */
+typedef struct Blast_ForbiddenRanges {
+ int isEmpty; /**< True if there are no forbidden ranges */
+ int *numForbidden; /**< how many forbidden ranges at each
+ query position */
+ int **ranges; /**< forbidden ranges for each query
+ position, stored as consecutive
+ (start, end) pairs */
+ int capacity; /**< number of entries in numForbidden and
+ ranges; must be at least the length of
+ the query sequence */
+} Blast_ForbiddenRanges;
+
+NCBI_XBLAST_EXPORT
+int Blast_ForbiddenRangesInitialize(Blast_ForbiddenRanges * self,
+ int capacity);
+
+NCBI_XBLAST_EXPORT
+void Blast_ForbiddenRangesClear(Blast_ForbiddenRanges * self);
+
+NCBI_XBLAST_EXPORT
+int Blast_ForbiddenRangesPush(Blast_ForbiddenRanges * self,
+ int queryStart, int queryEnd,
+ int matchStart, int matchEnd);
+
+NCBI_XBLAST_EXPORT
+void Blast_ForbiddenRangesRelease(Blast_ForbiddenRanges * self);
+
+NCBI_XBLAST_EXPORT
+int Blast_SmithWatermanFindStart(int * score_out,
+ int *matchSeqStart,
+ int *queryStart,
+ const Uint1 * subject_data,
+ int subject_length,
+ const Uint1 * query_data,
+ int **matrix,
+ int gapOpen,
+ int gapExtend,
+ int matchSeqEnd,
+ int queryEnd,
+ int score_in,
+ int positionSpecific,
+ const Blast_ForbiddenRanges *
+ forbiddenRanges);
+
+NCBI_XBLAST_EXPORT
+int Blast_SmithWatermanScoreOnly(int *score,
+ int *matchSeqEnd, int *queryEnd,
+ const Uint1 * subject_data,
+ int subject_length,
+ const Uint1 * query_data,
+ int query_length, int **matrix,
+ int gapOpen, int gapExtend,
+ int positionSpecific,
+ const Blast_ForbiddenRanges *
+ forbiddenRanges);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/algo/blast/core/aa_ungapped.c b/algo/blast/core/aa_ungapped.c
index f7544f3a..e1514540 100644
--- a/algo/blast/core/aa_ungapped.c
+++ b/algo/blast/core/aa_ungapped.c
@@ -1,4 +1,4 @@
-/* $Id: aa_ungapped.c,v 1.44 2005/04/06 13:42:01 camacho Exp $
+/* $Id: aa_ungapped.c,v 1.45 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -30,7 +30,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: aa_ungapped.c,v 1.44 2005/04/06 13:42:01 camacho Exp $";
+ "$Id: aa_ungapped.c,v 1.45 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/aa_ungapped.h>
diff --git a/algo/blast/core/aa_ungapped.h b/algo/blast/core/aa_ungapped.h
index a3803d27..20580d57 100644
--- a/algo/blast/core/aa_ungapped.h
+++ b/algo/blast/core/aa_ungapped.h
@@ -1,4 +1,4 @@
-/* $Id: aa_ungapped.h,v 1.23 2005/03/28 21:22:50 dondosha Exp $
+/* $Id: aa_ungapped.h,v 1.24 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_def.h b/algo/blast/core/blast_def.h
index 5a2f322d..b7bfebba 100644
--- a/algo/blast/core/blast_def.h
+++ b/algo/blast/core/blast_def.h
@@ -1,4 +1,4 @@
-/* $Id: blast_def.h,v 1.61 2005/06/27 17:58:05 camacho Exp $
+/* $Id: blast_def.h,v 1.64 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -122,24 +121,29 @@ typedef struct BlastSeqLoc {
/** Structure for keeping the query masking information */
typedef struct BlastMaskLoc {
- Int4 total_size; /**< Total size of the BlastSeqLoc array below. Inside the
- engine equal to number of contexts in the BlastQueryInfo
- structure. For lower case mask in a translated search,
- total size is at first equal to number of query
- sequences, but then expanded to number of contexts
- (total number of translated frames), i.e. 6 times number
- of queries. */
- BlastSeqLoc** seqloc_array; /**< array of mask locations. */
+ /** Total size of the BlastSeqLoc array below. This is always the number
+ of queries times the number of contexts. Note that in the case of
+ translated query searches, these locations must be provided in protein
+ coordinates to BLAST_MainSetUp.
+ @sa BLAST_GetNumberOfContexts
+ @sa BlastMaskLocDNAToProtein
+ */
+ Int4 total_size;
+
+ /** Array of masked locations.
+ Every query is allocated the number of contexts associated with the
+ program. In the case of nucleotide searches, the strand(s) to search
+ dictate which elements of the array for a given query are filled. For
+ translated searches, this should also be the same (by design) but the
+ C toolkit API does NOT implement this, it rather fills all elements
+ for all queries with masked locations in protein coordinates (if any).
+ The C++ API does follow the convention which populates each element, only
+ if so dictated by the strand(s) to search for each query.
+ @sa BLAST_GetNumberOfContexts
+ */
+ BlastSeqLoc** seqloc_array;
} BlastMaskLoc;
-
-/** Encapsulates masking/filtering information. */
-typedef struct BlastMaskInformation {
- BlastMaskLoc* filter_slp; /**< masking locations. */
- Boolean mask_at_hash; /**< if TRUE masking used only for building lookup table. */
-} BlastMaskInformation;
-
-
/** Structure to hold a sequence. */
typedef struct BLAST_SequenceBlk {
Uint1* sequence; /**< Sequence used for search (could be translation). */
diff --git a/algo/blast/core/blast_diagnostics.c b/algo/blast/core/blast_diagnostics.c
index 496dee14..cec16621 100644
--- a/algo/blast/core/blast_diagnostics.c
+++ b/algo/blast/core/blast_diagnostics.c
@@ -1,4 +1,4 @@
-/* $Id: blast_diagnostics.c,v 1.7 2005/06/09 17:06:14 dondosha Exp $
+/* $Id: blast_diagnostics.c,v 1.8 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_diagnostics.c,v 1.7 2005/06/09 17:06:14 dondosha Exp $";
+ "$Id: blast_diagnostics.c,v 1.8 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_diagnostics.h>
diff --git a/algo/blast/core/blast_diagnostics.h b/algo/blast/core/blast_diagnostics.h
index 997cc579..3f6cf53c 100644
--- a/algo/blast/core/blast_diagnostics.h
+++ b/algo/blast/core/blast_diagnostics.h
@@ -1,4 +1,4 @@
-/* $Id: blast_diagnostics.h,v 1.8 2005/01/24 14:23:05 camacho Exp $
+/* $Id: blast_diagnostics.h,v 1.9 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
diff --git a/algo/blast/core/blast_dust.c b/algo/blast/core/blast_dust.c
index d9c8788e..7d311f18 100644
--- a/algo/blast/core/blast_dust.c
+++ b/algo/blast/core/blast_dust.c
@@ -1,4 +1,4 @@
-/* $Id: blast_dust.c,v 1.35 2005/07/21 13:52:38 camacho Exp $
+/* $Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -37,7 +37,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_dust.c,v 1.35 2005/07/21 13:52:38 camacho Exp $";
+ "$Id: blast_dust.c,v 1.37 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_dust.h>
@@ -331,6 +331,7 @@ dust_triplet_find (Uint1* seq_start, Int4 icur, Int4 max, Uint1* s1)
static Int2
GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg)
{
+ BlastSeqLoc* tail = NULL; /* pointer to tail of loc linked list */
if (!loc)
return -1;
@@ -341,7 +342,9 @@ GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg)
if (nreg > 0) {
Int4 i;
for (i = 0; reg && i < nreg; i++) {
- BlastSeqLocNew(loc, reg->from, reg->to);
+ /* Cache the tail of the list to avoid the overhead of traversing the
+ * list when appending to it */
+ tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to);
reg = reg->next;
}
}
diff --git a/algo/blast/core/blast_dust.h b/algo/blast/core/blast_dust.h
index b2d82570..cab3dca2 100644
--- a/algo/blast/core/blast_dust.h
+++ b/algo/blast/core/blast_dust.h
@@ -1,4 +1,4 @@
-/* $Id: blast_dust.h,v 1.13 2004/08/10 14:52:00 ivanov Exp $
+/* $Id: blast_dust.h,v 1.14 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_engine.c b/algo/blast/core/blast_engine.c
index ae74288c..c3825ab1 100644
--- a/algo/blast/core/blast_engine.c
+++ b/algo/blast/core/blast_engine.c
@@ -1,4 +1,4 @@
-/* $Id: blast_engine.c,v 1.198 2005/08/15 16:11:20 dondosha Exp $
+/* $Id: blast_engine.c,v 1.203 2005/11/22 13:44:13 coulouri Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,8 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
- *
*/
/** @file blast_engine.c
@@ -57,7 +55,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_engine.c,v 1.198 2005/08/15 16:11:20 dondosha Exp $";
+ "$Id: blast_engine.c,v 1.203 2005/11/22 13:44:13 coulouri Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_engine.h>
@@ -75,8 +73,8 @@ static char const rcsid[] =
NCBI_XBLAST_EXPORT const int kBlastMajorVersion = 2;
NCBI_XBLAST_EXPORT const int kBlastMinorVersion = 2;
-NCBI_XBLAST_EXPORT const int kBlastPatchVersion = 12;
-NCBI_XBLAST_EXPORT const char* kBlastReleaseDate = "Aug-07-2005";
+NCBI_XBLAST_EXPORT const int kBlastPatchVersion = 13;
+NCBI_XBLAST_EXPORT const char* kBlastReleaseDate = "Nov-27-2005";
/** Structure to be passed to s_BlastSearchEngineCore, containing pointers
to various preallocated structures and arrays. */
@@ -385,10 +383,7 @@ s_BlastSearchEngineCore(EBlastProgramType program_number, BLAST_SequenceBlk* que
/* For nucleotide search, if match score is = 2, the odd scores
are rounded down to the nearest even number. */
- if (program_number == eBlastTypeBlastn &&
- score_params->options->reward == 2) {
- Blast_HSPListAdjustOddBlastnScores(hsp_list);
- }
+ Blast_HSPListAdjustOddBlastnScores(hsp_list, score_options->gapped_calculation, gap_align->sbp);
Blast_HSPListSortByScore(hsp_list);
@@ -804,7 +799,6 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number,
seq_arg.seq, lookup_wrap, gap_align, score_params, word_params,
ext_params, hit_params, db_options, diagnostics, aux_struct,
&hsp_list);
-
if (status)
break;
@@ -844,7 +838,9 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number,
}
/* Save the results. */
- BlastHSPStreamWrite(hsp_stream, &hsp_list);
+ status = BlastHSPStreamWrite(hsp_stream, &hsp_list);
+ if (status != 0)
+ break;
}
BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
@@ -866,15 +862,20 @@ BLAST_PreliminarySearchEngine(EBlastProgramType program_number,
Int2
Blast_RunPreliminarySearch(EBlastProgramType program,
- BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
- const BlastSeqSrc* seq_src, const BlastScoringOptions* score_options,
- BlastScoreBlk* sbp, LookupTableWrap* lookup_wrap,
+ BLAST_SequenceBlk* query,
+ BlastQueryInfo* query_info,
+ const BlastSeqSrc* seq_src,
+ const BlastScoringOptions* score_options,
+ BlastScoreBlk* sbp,
+ LookupTableWrap* lookup_wrap,
const BlastInitialWordOptions* word_options,
const BlastExtensionOptions* ext_options,
const BlastHitSavingOptions* hit_options,
const BlastEffectiveLengthsOptions* eff_len_options,
- const PSIBlastOptions* psi_options, const BlastDatabaseOptions* db_options,
- BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics)
+ const PSIBlastOptions* psi_options,
+ const BlastDatabaseOptions* db_options,
+ BlastHSPStream* hsp_stream,
+ BlastDiagnostics* diagnostics)
{
Int2 status = 0;
BlastScoringParameters* score_params = NULL;/**< Scoring parameters */
diff --git a/algo/blast/core/blast_engine.h b/algo/blast/core/blast_engine.h
index d08f94fd..7e59d99d 100644
--- a/algo/blast/core/blast_engine.h
+++ b/algo/blast/core/blast_engine.h
@@ -1,4 +1,4 @@
-/* $Id: blast_engine.h,v 1.51 2005/06/28 12:29:24 ivanov Exp $
+/* $Id: blast_engine.h,v 1.52 2005/08/31 17:36:28 jcherry Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -93,38 +93,6 @@ Blast_RunFullSearch(EBlastProgramType program_number,
BlastHSPStream* hsp_stream, const BlastRPSInfo* rps_info,
BlastDiagnostics* diagnostics, BlastHSPResults** results);
-/** The high level function performing an RPS BLAST search
- * @param program_number Type of BLAST program [in]
- * @param query The query sequence [in]
- * @param query_info Additional query information [in]
- * @param seq_src Structure containing BLAST database [in]
- * @param sbp Scoring and statistical parameters [in]
- * @param score_options Hit scoring options [in]
- * @param lookup_wrap The lookup table, constructed earlier [in]
- * @param word_options Options for processing initial word hits [in]
- * @param ext_options Options and parameters for the gapped extension [in]
- * @param hit_options Options for saving the HSPs [in]
- * @param eff_len_options Options for setting effective lengths [in]
- * @param psi_options Options specific to PSI-BLAST [in]
- * @param hsp_stream Placeholder for saving results [in]
- * @param diagnostics Return statistics containing numbers of hits on
- * different stages of the search [out]
- * @param results Structure holding all saved results [in] [out]
- */
-Int4
-BLAST_RPSSearchEngine(EBlastProgramType program_number,
- BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
- const BlastSeqSrc* seq_src, BlastScoreBlk* sbp,
- const BlastScoringOptions* score_options,
- LookupTableWrap* lookup_wrap,
- const BlastInitialWordOptions* word_options,
- const BlastExtensionOptions* ext_options,
- const BlastHitSavingOptions* hit_options,
- const BlastEffectiveLengthsOptions* eff_len_options,
- const PSIBlastOptions* psi_options,
- BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics,
- BlastHSPResults** results);
-
/** Perform the preliminary stage of the BLAST search.
* @param program_number Type of BLAST program [in]
* @param query The query sequence [in]
diff --git a/algo/blast/core/blast_extend.c b/algo/blast/core/blast_extend.c
index ec45fe4f..51eec8a4 100644
--- a/algo/blast/core/blast_extend.c
+++ b/algo/blast/core/blast_extend.c
@@ -1,4 +1,4 @@
-/* $Id: blast_extend.c,v 1.87 2005/06/23 19:06:04 madden Exp $
+/* $Id: blast_extend.c,v 1.90 2005/12/05 16:36:50 papadopo Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,6 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,8 +22,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
- *
*/
/** @file blast_extend.c
@@ -33,7 +30,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_extend.c,v 1.87 2005/06/23 19:06:04 madden Exp $";
+ "$Id: blast_extend.c,v 1.90 2005/12/05 16:36:50 papadopo Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_extend.h>
@@ -89,6 +86,7 @@ score_compare_match(const void* v1, const void* v2)
{
BlastInitHSP* h1,* h2;
+ int result = 0;
h1 = (BlastInitHSP*) v1;
h2 = (BlastInitHSP*) v2;
@@ -102,32 +100,19 @@ score_compare_match(const void* v1, const void* v2)
else if (h2->ungapped_data == NULL)
return -1;
- if (h1->ungapped_data->score < h2->ungapped_data->score)
- return 1;
- if (h1->ungapped_data->score > h2->ungapped_data->score)
- return -1;
-
-
- /* Tie breaks: starting offset in subject; then length
- * (equivalent to ending offset in subject), then starting
- * offset in query.
- */
- if (h1->ungapped_data->s_start < h2->ungapped_data->s_start)
- return 1;
- if (h1->ungapped_data->s_start > h2->ungapped_data->s_start )
- return -1;
-
- if (h1->ungapped_data->length < h2->ungapped_data->length)
- return 1;
- if (h1->ungapped_data->length > h2->ungapped_data->length)
- return -1;
+ if (0 == (result = BLAST_CMP(h2->ungapped_data->score,
+ h1->ungapped_data->score)) &&
+ 0 == (result = BLAST_CMP(h1->ungapped_data->s_start,
+ h2->ungapped_data->s_start)) &&
+ 0 == (result = BLAST_CMP(h2->ungapped_data->length,
+ h1->ungapped_data->length)) &&
+ 0 == (result = BLAST_CMP(h1->ungapped_data->q_start,
+ h2->ungapped_data->q_start))) {
+ result = BLAST_CMP(h2->ungapped_data->length,
+ h1->ungapped_data->length);
+ }
- if( h1->ungapped_data->q_start < h2->ungapped_data->q_start )
- return 1;
- if( h1->ungapped_data->q_start > h2->ungapped_data->q_start )
- return -1;
-
- return 0;
+ return result;
}
void Blast_InitHitListSortByScore(BlastInitHitList* init_hitlist)
@@ -296,7 +281,7 @@ static Int2
s_NuclUngappedExtend(BLAST_SequenceBlk* query,
BLAST_SequenceBlk* subject, Int4** matrix,
Int4 q_off, Int4 s_off, Int4 X,
- BlastUngappedData** ungapped_data)
+ BlastUngappedData* ungapped_data)
{
Uint1* q;
Int4 sum, score;
@@ -343,14 +328,8 @@ s_NuclUngappedExtend(BLAST_SequenceBlk* query,
}
}
- if (ungapped_data) {
- if ((*ungapped_data = (BlastUngappedData*)
- malloc(sizeof(BlastUngappedData))) == NULL)
- return -1;
- (*ungapped_data)->q_start = q_beg - query->sequence;
- (*ungapped_data)->s_start =
- s_off - (q_off - (*ungapped_data)->q_start);
- }
+ ungapped_data->q_start = q_beg - query->sequence;
+ ungapped_data->s_start = s_off - (q_off - ungapped_data->q_start);
if (q_avail < s_avail) {
sf = subject0 + (s_off + q_avail)/COMPRESSION_RATIO;
@@ -381,10 +360,8 @@ s_NuclUngappedExtend(BLAST_SequenceBlk* query,
base--;
}
- if (ungapped_data) {
- (*ungapped_data)->length = q_end - q_beg;
- (*ungapped_data)->score = score;
- }
+ ungapped_data->length = q_end - q_beg;
+ ungapped_data->score = score;
return 0;
}
@@ -428,6 +405,7 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query,
Int4 diag, real_diag;
Int4 s_pos;
BlastUngappedData* ungapped_data;
+ BlastUngappedData dummy_ungapped_data;
Int4 window_size = word_params->options->window_size;
Boolean hit_ready;
Boolean new_hit = FALSE, second_hit = FALSE;
@@ -465,8 +443,9 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query,
if (hit_ready) {
if (word_params->options->ungapped_extension) {
/* Perform ungapped extension */
+ ungapped_data = &dummy_ungapped_data;
s_NuclUngappedExtend(query, subject, matrix, q_off, s_off,
- -word_params->x_dropoff, &ungapped_data);
+ -word_params->x_dropoff, ungapped_data);
last_hit = ungapped_data->length + ungapped_data->s_start
+ diag_table->offset;
@@ -474,13 +453,18 @@ s_BlastnDiagExtendInitialHit(BLAST_SequenceBlk* query,
ungapped_data = NULL;
last_hit = s_pos;
}
- if (!ungapped_data ||
- ungapped_data->score >= word_params->cutoff_score) {
+ if (ungapped_data == NULL) {
BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data);
/* Set the "saved" flag for this hit */
hit_saved = ~LAST_HIT_MASK;
+ } else if (ungapped_data->score >= word_params->cutoff_score) {
+ BlastUngappedData *final_data = (BlastUngappedData *)malloc(
+ sizeof(BlastUngappedData));
+ *final_data = *ungapped_data;
+ BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data);
+ /* Set the "saved" flag for this hit */
+ hit_saved = ~LAST_HIT_MASK;
} else {
- sfree(ungapped_data);
/* Unset the "saved" flag for this hit */
hit_saved = 0;
}
@@ -567,6 +551,7 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query,
Int4 stack_top;
Int4 window_size;
Boolean hit_ready = FALSE, two_hits;
+ BlastUngappedData dummy_ungapped_data;
BlastUngappedData* ungapped_data = NULL;
window_size = word_params->options->window_size;
@@ -607,21 +592,27 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query,
if (hit_ready) {
if (word_params->options->ungapped_extension) {
/* Perform ungapped extension */
+ ungapped_data = &dummy_ungapped_data;
s_NuclUngappedExtend(query, subject, matrix, q_off, s_off,
- -word_params->x_dropoff, &ungapped_data);
+ -word_params->x_dropoff, ungapped_data);
last_hit = ungapped_data->length + ungapped_data->s_start;
} else {
ungapped_data = NULL;
last_hit = s_end;
}
- if (!ungapped_data ||
- ungapped_data->score >= word_params->cutoff_score) {
+ if (ungapped_data == NULL) {
BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data);
/* Set the "saved" flag for this hit */
hit_saved = ~LAST_HIT_MASK;
+ } else if (ungapped_data->score >= word_params->cutoff_score) {
+ BlastUngappedData *final_data = (BlastUngappedData *)malloc(
+ sizeof(BlastUngappedData));
+ *final_data = *ungapped_data;
+ BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data);
+ /* Set the "saved" flag for this hit */
+ hit_saved = ~LAST_HIT_MASK;
} else {
- sfree(ungapped_data);
/* Unset the "saved" flag for this hit */
hit_saved = 0;
}
@@ -669,20 +660,24 @@ s_BlastnStacksExtendInitialHit(BLAST_SequenceBlk* query,
hit_ready = TRUE;
if (word_params->options->ungapped_extension) {
/* Perform ungapped extension */
+ ungapped_data = &dummy_ungapped_data;
s_NuclUngappedExtend(query, subject, matrix, q_off, s_off,
- -word_params->x_dropoff, &ungapped_data);
+ -word_params->x_dropoff, ungapped_data);
stack[stack_top].level =
(ungapped_data->length + ungapped_data->s_start);
} else {
ungapped_data = NULL;
}
- if (!ungapped_data ||
- ungapped_data->score >= word_params->cutoff_score) {
- BLAST_SaveInitialHit(init_hitlist, q_off, s_off,
- ungapped_data);
+ if (ungapped_data == NULL) {
+ BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data);
+ stack[stack_top].level |= ~LAST_HIT_MASK;
+ } else if (ungapped_data->score >= word_params->cutoff_score) {
+ BlastUngappedData *final_data = (BlastUngappedData *)malloc(
+ sizeof(BlastUngappedData));
+ *final_data = *ungapped_data;
+ BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data);
stack[stack_top].level |= ~LAST_HIT_MASK;
} else {
- sfree(ungapped_data);
/* Set hit length back to 0 after ungapped extension
failure */
stack[stack_top].level &= LAST_HIT_MASK;
diff --git a/algo/blast/core/blast_filter.c b/algo/blast/core/blast_filter.c
index 255aa088..2ad89ad9 100644
--- a/algo/blast/core/blast_filter.c
+++ b/algo/blast/core/blast_filter.c
@@ -1,4 +1,4 @@
-/* $Id: blast_filter.c,v 1.73 2005/07/19 13:43:30 madden Exp $
+/* $Id: blast_filter.c,v 1.78 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,6 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,8 +22,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
- *
*/
/** @file blast_filter.c
@@ -33,7 +30,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_filter.c,v 1.73 2005/07/19 13:43:30 madden Exp $";
+ "$Id: blast_filter.c,v 1.78 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_def.h>
@@ -397,75 +394,127 @@ BlastFilteringOptionsFromString(EBlastProgramType program_number, const char* in
BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to)
{
BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
- SSeqRange* seq_range = (SSeqRange*) malloc(sizeof(SSeqRange));
-
- seq_range->left = from;
- seq_range->right = to;
- loc->ssr = seq_range;
-
- if (head)
- {
- if (*head)
- {
- BlastSeqLoc* tmp = *head;
- while (tmp->next)
- tmp = tmp->next;
- tmp->next = loc;
- }
- else
- {
- *head = loc;
- }
+ if ( !loc ) {
+ return NULL;
}
-
- return loc;
+ loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
+ loc->ssr->left = from;
+ loc->ssr->right = to;
+
+ return BlastSeqLocAppend(head, loc);
}
-BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
+BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node)
{
- SSeqRange* seq_range;
- BlastSeqLoc* next_loc;
-
- while (loc) {
- next_loc = loc->next;
- seq_range = loc->ssr;
- sfree(seq_range);
- sfree(loc);
- loc = next_loc;
- }
- return NULL;
+ if ( !node ) {
+ return NULL;
+ }
+
+ if (head)
+ {
+ if (*head)
+ {
+ BlastSeqLoc* tmp = *head;
+ while (tmp->next)
+ tmp = tmp->next;
+ tmp->next = node;
+ }
+ else
+ {
+ *head = node;
+ }
+ }
+
+ return node;
}
/** Makes a copy of the BlastSeqLoc and also a copy of the
* SSRange element. Does not copy BlastSeqLoc that is pointed
* to by "next".
- * @param from the object to be copied [in]
+ * @param source the object to be copied [in]
* @return another BlastSeqLoc*
*/
+static BlastSeqLoc* s_BlastSeqLocNodeDup(BlastSeqLoc* source)
+{
+ if ( !source ) {
+ return NULL;
+ }
+ ASSERT(source->ssr);
+ return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
+}
-static BlastSeqLoc* s_BlastSeqLocDup(BlastSeqLoc* from)
+/** Prepend node to the head of the list and return the new head of the list */
+static BlastSeqLoc* s_BlastSeqLocPrepend(BlastSeqLoc* head, BlastSeqLoc* node)
{
- BlastSeqLoc* to;
- SSeqRange* seq_range;
+ if ( !node ) {
+ return NULL;
+ }
+ node->next = head;
+ return node;
+}
- if (from == NULL)
- return NULL;
+/** Reverse elements in the list
+ * @param head pointer to pointer to the head of the list. After this call,
+ * this is set to NULL [in|out]
+ * @return the new head of the list or NULL if argument is NULL
+ */
+static BlastSeqLoc* s_BlastSeqLocListReverse(BlastSeqLoc** head)
+{
+ BlastSeqLoc* retval = NULL; /* return value */
+ BlastSeqLoc* itr = NULL; /* iterator */
+
+ if ( !head ) {
+ return NULL;
+ }
- seq_range = from->ssr;
- ASSERT(seq_range);
+ for (itr = *head; itr; itr = itr->next) {
+ retval = s_BlastSeqLocPrepend(retval, s_BlastSeqLocNodeDup(itr));
+ }
+ *head = BlastSeqLocFree(*head);
+ return retval;
+}
- to = BlastSeqLocNew(NULL, seq_range->left, seq_range->right);
+BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* loc)
+{
+ if ( !loc ) {
+ return NULL;
+ }
+ sfree(loc->ssr);
+ sfree(loc);
+ return NULL;
+}
- return to;
+BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc)
+{
+ while (loc) {
+ BlastSeqLoc* next_loc = loc->next;
+ loc = BlastSeqLocNodeFree(loc);
+ loc = next_loc;
+ }
+ return NULL;
+}
+
+BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head)
+{
+ BlastSeqLoc* retval = NULL;
+ BlastSeqLoc* retval_tail = NULL;
+
+ for (; head; head = head->next) {
+ retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
+ s_BlastSeqLocNodeDup(head));
+ }
+
+ return retval;
}
BlastMaskLoc* BlastMaskLocNew(Int4 total)
{
- BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
- retval->total_size = total;
- if (total > 0)
- retval->seqloc_array = (BlastSeqLoc **) calloc(total, sizeof(BlastSeqLoc *));
- return retval;
+ BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
+ retval->total_size = total;
+ if (total > 0)
+ retval->seqloc_array = (BlastSeqLoc **) calloc(total,
+ sizeof(BlastSeqLoc *));
+ return retval;
}
BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
@@ -485,61 +534,27 @@ BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
return NULL;
}
-/** Calculates length of the DNA query from the BlastQueryInfo structure that
- * contains context information for translated frames for a set of queries.
- * @param query_info Query information containing data for all contexts [in]
- * @param query_index Which query to find DNA length for?
- * @return DNA length of the query, calculated as sum of 3 protein frame lengths,
- * plus 2, because 2 last nucleotide residues do not have a
- * corresponding codon.
- */
-static Int4
-s_GetTranslatedQueryDNALength(const BlastQueryInfo* query_info, Int4 query_index)
-{
- Int4 start_context = NUM_FRAMES*query_index;
- Int4 dna_length = 2;
- Int4 index;
-
- /* Make sure that query index is within appropriate range, and that this is
- really a translated search */
- ASSERT(query_index < query_info->num_queries);
- ASSERT(start_context < query_info->last_context);
-
- /* If only reverse strand is searched, then forward strand contexts don't
- have lengths information */
- if (query_info->contexts[start_context].query_length == 0)
- start_context += 3;
-
- for (index = start_context; index < start_context + 3; ++index)
- dna_length += query_info->contexts[index].query_length;
-
- return dna_length;
-}
-
Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
const BlastQueryInfo* query_info)
{
- BlastSeqLoc** prot_seqloc_array;
Uint4 seq_index;
if (!mask_loc)
return 0;
- /* Check that the number of sequences in BlastQueryInfo is the same as the
- size of the DNA mask locations array in the BlastMaskLoc. */
- ASSERT(mask_loc->total_size == query_info->num_queries);
-
- mask_loc->total_size *= NUM_FRAMES;
- prot_seqloc_array =
- (BlastSeqLoc**) calloc(mask_loc->total_size, sizeof(BlastSeqLoc*));
+ /* Check that the array size in BlastMaskLoc corresponds to the number
+ of contexts in BlastQueryInfo. */
+ ASSERT(mask_loc->total_size == query_info->last_context + 1);
/* Loop over multiple DNA sequences */
for (seq_index = 0; seq_index < (Uint4)query_info->num_queries;
++seq_index) {
- BlastSeqLoc** prot_seqloc =
- &(prot_seqloc_array[NUM_FRAMES*seq_index]);
- BlastSeqLoc* dna_seqloc = mask_loc->seqloc_array[seq_index];
- Int4 dna_length = s_GetTranslatedQueryDNALength(query_info, seq_index);
+ const Uint4 kCtxIndex = NUM_FRAMES * seq_index;
+ BlastSeqLoc* dna_seqloc = mask_loc->seqloc_array[kCtxIndex];
+ BlastSeqLoc** prot_seqloc = &(mask_loc->seqloc_array[kCtxIndex]);
+ Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
+ eBlastTypeBlastx,
+ seq_index);
Int4 context;
/* Reproduce this mask for all 6 frames, with translated coordinates */
@@ -565,8 +580,6 @@ Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
}
BlastSeqLocFree(dna_seqloc);
}
- sfree(mask_loc->seqloc_array);
- mask_loc->seqloc_array = prot_seqloc_array;
return 0;
}
@@ -592,7 +605,9 @@ Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
{
Int4 frame_start = index*NUM_FRAMES;
Int4 frame_index;
- Int4 dna_length = s_GetTranslatedQueryDNALength(query_info, index);
+ Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
+ eBlastTypeBlastx,
+ index);
/* Loop over all frames of one DNA sequence */
for (frame_index=frame_start; frame_index<(frame_start+NUM_FRAMES);
frame_index++) {
@@ -684,64 +699,48 @@ s_BlastSeqLocSort (BlastSeqLoc* list,
return list;
}
-/* This will go in place of CombineSeqLocs to combine filtered locations */
-Int2
-CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out,
- Int4 link_value)
+BlastSeqLoc*
+BlastSeqLocCombine(BlastSeqLoc* mask_loc, Int4 link_value)
{
- Int2 status=0; /* return value. */
- Int4 start, stop; /* USed to merge overlapping SeqLoc's. */
- SSeqRange* ssr = NULL;
- BlastSeqLoc* loc_head=NULL,* last_loc=NULL,* loc_var=NULL;
- BlastSeqLoc* new_loc = NULL;
+ BlastSeqLoc* retval = NULL;
+ BlastSeqLoc* retval_tail = NULL;
+ Int4 start, stop; /* Used to merge overlapping SeqLoc's. */
+ BlastSeqLoc* loc_head=NULL,* loc_var=NULL;
- if (!mask_loc) {
- *mask_loc_out = NULL;
- return 0;
+ if ( !mask_loc ) {
+ return NULL;
}
- /* Put all the SeqLoc's into one big linked list. */
- loc_var = mask_loc;
- loc_head = last_loc = s_BlastSeqLocDup(loc_var);
- while (loc_var->next)
- {
- last_loc->next = s_BlastSeqLocDup(loc_var->next);
- last_loc = last_loc->next;
- loc_var = loc_var->next;
+ /* Copy the BlastSeqLoc-s and sort them by starting position. */
+ loc_head = loc_var = s_BlastSeqLocSort(BlastSeqLocListDup(mask_loc),
+ s_SeqRangeSortByStartPosition);
+ if ( !loc_head ) {
+ return NULL;
}
-
- /* Sort them by starting position. */
- loc_head = (BlastSeqLoc*)
- s_BlastSeqLocSort (loc_head, s_SeqRangeSortByStartPosition);
-
- ssr = (SSeqRange*) loc_head->ssr;
- start = ssr->left;
- stop = ssr->right;
- loc_var = loc_head;
- ssr = NULL;
-
- while (loc_var) {
- if (loc_var->next)
- ssr = loc_var->next->ssr;
- if (ssr && ((stop + link_value) > ssr->left)) {
- stop = MAX(stop, ssr->right);
- } else {
- BlastSeqLocNew(&new_loc, start, stop);
- if (loc_var->next) {
- start = ssr->left;
- stop = ssr->right;
- }
- }
- loc_var = loc_var->next;
- ssr = NULL;
+ start = loc_head->ssr->left;
+ stop = loc_head->ssr->right;
+
+ for (; loc_var; loc_var = loc_var->next) {
+ SSeqRange* ssr = loc_var->next ? loc_var->next->ssr : NULL;
+
+ if (ssr && ((stop + link_value) > ssr->left)) {
+ stop = MAX(stop, ssr->right);
+ } else {
+ /* Cache the tail of the list to avoid the overhead of traversing the
+ * list when appending to it */
+ retval_tail = BlastSeqLocNew((retval_tail ? &retval_tail : &retval),
+ start, stop);
+ if (loc_var->next) {
+ start = ssr->left;
+ stop = ssr->right;
+ }
+ }
}
- *mask_loc_out = new_loc;
-
- /* Free memory allocated for the temporary list of SeqLocs */
+ /* Free memory allocated for the temporary list of BlastSeqLoc-s */
BlastSeqLocFree(loc_head);
- return status;
+ return retval;
}
Int2
@@ -750,8 +749,9 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number,
const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask)
{
Int4 context;
- BlastSeqLoc* loc,* last_loc = NULL,* start_loc = NULL;
const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
+ BlastSeqLoc* tail = NULL; /* Pointer to the tail of the complement_mask
+ linked list */
if (complement_mask == NULL)
return -1;
@@ -763,52 +763,39 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number,
Boolean first = TRUE; /* Specifies beginning of query. */
Boolean last_interval_open=TRUE; /* if TRUE last interval needs to be closed. */
- Boolean reverse = FALSE; /* Sequence on minus strand. */
- Int4 index; /* loop index */
Int4 start_offset, end_offset, filter_start, filter_end;
Int4 left=0, right; /* Used for left/right extent of a region. */
+ BlastSeqLoc* loc = NULL;
- start_offset = query_info->contexts[context].query_offset;
- end_offset = query_info->contexts[context].query_length + start_offset - 1;
-
- /* For blastn: check if this strand is not searched at all */
- if (end_offset < start_offset)
+ if (query_info->contexts[context].query_length <= 0) {
continue;
- index = BlastGetMaskLocIndexFromContext(kIsNucl, context);
- reverse = BlastIsReverseStrand(kIsNucl, context);
-
-
- /* mask_loc NULL is simply the case that NULL was passed in, which we take to
- mean that nothing on query is masked. */
- if (mask_loc == NULL || mask_loc->seqloc_array[index] == NULL)
- {
- /* No masks for this context */
- if (!last_loc)
- last_loc = BlastSeqLocNew(complement_mask, start_offset, end_offset);
- else
- last_loc = BlastSeqLocNew(&last_loc, start_offset, end_offset);
+ }
+
+ start_offset = query_info->contexts[context].query_offset;
+ end_offset = query_info->contexts[context].query_length
+ + start_offset - 1;
+ ASSERT(start_offset <= end_offset);
+
+ /* mask_loc NULL is simply the case that NULL was passed in, which we
+ take to mean that nothing on query is masked. */
+ if (mask_loc == NULL || mask_loc->seqloc_array[context] == NULL) {
+ /* Cache the tail of the list to avoid the overhead of traversing the
+ * list when appending to it */
+ tail = BlastSeqLocNew(tail ? &tail : complement_mask,
+ start_offset, end_offset);
continue;
}
- if (reverse) {
- BlastSeqLoc* prev_loc = NULL;
- /* Reverse the order of the locations */
- for (start_loc = mask_loc->seqloc_array[index]; start_loc;
- start_loc = start_loc->next) {
- loc = s_BlastSeqLocDup(start_loc);
- loc->next = prev_loc;
- prev_loc = loc;
- }
- /* Save where this list starts, so it can be freed later */
- start_loc = loc;
- } else {
- loc = mask_loc->seqloc_array[index];
+ if (BlastIsReverseStrand(kIsNucl, context)) {
+ mask_loc->seqloc_array[context] =
+ s_BlastSeqLocListReverse(&mask_loc->seqloc_array[context]);
}
+ loc = mask_loc->seqloc_array[context];
first = TRUE;
for ( ; loc; loc = loc->next) {
SSeqRange* seq_range = loc->ssr;
- if (reverse) {
+ if (BlastIsReverseStrand(kIsNucl, context)) {
filter_start = end_offset - seq_range->right;
filter_end = end_offset - seq_range->left;
} else {
@@ -837,10 +824,9 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number,
right = filter_start - 1;
- if (!last_loc)
- last_loc = BlastSeqLocNew(complement_mask, left, right);
- else
- last_loc = BlastSeqLocNew(&last_loc, left, right);
+ /* Cache the tail of the list to avoid the overhead of traversing the
+ * list when appending to it */
+ tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
if (filter_end >= end_offset) {
/* last masked region at end of sequence */
last_interval_open = FALSE;
@@ -850,17 +836,12 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number,
}
}
- if (reverse) {
- start_loc = BlastSeqLocFree(start_loc);
- }
-
if (last_interval_open) {
/* Need to finish SSeqRange* for last interval. */
right = end_offset;
- if (!last_loc)
- last_loc = BlastSeqLocNew(complement_mask, left, right);
- else
- last_loc = BlastSeqLocNew(&last_loc, left, right);
+ /* Cache the tail of the list to avoid the overhead of traversing the
+ * list when appending to it */
+ tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
}
}
return 0;
@@ -868,51 +849,44 @@ BLAST_ComplementMaskLocations(EBlastProgramType program_number,
Int2
-BlastSetUp_Filter(EBlastProgramType program_number, Uint1* sequence, Int4 length,
- Int4 offset, const SBlastFilterOptions* filter_options, BlastSeqLoc* *seqloc_retval,
- Blast_Message* *blast_message)
+BlastSetUp_Filter(EBlastProgramType program_number,
+ Uint1* sequence,
+ Int4 length,
+ Int4 offset,
+ const SBlastFilterOptions* filter_options,
+ BlastSeqLoc** seqloc_retval,
+ Blast_Message* *blast_message)
{
- Int2 seqloc_num=0;
Int2 status=0; /* return value. */
- BlastSeqLoc* seg_loc = NULL;
- ASSERT(filter_options);
- ASSERT(seqloc_retval);
+ ASSERT(filter_options);
+ ASSERT(seqloc_retval);
- *seqloc_retval = NULL;
+ *seqloc_retval = NULL;
- status = SBlastFilterOptionsValidate(program_number, filter_options, blast_message);
- if (status)
- return status;
+ status = SBlastFilterOptionsValidate(program_number, filter_options,
+ blast_message);
+ if (status)
+ return status;
if (filter_options->segOptions)
{
- SSegOptions* seg_options = filter_options->segOptions;
- SegParameters* sparamsp=NULL;
-
- sparamsp = SegParametersNewAa();
- sparamsp->overlaps = TRUE;
- if (seg_options->window > 0)
- sparamsp->window = seg_options->window;
- if (seg_options->locut > 0.0)
- sparamsp->locut = seg_options->locut;
- if (seg_options->hicut > 0.0)
- sparamsp->hicut = seg_options->hicut;
-
- SeqBufferSeg(sequence, length, offset, sparamsp, &seg_loc);
+ SSegOptions* seg_options = filter_options->segOptions;
+ SegParameters* sparamsp=NULL;
+
+ sparamsp = SegParametersNewAa();
+ sparamsp->overlaps = TRUE;
+ if (seg_options->window > 0)
+ sparamsp->window = seg_options->window;
+ if (seg_options->locut > 0.0)
+ sparamsp->locut = seg_options->locut;
+ if (seg_options->hicut > 0.0)
+ sparamsp->hicut = seg_options->hicut;
+
+ status = SeqBufferSeg(sequence, length, offset, sparamsp,
+ seqloc_retval);
SegParametersFree(sparamsp);
sparamsp = NULL;
- seqloc_num++;
- }
-
- if (seqloc_num)
- {
- BlastSeqLoc* seqloc_list=NULL; /* Holds all SeqLoc's for
- return. */
- if (seg_loc)
- seqloc_list = seg_loc;
-
- *seqloc_retval = seqloc_list;
}
return status;
@@ -938,139 +912,136 @@ BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length)
}
static Int2
-s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, Int4 context, EBlastProgramType program_number, const SBlastFilterOptions* filter_options, BlastSeqLoc* *filter_out, Blast_Message* *blast_message)
+s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk* query_blk,
+ const BlastQueryInfo* query_info,
+ Int4 context,
+ EBlastProgramType program_number,
+ const SBlastFilterOptions* filter_options,
+ BlastSeqLoc* *filter_out,
+ Blast_Message* *blast_message)
{
- Int2 status = 0;
- Int4 query_length = 0; /* Length of query described by SeqLocPtr. */
- Int4 context_offset;
- BlastSeqLoc *lcase_mask_slp = NULL; /* Auxiliary locations for lower-case masking */
- BlastSeqLoc *filter_slp = NULL; /* SeqLocPtr computed for filtering. */
- BlastSeqLoc *filter_slp_combined; /* Used to hold combined SeqLoc's */
- Uint1 *buffer; /* holds sequence for plus strand or protein. */
-
- const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
- Int4 index = BlastGetMaskLocIndexFromContext(kIsNucl, context);
-
- context_offset = query_info->contexts[context].query_offset;
- buffer = &query_blk->sequence[context_offset];
+ Int2 status = 0;
+ Int4 query_length = 0; /* Length of query described by SeqLocPtr. */
+ Int4 context_offset;
+ BlastSeqLoc *filter_slp = NULL; /* SeqLocPtr computed for filtering. */
+ Uint1 *buffer; /* holds sequence for plus strand or protein. */
+
+ const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
- if ((query_length = query_info->contexts[context].query_length) <= 0)
- return 0;
+ context_offset = query_info->contexts[context].query_offset;
+ buffer = &query_blk->sequence[context_offset];
- if ((status = BlastSetUp_Filter(program_number, buffer,
- query_length, 0, filter_options, &filter_slp, blast_message)))
- return status;
+ if ((query_length = query_info->contexts[context].query_length) <= 0) {
+ return 0;
+ }
- if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
- { /* Reverse this as it's on minus strand. */
- BlastSeqLoc *filter_slp_rev = BlastSeqLocReverse(filter_slp, query_length);
- filter_slp = BlastSeqLocFree(filter_slp);
- filter_slp = filter_slp_rev;
- }
+ status = BlastSetUp_Filter(program_number,
+ buffer,
+ query_length,
+ 0,
+ filter_options,
+ &filter_slp,
+ blast_message);
+ if (status)
+ return status;
+
+ if (BlastIsReverseStrand(kIsNucl, context) == TRUE)
+ { /* Reverse this as it's on minus strand. */
+ BlastSeqLoc* tmp = BlastSeqLocReverse(filter_slp, query_length);
+ filter_slp = BlastSeqLocFree(filter_slp);
+ filter_slp = tmp;
+ }
- /* Extract the mask locations corresponding to this query
- (frame, strand), detach it from other masks.
- NB: for translated search the mask locations are expected in
- protein coordinates. The nucleotide locations must be converted
- to protein coordinates prior to the call to BLAST_MainSetUp.
- */
- lcase_mask_slp = NULL;
+ /* Extract the mask locations corresponding to this query
+ (frame, strand), detach it from other masks.
+ NB: for translated search the mask locations are expected in
+ protein coordinates. The nucleotide locations must be converted
+ to protein coordinates prior to the call to BLAST_MainSetUp.
+ */
+ {
+ /* Auxiliary locations for lower-case masking or any other masking
+ * which occurred outside of CORE BLAST */
+ BlastSeqLoc *lcase_mask_slp = NULL;
if (query_blk->lcase_mask && query_blk->lcase_mask->seqloc_array)
{
- lcase_mask_slp = query_blk->lcase_mask->seqloc_array[index];
- /* Set location list to NULL, to allow safe memory deallocation,
- ownership transferred to filter_slp below. */
- query_blk->lcase_mask->seqloc_array[index] = NULL;
- }
-
- /* Attach the lower case mask locations to the filter locations and combine them */
- if (lcase_mask_slp) {
- if (filter_slp) {
- BlastSeqLoc *loc; /* Iterator variable */
- for (loc = filter_slp; loc->next; loc = loc->next);
- loc->next = lcase_mask_slp;
- } else {
- filter_slp = lcase_mask_slp;
- }
+ ASSERT(context < query_blk->lcase_mask->total_size);
+ lcase_mask_slp = query_blk->lcase_mask->seqloc_array[context];
+ /* Set location list to NULL, to allow safe memory deallocation,
+ ownership transferred to filter_slp below. */
+ query_blk->lcase_mask->seqloc_array[context] = NULL;
}
- filter_slp_combined = NULL;
- CombineMaskLocations(filter_slp, &filter_slp_combined, 0);
- *filter_out = filter_slp_combined;
+ /* Attach the lower case mask locations to the filter locations and
+ combine them */
+ BlastSeqLocAppend(&filter_slp, lcase_mask_slp);
+ }
- filter_slp = BlastSeqLocFree(filter_slp);
+ *filter_out = BlastSeqLocCombine(filter_slp, 0);
+ filter_slp = BlastSeqLocFree(filter_slp);
return 0;
}
-
Int2
-BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, EBlastProgramType program_number, const SBlastFilterOptions* filter_options, BlastMaskLoc** filter_maskloc, Blast_Message * *blast_message)
+BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
+ const BlastQueryInfo* query_info,
+ EBlastProgramType program_number,
+ const SBlastFilterOptions* filter_options,
+ BlastMaskLoc** filter_maskloc,
+ Blast_Message** blast_message)
{
-
Int2 status = 0;
Int4 context = 0; /* loop variable. */
- const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
- Boolean no_forward_strand = (query_info->first_context > 0); /* filtering needed on reverse strand. */
+ const int kNumContexts = query_info->last_context + 1;
ASSERT(query_info && query_blk && filter_maskloc);
- *filter_maskloc = BlastMaskLocNew(query_info->last_context+1);
+ ASSERT(kNumContexts ==
+ query_info->num_queries*BLAST_GetNumberOfContexts(program_number));
+ *filter_maskloc = BlastMaskLocNew(kNumContexts);
for (context = query_info->first_context;
context <= query_info->last_context; ++context) {
-
- Boolean reverse = BlastIsReverseStrand(kIsNucl, context);
-
- /* For each query, check if forward strand is present */
- if (query_info->contexts[context].query_length <= 0)
- {
- if (kIsNucl && (context & 1) == 0) /* Needed only for blastn, or does this not apply FIXME */
- no_forward_strand = TRUE; /* No plus strand, we cannot simply infer locations by going from plus to minus */
- continue;
+
+ BlastSeqLoc *filter_per_context = NULL;
+ status = s_GetFilteringLocationsForOneContext(query_blk,
+ query_info,
+ context,
+ program_number,
+ filter_options,
+ &filter_per_context,
+ blast_message);
+ if (status) {
+ Blast_MessageWrite(blast_message, eBlastSevError, 2, 1,
+ "Failure at filtering");
+ return status;
}
- else if (!reverse) /* This is a plus strand, safe to set no_forward_strand to FALSE as clearly there is one. */
- no_forward_strand = FALSE;
- if (!reverse || no_forward_strand)
- {
- BlastSeqLoc *filter_per_context = NULL; /* Used to hold combined SeqLoc's */
- Int4 filter_index = BlastGetMaskLocIndexFromContext(kIsNucl, context);
- if ((status=s_GetFilteringLocationsForOneContext(query_blk, query_info, context, program_number, filter_options, &filter_per_context, blast_message)))
- {
- Blast_MessageWrite(blast_message, eBlastSevError, 2, 1,
- "Failure at filtering");
- return status;
- }
-
- /* NB: for translated searches filter locations are returned in
- protein coordinates, because the DNA lengths of sequences are
- not available here. The caller must take care of converting
- them back to nucleotide coordinates. */
- (*filter_maskloc)->seqloc_array[filter_index] = filter_per_context;
- }
+ /* NB: for translated searches filter locations are returned in
+ protein coordinates, because the DNA lengths of sequences are
+ not available here. The caller must take care of converting
+ them back to nucleotide coordinates. */
+ (*filter_maskloc)->seqloc_array[context] = filter_per_context;
}
-
return 0;
}
-Int2
+void
Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset)
{
- SSeqRange *loc = NULL;
- Int2 status = 0;
- Int4 index, start, stop;
- const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask;
-
+ ASSERT(buffer);
for (; mask_loc; mask_loc = mask_loc->next) {
- loc = (SSeqRange *) mask_loc->ssr;
+
+ Int4 index, start, stop;
+ const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask;
+
if (reverse) {
- start = length - 1 - loc->right;
- stop = length - 1 - loc->left;
+ start = length - 1 - mask_loc->ssr->right;
+ stop = length - 1 - mask_loc->ssr->left;
} else {
- start = loc->left;
- stop = loc->right;
+ start = mask_loc->ssr->left;
+ stop = mask_loc->ssr->right;
}
start -= offset;
@@ -1079,47 +1050,38 @@ Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
for (index = start; index <= stop; index++)
buffer[index] = kMaskingLetter;
}
-
- return status;
}
-Int2
-BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info, const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number)
+void
+BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
+ const BlastQueryInfo* query_info,
+ const BlastMaskLoc *filter_maskloc,
+ EBlastProgramType program_number)
{
const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
Int4 context; /* loop variable. */
- Int2 status=0;
+
+ ASSERT(query_blk);
+ ASSERT(query_info);
+ ASSERT(filter_maskloc);
for (context = query_info->first_context;
context <= query_info->last_context; ++context) {
- BlastSeqLoc *filter_per_context = NULL; /* Used to hold combined SeqLoc's */
- Boolean reverse = BlastIsReverseStrand(kIsNucl, context);
- Int4 query_length;
- Int4 context_offset;
- Int4 maskloc_index;
- Uint1 *buffer; /* holds sequence */
+ Int4 query_length = 0;
+ Int4 context_offset = 0;
+ Uint1 *buffer = NULL; /* holds sequence */
/* For each query, check if forward strand is present */
- if ((query_length = query_info->contexts[context].query_length) <= 0)
+ if ( (query_length = query_info->contexts[context].query_length) <= 0)
continue;
context_offset = query_info->contexts[context].query_offset;
buffer = &query_blk->sequence[context_offset];
+ ASSERT(buffer);
- maskloc_index = BlastGetMaskLocIndexFromContext(kIsNucl, context);
- filter_per_context = filter_maskloc->seqloc_array[maskloc_index];
-
- if (buffer) {
-
- if ((status =
- Blast_MaskTheResidues(buffer, query_length, kIsNucl,
- filter_per_context, reverse, 0)))
- {
- return status;
- }
- }
+ Blast_MaskTheResidues(buffer, query_length, kIsNucl,
+ filter_maskloc->seqloc_array[context],
+ BlastIsReverseStrand(kIsNucl, context), 0);
}
-
- return 0;
}
diff --git a/algo/blast/core/blast_filter.h b/algo/blast/core/blast_filter.h
index d03e1531..45240ffc 100644
--- a/algo/blast/core/blast_filter.h
+++ b/algo/blast/core/blast_filter.h
@@ -1,4 +1,4 @@
-/* $Id: blast_filter.h,v 1.32 2005/07/13 16:47:34 bealer Exp $
+/* $Id: blast_filter.h,v 1.34 2005/09/20 00:02:47 camacho Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -75,6 +75,22 @@ extern const Uint1 kProtMask;
NCBI_XBLAST_EXPORT
BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to);
+/** Appends the BlastSeqLoc to the list of BlastSeqLoc-s pointed to by head.
+ * @param head Pointer to the head of the linked list of BlastSeqLoc-s [in]
+ * @param node Pointer to the node to be added to the list. If this is NULL,
+ * this function does nothing. [in]
+ * @returns pointer to the second argument to this function (i.e.: tail of the
+ * list)
+ */
+BlastSeqLoc* BlastSeqLocAppend(BlastSeqLoc** head, BlastSeqLoc* node);
+
+/** Deallocate a single BlastSeqLoc structure and its contents, without
+ * following its next pointer
+ * @param node structure to deallocate [in]
+ * @return NULL
+ */
+BlastSeqLoc* BlastSeqLocNodeFree(BlastSeqLoc* node);
+
/** Deallocate all BlastSeqLoc objects in a chain.
* @param loc object to be freed [in]
* @return NULL pointer returned.
@@ -82,13 +98,35 @@ BlastSeqLoc* BlastSeqLocNew(BlastSeqLoc** head, Int4 from, Int4 to);
NCBI_XBLAST_EXPORT
BlastSeqLoc* BlastSeqLocFree(BlastSeqLoc* loc);
+/** Make a deep copy of the linked list of BlastSeqLoc-s pointed to by its
+ * argument
+ * @param head head of the linked list [in]
+ * @return NULL on NULL input or memory allocation failure, else a copy of the
+ * list and its contents
+ */
+BlastSeqLoc* BlastSeqLocListDup(BlastSeqLoc* head);
+
/** Converts reverse strand coordinates to forward strand.
* @param filter_in BlastSeqLoc to be reversed [in]
* @param query_length length of query [in]
* @return reversed BlastSeqLoc
*/
NCBI_XBLAST_EXPORT
-BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length);
+BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in,
+ Int4 query_length);
+
+/** Go through all mask locations in one sequence and combine any that
+ * overlap, producing a new (merged) list of locations. The merge is done on
+ * an internal copy of the list; the list passed in is left for the caller to free.
+ * @param mask_loc The list of masks to be merged [in]
+ * @param link_value Largest gap size between locations for which they
+ * should be linked together [in]
+ * @return The new (merged) list of masks or NULL if mask_loc is NULL or memory
+ * allocation failure.
+*/
+NCBI_XBLAST_EXPORT
+BlastSeqLoc*
+BlastSeqLocCombine(BlastSeqLoc* mask_loc, Int4 link_value);
/** Deallocate memory for a BlastMaskLoc structure
* as well as the BlastSeqLoc's pointed to.
@@ -98,8 +136,9 @@ BlastSeqLoc* BlastSeqLocReverse(const BlastSeqLoc* filter_in, Int4 query_length)
NCBI_XBLAST_EXPORT
BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc);
-/** Allocate memory for a BlastMaskLoc, also allocates array for BlastSeqLoc* of length total.
- * @param total which context (i.e., strand) [in]
+/** Allocate memory for a BlastMaskLoc.
+ * @param total number of contexts for which SSeqLocs should be allocated
+ * (result of number of queries * number of contexts for given program) [in]
* @return Pointer to the allocated BlastMaskLoc structure.
*/
NCBI_XBLAST_EXPORT
@@ -110,6 +149,12 @@ BlastMaskLoc* BlastMaskLocNew(Int4 total);
* lists.
* @param mask_loc Mask locations structure [in|out]
* @param query_info Query information structure, containing contexts data [in]
+ * Note: This function does NOT take into consideration the strands requested
+ * to be searched, which is INCONSISTENT with what the C++ API does (this
+ * function is not called from the C++ API, only from the C API). Therefore,
+ * this function should either 1) be moved out of the CORE or 2) modified to
+ * take into consideration the strand specified for the nucleotide
+ * query/queries.
*/
Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
const BlastQueryInfo* query_info);
@@ -122,19 +167,6 @@ Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc,
Int2 BlastMaskLocProteinToDNA(BlastMaskLoc* mask_loc,
const BlastQueryInfo* query_info);
-/** Go through all mask locations in one sequence,
- * combine any that overlap. Deallocate the memory for the locations that
- * were on the list, produce a new (merged) list of locations.
- * @param mask_loc The list of masks to be merged [in]
- * @param mask_loc_out The new (merged) list of masks. [out]
- * @param link_value Largest gap size between locations fow which they
- * should be linked together [in]
-*/
-NCBI_XBLAST_EXPORT
-Int2
-CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out,
- Int4 link_value);
-
/** This function takes the list of mask locations (i.e., regions that
* should not be searched or not added to lookup table) and makes up a set
* of SSeqRange*'s in the concatenated sequence built from a set of queries,
@@ -150,14 +182,13 @@ CombineMaskLocations(BlastSeqLoc* mask_loc, BlastSeqLoc* *mask_loc_out,
* @param complement_mask Linked list of SSeqRange*s in the concatenated
* sequence to be indexed in the lookup table . [out]
*/
-NCBI_XBLAST_EXPORT
Int2
BLAST_ComplementMaskLocations(EBlastProgramType program_number,
const BlastQueryInfo* query_info, const BlastMaskLoc* mask_loc,
BlastSeqLoc* *complement_mask);
-/** Runs filtering functions, according to the filtering options, returns
- * SeqLocPtr. Should combine all SeqLocs so they are non-redundant.
+/** Runs seg filtering functions, according to the filtering options, returns
+ * BlastSeqLoc*. Should combine all SeqLocs so they are non-redundant.
* @param program_number Type of BLAST program [in]
* @param sequence The sequence or part of the sequence to be filtered [in]
* @param length Length of the (sub)sequence [in]
@@ -188,23 +219,26 @@ BlastSetUp_Filter(EBlastProgramType program_number,
*/
NCBI_XBLAST_EXPORT
Int2
-BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info,
- EBlastProgramType program_number, const SBlastFilterOptions* filter_options,
- BlastMaskLoc** filter_out, Blast_Message* *blast_message);
+BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk* query_blk,
+ const BlastQueryInfo* query_info,
+ EBlastProgramType program_number,
+ const SBlastFilterOptions* filter_options,
+ BlastMaskLoc** filter_out,
+ Blast_Message* *blast_message);
/** Masks the letters in buffer.
* This is a low-level routine and takes a raw buffer which it assumes
* to be in ncbistdaa (protein) or blastna (nucleotide).
- * @param buffer the sequence to be masked (will be modified). [out]
+ * @param buffer the sequence to be masked (will be modified, cannot be NULL or
+ * undefined behavior will result).[in|out]
* @param length length of the sequence to be masked . [in]
* @param is_na nucleotide if TRUE [in]
* @param mask_loc the BlastSeqLoc to use for masking [in]
* @param reverse minus strand if TRUE [in]
* @param offset how far along sequence is 1st residue in buffer [in]
- *
*/
NCBI_XBLAST_EXPORT
-Int2
+void
Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset);
@@ -215,9 +249,11 @@ Blast_MaskTheResidues(Uint1 * buffer, Int4 length, Boolean is_na,
* @param program_number one of blastn,blastp,blastx,etc. [in]
*/
NCBI_XBLAST_EXPORT
-Int2
-BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_info,
- const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number);
+void
+BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk,
+ const BlastQueryInfo* query_info,
+ const BlastMaskLoc *filter_maskloc,
+ EBlastProgramType program_number);
/** Produces SBlastFilterOptions from a string that has been traditionally supported
* in blast.
@@ -229,8 +265,10 @@ BlastSetUp_MaskQuery(BLAST_SequenceBlk* query_blk, const BlastQueryInfo* query_i
*/
NCBI_XBLAST_EXPORT
Int2
-BlastFilteringOptionsFromString(EBlastProgramType program_number, const char* instructions,
- SBlastFilterOptions* *filtering_options, Blast_Message* *blast_message);
+BlastFilteringOptionsFromString(EBlastProgramType program_number,
+ const char* instructions,
+ SBlastFilterOptions* *filtering_options,
+ Blast_Message* *blast_message);
#ifdef __cplusplus
}
diff --git a/algo/blast/core/blast_gapalign.c b/algo/blast/core/blast_gapalign.c
index 203d0c88..99e7dee6 100644
--- a/algo/blast/core/blast_gapalign.c
+++ b/algo/blast/core/blast_gapalign.c
@@ -1,4 +1,4 @@
-/* $Id: blast_gapalign.c,v 1.159 2005/08/22 17:57:09 madden Exp $
+/* $Id: blast_gapalign.c,v 1.163 2005/11/30 18:29:14 papadopo Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -33,7 +32,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_gapalign.c,v 1.159 2005/08/22 17:57:09 madden Exp $";
+ "$Id: blast_gapalign.c,v 1.163 2005/11/30 18:29:14 papadopo Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_options.h>
@@ -205,7 +204,8 @@ s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
if (gap_open == 0 && gap_extend == 0)
gap_extend = reward / 2 + penalty;
- max_d = (Int4) (max_dbseq_length / GREEDY_MAX_COST_FRACTION + 1);
+ max_d = MIN(GREEDY_MAX_COST,
+ max_dbseq_length / GREEDY_MAX_COST_FRACTION + 1);
gamp = (SGreedyAlignMem*) calloc(1, sizeof(SGreedyAlignMem));
@@ -253,8 +253,10 @@ s_BlastGreedyAlignMemAlloc(const BlastScoringParameters* score_params,
for (i = 1; i <= max_cost; i++)
gamp->last_seq2_off_affine[i] =
gamp->last_seq2_off_affine[i-1] + 2*max_d_1 + 6;
- if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0])
+ if (!gamp->last_seq2_off_affine || !gamp->last_seq2_off_affine[0]) {
s_BlastGreedyAlignsFree(gamp);
+ return NULL;
+ }
}
gamp->max_score = (Int4*) malloc(sizeof(Int4) * (max_d + 1 + d_diff));
@@ -280,6 +282,7 @@ BLAST_GapAlignStructFree(BlastGapAlignStruct* gap_align)
if (gap_align->greedy_align_mem)
s_BlastGreedyAlignsFree(gap_align->greedy_align_mem);
GapStateFree(gap_align->state_struct);
+ sfree(gap_align->dp_mem);
sfree(gap_align);
return NULL;
@@ -306,7 +309,16 @@ BLAST_GapAlignStructNew(const BlastScoringParameters* score_params,
gap_align->gap_x_dropoff = ext_params->gap_x_dropoff;
- if (ext_params->options->ePrelimGapExt != eDynProgExt) {
+ if (ext_params->options->ePrelimGapExt == eDynProgExt) {
+ /* allocate structures for ordinary dynamic programming */
+ gap_align->dp_mem_alloc = 1000;
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ if (!gap_align->dp_mem)
+ gap_align = BLAST_GapAlignStructFree(gap_align);
+ }
+ else {
+ /* allocate structures for greedy dynamic programming */
max_subject_length = MIN(max_subject_length, MAX_DBSEQ_LEN);
gap_align->greedy_align_mem =
s_BlastGreedyAlignMemAlloc(score_params, ext_params,
@@ -331,14 +343,10 @@ enum {
SCRIPT_SUB = eGapAlignSub, /**< Substitution */
SCRIPT_GAP_IN_A = eGapAlignDel, /**< Deletion */
SCRIPT_GAP_IN_B = eGapAlignIns, /**< Insertion */
- SCRIPT_DECLINE = eGapAlignDecline, /**< Decline to align */
SCRIPT_OP_MASK = 0x07, /**< Mask for edit script operations */
- SCRIPT_EXTEND_DECLINE= 0x08, /**< continue declining alignment */
SCRIPT_EXTEND_GAP_A = 0x10, /**< continue a gap in A */
- SCRIPT_DECLINE_A = 0x20, /**< continue declining alignment for A */
SCRIPT_EXTEND_GAP_B = 0x40, /**< continue a gap in B */
- SCRIPT_DECLINE_B = 0x80 /**< continue declining alignment for B */
};
/** Low level function to perform dynamic programming gapped extension
@@ -376,13 +384,10 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
Uint1* b_ptr;
BlastGapDP* score_array;
- Int4 score_array_size;
- Int4 score_array_origin;
Int4 gap_open;
Int4 gap_extend;
Int4 gap_open_extend;
- Int4 decline_penalty;
Int4 x_dropoff;
Int4 best_score;
@@ -393,9 +398,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
Int4 score;
Int4 score_gap_row;
Int4 score_gap_col;
- Int4 score_decline;
Int4 next_score;
- Int4 next_score_decline;
GapStateArrayStruct* state_struct;
Uint1* edit_script_row;
@@ -416,7 +419,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
gap_open = score_params->gap_open;
gap_extend = score_params->gap_extend;
gap_open_extend = gap_open + gap_extend;
- decline_penalty = score_params->decline_align;
x_dropoff = gap_align->gap_x_dropoff;
if (x_dropoff < gap_open_extend)
@@ -448,30 +450,29 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
the alignment can only go x_dropoff/gap_extend positions
at most before failing the X dropoff criterion */
- if (gap_extend > 0) {
+ if (gap_extend > 0)
num_extra_cells = x_dropoff / gap_extend + 3;
- state_struct = s_GapGetState(&gap_align->state_struct,
- num_extra_cells);
- score_array_size = 2 * num_extra_cells;
- }
- else {
- num_extra_cells = 0;
- state_struct = s_GapGetState(&gap_align->state_struct, N + 3);
- score_array_size = N + 3;
+ else
+ num_extra_cells = N + 3;
+
+ if (num_extra_cells > gap_align->dp_mem_alloc) {
+ gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ sfree(gap_align->dp_mem);
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
}
+ state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
+
edit_script[0] = state_struct->state_array;
edit_start_offset[0] = 0;
edit_script_row = state_struct->state_array;
- score_array_size = MAX(100, score_array_size);
- score_array_origin = 0;
-
score = -gap_open_extend;
- score_array = (BlastGapDP*)malloc(score_array_size * sizeof(BlastGapDP));
+ score_array = gap_align->dp_mem;
score_array[0].best = 0;
score_array[0].best_gap = -gap_open_extend;
- score_array[0].best_decline = -gap_open_extend - decline_penalty;
for (i = 1; i <= N; i++) {
if (score < -x_dropoff)
@@ -479,7 +480,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
score_array[i].best = score;
score_array[i].best_gap = score - gap_open_extend;
- score_array[i].best_decline = score - gap_open_extend - decline_penalty;
score -= gap_extend;
edit_script_row[i] = SCRIPT_GAP_IN_A;
}
@@ -554,21 +554,13 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
score = MININT;
score_gap_row = MININT;
- score_decline = MININT;
last_b_index = first_b_index;
for (b_index = first_b_index; b_index < b_size; b_index++) {
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures */
-
- Int4 s_index = b_index - score_array_origin;
-
b_ptr += b_increment;
- score_gap_col = score_array[s_index].best_gap;
- next_score = score_array[s_index].best + matrix_row[ *b_ptr ];
- next_score_decline = score_array[s_index].best_decline;
+ score_gap_col = score_array[b_index].best_gap;
+ next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
/* script, script_row and script_col contain the
actions specified by the dynamic programming.
@@ -578,36 +570,17 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
this inner loop is exactly the same as the one
in Blast_SemiGappedAlign() */
- if (score_decline > score) {
- script = SCRIPT_DECLINE;
- score = score_decline;
- }
- else {
- script = SCRIPT_SUB;
- }
-
- if (score_gap_col < score_decline) {
- score_gap_col = score_decline;
- script_col = SCRIPT_DECLINE_B;
- }
- else {
- script_col = SCRIPT_EXTEND_GAP_B;
- if (score < score_gap_col) {
- script = SCRIPT_GAP_IN_B;
- score = score_gap_col;
- }
- }
+ script = SCRIPT_SUB;
+ script_col = SCRIPT_EXTEND_GAP_B;
+ script_row = SCRIPT_EXTEND_GAP_A;
- if (score_gap_row < score_decline) {
- score_gap_row = score_decline;
- script_row = SCRIPT_DECLINE_A;
+ if (score < score_gap_col) {
+ script = SCRIPT_GAP_IN_B;
+ score = score_gap_col;
}
- else {
- script_row = SCRIPT_EXTEND_GAP_A;
- if (score < score_gap_row) {
- script = SCRIPT_GAP_IN_A;
- score = score_gap_row;
- }
+ if (score < score_gap_row) {
+ script = SCRIPT_GAP_IN_A;
+ score = score_gap_row;
}
if (best_score - score > x_dropoff) {
@@ -615,7 +588,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
if (first_b_index == b_index)
first_b_index++;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
@@ -628,10 +601,10 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
score_gap_row -= gap_extend;
score_gap_col -= gap_extend;
if (score_gap_col < (score - gap_open_extend)) {
- score_array[s_index].best_gap = score - gap_open_extend;
+ score_array[b_index].best_gap = score - gap_open_extend;
}
else {
- score_array[s_index].best_gap = score_gap_col;
+ score_array[b_index].best_gap = score_gap_col;
script += script_col;
}
@@ -640,52 +613,35 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
else
script += script_row;
- if (score_decline < (score - gap_open)) {
- score_array[s_index].best_decline = score -
- gap_open - decline_penalty;
- }
- else {
- score_array[s_index].best_decline = score_decline -
- decline_penalty;
- script += SCRIPT_EXTEND_DECLINE;
- }
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
}
score = next_score;
- score_decline = next_score_decline;
edit_script_row[b_index] = script;
}
if (first_b_index == b_size)
break;
- if (last_b_index + num_extra_cells + 3 >=
- score_array_size + score_array_origin) {
- BlastGapDP *new_array;
- score_array_size = 2 * score_array_size;
- new_array = (BlastGapDP *)malloc(score_array_size *
- sizeof(BlastGapDP));
- memcpy(new_array,
- score_array + (first_b_index - score_array_origin),
- (last_b_index - first_b_index + 1) * sizeof(BlastGapDP));
- sfree(score_array);
- score_array = new_array;
- score_array_origin = first_b_index;
+ if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
+
+ gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ score_array = (BlastGapDP *)realloc(score_array,
+ gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ gap_align->dp_mem = score_array;
}
+
if (last_b_index < b_size - 1) {
b_size = last_b_index + 1;
}
else {
while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = score_gap_row;
- score_array[s_index].best_gap = score_gap_row - gap_open_extend;
- score_array[s_index].best_decline = score_gap_row - gap_open -
- decline_penalty;
+ score_array[b_size].best = score_gap_row;
+ score_array[b_size].best_gap = score_gap_row - gap_open_extend;
score_gap_row -= gap_extend;
edit_script_row[b_size] = SCRIPT_GAP_IN_A;
b_size++;
@@ -698,11 +654,8 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
state_struct->used += MAX(b_index, b_size) - orig_b_index + 1;
if (b_size <= N) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = MININT;
- score_array[s_index].best_gap = MININT;
- score_array[s_index].best_decline = MININT;
+ score_array[b_size].best = MININT;
+ score_array[b_size].best_gap = MININT;
b_size++;
}
}
@@ -715,7 +668,7 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
b_index = *b_offset;
script = SCRIPT_SUB;
- for (i = 0; a_index > 0 || b_index > 0; i++) {
+ while (a_index > 0 || b_index > 0) {
/* Retrieve the next action to perform. Rows of
the traceback array do not necessarily start
at offset zero of B, so a correction is needed
@@ -729,22 +682,12 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
script = next_script & SCRIPT_OP_MASK;
if (next_script & SCRIPT_EXTEND_GAP_A)
script = SCRIPT_GAP_IN_A;
- else if (next_script & SCRIPT_DECLINE_A)
- script = SCRIPT_DECLINE;
break;
case SCRIPT_GAP_IN_B:
script = next_script & SCRIPT_OP_MASK;
if (next_script & SCRIPT_EXTEND_GAP_B)
script = SCRIPT_GAP_IN_B;
- else if (next_script & SCRIPT_DECLINE_B)
- script = SCRIPT_DECLINE;
- break;
-
- case SCRIPT_DECLINE:
- script = next_script & SCRIPT_OP_MASK;
- if (next_script & SCRIPT_EXTEND_DECLINE)
- script = SCRIPT_DECLINE;
break;
default:
@@ -767,7 +710,6 @@ ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
sfree(edit_start_offset);
sfree(edit_script);
- sfree(score_array);
return best_score;
}
@@ -802,13 +744,10 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
Uint1* b_ptr;
BlastGapDP* score_array;
- Int4 score_array_size;
- Int4 score_array_origin;
Int4 gap_open; /* alignment penalty variables */
Int4 gap_extend;
Int4 gap_open_extend;
- Int4 decline_penalty;
Int4 x_dropoff;
Int4** matrix = NULL; /* pointers to the score matrix */
@@ -818,9 +757,7 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
Int4 score; /* score tracking variables */
Int4 score_gap_row;
Int4 score_gap_col;
- Int4 score_decline;
Int4 next_score;
- Int4 next_score_decline;
Int4 best_score;
Int4 num_extra_cells;
@@ -840,7 +777,6 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
gap_open = score_params->gap_open;
gap_extend = score_params->gap_extend;
gap_open_extend = gap_open + gap_extend;
- decline_penalty = score_params->decline_align;
x_dropoff = gap_align->gap_x_dropoff;
if (x_dropoff < gap_open_extend)
@@ -855,23 +791,23 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
set of DP computations. The initial window size is determined
by the number of cells needed to fail the x-dropoff test */
- if (gap_extend > 0) {
+ if (gap_extend > 0)
num_extra_cells = x_dropoff / gap_extend + 3;
- score_array_size = 2 * num_extra_cells;
- }
- else {
- num_extra_cells = 0;
- score_array_size = N + 3;
+ else
+ num_extra_cells = N + 3;
+
+ if (num_extra_cells > gap_align->dp_mem_alloc) {
+ gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ sfree(gap_align->dp_mem);
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
}
- score_array_size = MAX(100, score_array_size);
- score_array_origin = 0;
-
+ score_array = gap_align->dp_mem;
score = -gap_open_extend;
- score_array = (BlastGapDP*)malloc(score_array_size * sizeof(BlastGapDP));
score_array[0].best = 0;
score_array[0].best_gap = -gap_open_extend;
- score_array[0].best_decline = -gap_open_extend - decline_penalty;
for (i = 1; i <= N; i++) {
if (score < -x_dropoff)
@@ -879,7 +815,6 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
score_array[i].best = score;
score_array[i].best_gap = score - gap_open_extend;
- score_array[i].best_decline = score - gap_open_extend - decline_penalty;
score -= gap_extend;
}
@@ -919,48 +854,18 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* initialize running-score variables */
score = MININT;
score_gap_row = MININT;
- score_decline = MININT;
last_b_index = first_b_index;
for (b_index = first_b_index; b_index < b_size; b_index++) {
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures */
-
- Int4 s_index = b_index - score_array_origin;
-
- /* Note that this formulation of dynamic programming
- requires looking at an offset into B[] that is one
- beyond b_index. Since b_index can equal the length
- of B[], this means that a sentinel byte for B is
- *required* by this aligner */
-
b_ptr += b_increment;
- score_gap_col = score_array[s_index].best_gap;
- next_score = score_array[s_index].best + matrix_row[ *b_ptr ];
- next_score_decline = score_array[s_index].best_decline;
-
- /* decline the alignment if that improves the score */
+ score_gap_col = score_array[b_index].best_gap;
+ next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
- score = MAX(score, score_decline);
-
- /* decline the best row score if that improves it;
- if not, make it the new high score if it's
- an improvement */
-
- if (score_gap_col < score_decline)
- score_gap_col = score_decline;
- else if (score < score_gap_col)
+ if (score < score_gap_col)
score = score_gap_col;
- /* decline the best column score if that improves it;
- if not, make it the new high score if it's
- an improvement */
-
- if (score_gap_row < score_decline)
- score_gap_row = score_decline;
- else if (score < score_gap_row)
+ if (score < score_gap_row)
score = score_gap_row;
if (best_score - score > x_dropoff) {
@@ -978,7 +883,7 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (b_index == first_b_index)
first_b_index++;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
@@ -989,24 +894,20 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
}
/* If starting a gap at this position will improve
- the best row, column, or declined alignment score,
- update them to reflect that. */
+ the best row, or column, score, update them to
+ reflect that. */
score_gap_row -= gap_extend;
score_gap_col -= gap_extend;
- score_array[s_index].best_gap = MAX(score - gap_open_extend,
+ score_array[b_index].best_gap = MAX(score - gap_open_extend,
score_gap_col);
score_gap_row = MAX(score - gap_open_extend, score_gap_row);
-
- score_array[s_index].best_decline =
- MAX(score_decline, score - gap_open) - decline_penalty;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
}
score = next_score;
- score_decline = next_score_decline;
}
-
+
/* Finish aligning if the best scores for all positions
of B will fail the X-dropoff test, i.e. the inner loop
bounds have converged to each other */
@@ -1014,21 +915,16 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_size)
break;
- /* enlarge the window for score data if necessary. If the
- window expands, move its origin to first_b_index */
-
- if (last_b_index + num_extra_cells + 3 >=
- score_array_size + score_array_origin) {
- BlastGapDP *new_array;
- score_array_size = 2 * score_array_size;
- new_array = (BlastGapDP *)malloc(score_array_size *
- sizeof(BlastGapDP));
- memcpy(new_array,
- score_array + (first_b_index - score_array_origin),
- (last_b_index - first_b_index + 1) * sizeof(BlastGapDP));
- sfree(score_array);
- score_array = new_array;
- score_array_origin = first_b_index;
+ /* enlarge the window for score data if necessary */
+
+ if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
+
+ gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ score_array = (BlastGapDP *)realloc(score_array,
+ gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ gap_align->dp_mem = score_array;
}
if (last_b_index < b_size - 1) {
@@ -1045,33 +941,20 @@ Blast_SemiGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
The next inner loop will have larger bounds */
while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
-
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures. */
-
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = score_gap_row;
- score_array[s_index].best_gap = score_gap_row - gap_open_extend;
- score_array[s_index].best_decline = score_gap_row - gap_open -
- decline_penalty;
+ score_array[b_size].best = score_gap_row;
+ score_array[b_size].best_gap = score_gap_row - gap_open_extend;
score_gap_row -= gap_extend;
b_size++;
}
}
if (b_size <= N) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = MININT;
- score_array[s_index].best_gap = MININT;
- score_array[s_index].best_decline = MININT;
+ score_array[b_size].best = MININT;
+ score_array[b_size].best_gap = MININT;
b_size++;
}
}
- sfree(score_array);
return best_score;
}
@@ -1115,9 +998,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
Int4 a_index;
Int4 b_index, b_size, first_b_index, last_b_index;
- BlastGapSmallDP* score_array;
- Int4 score_array_size;
- Int4 score_array_origin;
+ BlastGapDP* score_array;
Int4 gap_open; /* alignment penalty variables */
Int4 gap_extend;
@@ -1190,31 +1071,31 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* allocate storage for the first row of the traceback
array. Because row elements correspond to gaps in A,
- the alignment can only go x_dropoff/gap_extend positions
- at most before failing the X dropoff criterion */
+ the alignment can only go at most x_dropoff/gap_extend
+ positions, in all three frames, before failing the
+ X dropoff criterion */
- if (gap_extend > 0) {
+ if (gap_extend > 0)
num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
- state_struct = s_GapGetState(&gap_align->state_struct,
- num_extra_cells);
- score_array_size = 3 * num_extra_cells;
- }
- else {
- num_extra_cells = 0;
- state_struct = s_GapGetState(&gap_align->state_struct, N + 5);
- score_array_size = N + 5;
+ else
+ num_extra_cells = N + 5;
+
+ if (num_extra_cells > gap_align->dp_mem_alloc) {
+ gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ sfree(gap_align->dp_mem);
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
}
+ state_struct = s_GapGetState(&gap_align->state_struct, num_extra_cells);
+
edit_script[0] = state_struct->state_array;
edit_start_offset[0] = 0;
edit_script_row = state_struct->state_array;
- score_array_size = MAX(CODON_LENGTH * 100, score_array_size);
- score_array_origin = 0;
-
+ score_array = gap_align->dp_mem;
score = -gap_open_extend;
- score_array = (BlastGapSmallDP*)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
score_array[0].best = 0;
score_array[0].best_gap = -gap_open_extend;
@@ -1324,12 +1205,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
while (b_index < b_size) {
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures */
-
- Int4 s_index = b_index - score_array_origin;
-
/* FRAME 0 */
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
@@ -1351,9 +1226,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
}
score += matrix_row[ B[ b_index * increment ] ];
- score_other_frame1 = MAX(score_col1, score_array[s_index].best);
- score_col1 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame1 = MAX(score_col1, score_array[b_index].best);
+ score_col1 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
if (score < MAX(score_gap_col, score_row1)) {
if (score_gap_col > score_row1) {
@@ -1369,12 +1244,12 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row1 -= gap_extend;
}
}
@@ -1383,11 +1258,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -1403,11 +1278,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
score_gap_col -= gap_extend;
if (score < score_gap_col) {
- score_array[s_index].best_gap = score_gap_col;
+ score_array[b_index].best_gap = score_gap_col;
script |= SCRIPT_EXTEND_GAP_A;
}
else {
- score_array[s_index].best_gap = score;
+ score_array[b_index].best_gap = score;
}
}
}
@@ -1423,7 +1298,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* FRAME 1 */
- s_index++;
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
score = MAX(score, score_col2);
if (score == score_col2) {
@@ -1442,9 +1316,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
script = SCRIPT_NEXT_PLUS_TWO_FRAMES;
}
score += matrix_row[ B[ b_index * increment ] ];
- score_other_frame2 = MAX(score_col2, score_array[s_index].best);
- score_col2 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame2 = MAX(score_col2, score_array[b_index].best);
+ score_col2 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
if (score < MAX(score_gap_col, score_row2)) {
score = MAX(score_gap_col, score_row2);
@@ -1452,7 +1326,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
if (score == score_gap_col)
@@ -1461,8 +1335,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row2 -= gap_extend;
}
}
@@ -1471,11 +1345,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -1490,11 +1364,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
score_gap_col -= gap_extend;
if (score < score_gap_col) {
- score_array[s_index].best_gap = score_gap_col;
+ score_array[b_index].best_gap = score_gap_col;
script |= SCRIPT_EXTEND_GAP_A;
}
else {
- score_array[s_index].best_gap = score;
+ score_array[b_index].best_gap = score;
}
}
}
@@ -1510,7 +1384,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* FRAME 2 */
- s_index++;
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
score = MAX(score, score_col3);
if (score == score_col3) {
@@ -1530,9 +1403,9 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
}
score += matrix_row[ B[ b_index * increment ] ];
score_other_frame1 = score_other_frame2;
- score_other_frame2 = MAX(score_col3, score_array[s_index].best);
- score_col3 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame2 = MAX(score_col3, score_array[b_index].best);
+ score_col3 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
if (score < MAX(score_gap_col, score_row3)) {
score = MAX(score_gap_col, score_row3);
@@ -1540,7 +1413,7 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
if (score == score_gap_col)
@@ -1549,8 +1422,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
script = SCRIPT_OOF_OPEN_GAP | SCRIPT_GAP_IN_B;
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row3 -= gap_extend;
}
}
@@ -1559,11 +1432,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -1578,11 +1451,11 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
score_gap_col -= gap_extend;
if (score < score_gap_col) {
- score_array[s_index].best_gap = score_gap_col;
+ score_array[b_index].best_gap = score_gap_col;
script |= SCRIPT_EXTEND_GAP_A;
}
else {
- score_array[s_index].best_gap = score;
+ score_array[b_index].best_gap = score;
}
}
}
@@ -1596,22 +1469,16 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_size)
break;
- /* Enlarge the window for score data, if necessary. If the
- window must be enlarged, move the window origin to
- first_b_index */
-
- if (last_b_index + num_extra_cells + 5 >=
- score_array_size + score_array_origin) {
- BlastGapSmallDP *new_array;
- score_array_size = 2 * score_array_size;
- new_array = (BlastGapSmallDP *)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
- memcpy(new_array,
- score_array + (first_b_index - score_array_origin),
- (last_b_index - first_b_index + 1) * sizeof(BlastGapSmallDP));
- sfree(score_array);
- score_array = new_array;
- score_array_origin = first_b_index;
+ /* Enlarge the window for score data if necessary */
+
+ if (last_b_index + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
+
+ gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ score_array = (BlastGapDP *)realloc(score_array,
+ gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ gap_align->dp_mem = score_array;
}
if (last_b_index < b_size - 1) {
@@ -1634,22 +1501,20 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
score = MAX(score, score_row3);
while (score >= (best_score - x_dropoff) && b_size < N + 1) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = score_row1;
- score_array[s_index].best_gap = score_row1 - gap_open_extend;
+ score_array[b_size].best = score_row1;
+ score_array[b_size].best_gap = score_row1 - gap_open_extend;
score_row1 -= gap_extend;
edit_script_row[b_size] = SCRIPT_OOF_OPEN_GAP |
SCRIPT_GAP_IN_B;
- score_array[s_index+1].best = score_row2;
- score_array[s_index+1].best_gap = score_row2 - gap_open_extend;
+ score_array[b_size+1].best = score_row2;
+ score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
score_row2 -= gap_extend;
edit_script_row[b_size+1] = SCRIPT_OOF_OPEN_GAP |
SCRIPT_GAP_IN_B;
- score_array[s_index+2].best = score_row3;
- score_array[s_index+2].best_gap = score_row3 - gap_open_extend;
+ score_array[b_size+2].best = score_row3;
+ score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
score_row3 -= gap_extend;
edit_script_row[b_size+2] = SCRIPT_OOF_OPEN_GAP |
SCRIPT_GAP_IN_B;
@@ -1668,9 +1533,8 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
last_b_index = MIN(b_size + 4, N + 3);
while (b_size < last_b_index) {
- Int4 s_index = b_size - score_array_origin;
- score_array[s_index].best = MININT;
- score_array[s_index].best_gap = MININT;
+ score_array[b_size].best = MININT;
+ score_array[b_size].best_gap = MININT;
b_size++;
}
}
@@ -1717,7 +1581,6 @@ s_OutOfFrameAlignWithTraceback(Uint1* A, Uint1* B, Int4 M, Int4 N,
sfree(edit_start_offset);
sfree(edit_script);
- sfree(score_array);
if (!reversed)
*b_offset -= 2;
@@ -1758,9 +1621,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
Int4 shift_penalty;
Int4 x_dropoff;
- BlastGapSmallDP* score_array;
- Int4 score_array_size;
- Int4 score_array_origin;
+ BlastGapDP* score_array;
Int4 num_extra_cells;
Int4** matrix = NULL; /* pointers to the score matrix */
@@ -1812,21 +1673,21 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
set of DP computations. The initial window size is determined
by the number of cells needed to fail the x-dropoff test */
- if (gap_extend > 0) {
+ if (gap_extend > 0)
num_extra_cells = CODON_LENGTH * (x_dropoff / gap_extend + 5);
- score_array_size = 3 * num_extra_cells;
- }
- else {
- num_extra_cells = 0;
- score_array_size = N + 5;
+ else
+ num_extra_cells = N + 5;
+
+ if (num_extra_cells > gap_align->dp_mem_alloc) {
+ gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ sfree(gap_align->dp_mem);
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
}
- score_array_size = MAX(CODON_LENGTH*100, score_array_size);
- score_array_origin = 0;
-
+ score_array = gap_align->dp_mem;
score = -gap_open_extend;
- score_array = (BlastGapSmallDP*)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
score_array[0].best = 0;
score_array[0].best_gap = -gap_open_extend;
@@ -1891,21 +1752,15 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
while (b_index < b_size) {
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures */
-
- Int4 s_index = b_index - score_array_origin;
-
/* FRAME 0 */
/* Pick the best score among all frames */
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
score = MAX(score, score_col1) +
matrix_row[ B[ b_index * increment ] ];
- score_other_frame1 = MAX(score_col1, score_array[s_index].best);
- score_col1 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame1 = MAX(score_col1, score_array[b_index].best);
+ score_col1 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
/* Use the row and column scores if they improve
the score overall */
@@ -1927,13 +1782,13 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
/* update the row and column running scores */
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row1 -= gap_extend;
}
}
@@ -1946,7 +1801,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
/* The current best score exceeds the
@@ -1954,7 +1809,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
improve on the current optimal score */
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -1967,7 +1822,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
score -= gap_open_extend;
score_row1 -= gap_extend;
score_row1 = MAX(score, score_row1);
- score_array[s_index].best_gap = MAX(score,
+ score_array[b_index].best_gap = MAX(score,
score_gap_col - gap_extend);
}
}
@@ -1990,13 +1845,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
same as the preceeding code. The only real difference
is the updating of the other_frame best scores */
- s_index++;
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
score = MAX(score, score_col2) +
matrix_row[ B[ b_index * increment ] ];
- score_other_frame2 = MAX(score_col2, score_array[s_index].best);
- score_col2 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame2 = MAX(score_col2, score_array[b_index].best);
+ score_col2 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
if (score < MAX(score_gap_col, score_row2)) {
score = MAX(score_gap_col, score_row2);
@@ -2004,12 +1858,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row2 -= gap_extend;
}
}
@@ -2018,11 +1872,11 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -2031,7 +1885,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
score -= gap_open_extend;
score_row2 -= gap_extend;
score_row2 = MAX(score, score_row2);
- score_array[s_index].best_gap = MAX(score,
+ score_array[b_index].best_gap = MAX(score,
score_gap_col - gap_extend);
}
}
@@ -2046,14 +1900,13 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* FRAME 2 */
- s_index++;
score = MAX(score_other_frame1, score_other_frame2) - shift_penalty;
score = MAX(score, score_col3) +
matrix_row[ B[ b_index * increment ] ];
score_other_frame1 = score_other_frame2;
- score_other_frame2 = MAX(score_col3, score_array[s_index].best);
- score_col3 = score_array[s_index].best;
- score_gap_col = score_array[s_index].best_gap;
+ score_other_frame2 = MAX(score_col3, score_array[b_index].best);
+ score_col3 = score_array[b_index].best;
+ score_gap_col = score_array[b_index].best_gap;
if (score < MAX(score_gap_col, score_row3)) {
score = MAX(score_gap_col, score_row3);
@@ -2061,12 +1914,12 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
- score_array[s_index].best_gap = score_gap_col - gap_extend;
+ score_array[b_index].best = score;
+ score_array[b_index].best_gap = score_gap_col - gap_extend;
score_row3 -= gap_extend;
}
}
@@ -2075,11 +1928,11 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_index)
first_b_index = b_index + 1;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
if (score > best_score) {
best_score = score;
*a_offset = a_index;
@@ -2088,7 +1941,7 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
score -= gap_open_extend;
score_row3 -= gap_extend;
score_row3 = MAX(score, score_row3);
- score_array[s_index].best_gap = MAX(score,
+ score_array[b_index].best_gap = MAX(score,
score_gap_col - gap_extend);
}
}
@@ -2102,22 +1955,16 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
if (first_b_index == b_size)
break;
- /* Enlarge the window for score data, if necessary. If the
- window must be enlarged, move the window origin to
- first_b_index */
-
- if (b_size + num_extra_cells + 5 >=
- score_array_size + score_array_origin) {
- BlastGapSmallDP *new_array;
- score_array_size = 2 * score_array_size;
- new_array = (BlastGapSmallDP *)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
- memcpy(new_array,
- score_array + (first_b_index - score_array_origin),
- (b_size - first_b_index + 1) * sizeof(BlastGapSmallDP));
- sfree(score_array);
- score_array = new_array;
- score_array_origin = first_b_index;
+ /* Enlarge the window for score data, if necessary */
+
+ if (b_size + num_extra_cells + 5 >= gap_align->dp_mem_alloc) {
+
+ gap_align->dp_mem_alloc = MAX(b_size + num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ score_array = (BlastGapDP *)realloc(score_array,
+ gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ gap_align->dp_mem = score_array;
}
if (last_b_index < b_size - 1) {
@@ -2140,18 +1987,16 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
score = MAX(score, score_row3);
while (score >= (best_score - x_dropoff) && b_size < N + 1) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = score_row1;
- score_array[s_index].best_gap = score_row1 - gap_open_extend;
+ score_array[b_size].best = score_row1;
+ score_array[b_size].best_gap = score_row1 - gap_open_extend;
score_row1 -= gap_extend;
- score_array[s_index+1].best = score_row2;
- score_array[s_index+1].best_gap = score_row2 - gap_open_extend;
+ score_array[b_size+1].best = score_row2;
+ score_array[b_size+1].best_gap = score_row2 - gap_open_extend;
score_row2 -= gap_extend;
- score_array[s_index+2].best = score_row3;
- score_array[s_index+2].best_gap = score_row3 - gap_open_extend;
+ score_array[b_size+2].best = score_row3;
+ score_array[b_size+2].best_gap = score_row3 - gap_open_extend;
score_row3 -= gap_extend;
b_size += 3;
@@ -2163,9 +2008,8 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
b_size = MIN(b_size, N + 1);
last_b_index = MIN(b_size + 4, N + 3);
while (b_size < last_b_index) {
- Int4 s_index = b_size - score_array_origin;
- score_array[s_index].best = MININT;
- score_array[s_index].best_gap = MININT;
+ score_array[b_size].best = MININT;
+ score_array[b_size].best_gap = MININT;
b_size++;
}
}
@@ -2174,7 +2018,6 @@ s_OutOfFrameGappedAlign(Uint1* A, Uint1* B, Int4 M, Int4 N,
/* The sequence was shifted, so length should be adjusted as well */
*b_offset -= 2;
}
- sfree(score_array);
return best_score;
}
@@ -2620,9 +2463,7 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
Int4 b_index, b_size, first_b_index, last_b_index, b_increment;
Uint1* b_ptr;
- BlastGapSmallDP* score_array;
- Int4 score_array_size;
- Int4 score_array_origin;
+ BlastGapDP* score_array;
Int4 num_extra_cells;
Int4 gap_open; /* alignment penalty variables */
@@ -2666,21 +2507,21 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
set of DP computations. The initial window size is determined
by the number of cells needed to fail the x-dropoff test */
- if (gap_extend > 0) {
+ if (gap_extend > 0)
num_extra_cells = x_dropoff / gap_extend + 3;
- score_array_size = 2 * num_extra_cells;
- }
- else {
- num_extra_cells = 0;
- score_array_size = N + 3;
+ else
+ num_extra_cells = N + 3;
+
+ if (num_extra_cells > gap_align->dp_mem_alloc) {
+ gap_align->dp_mem_alloc = MAX(num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ sfree(gap_align->dp_mem);
+ gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
}
+ score_array = gap_align->dp_mem;
score = -gap_open_extend;
- score_array_size = MAX(100, score_array_size);
- score_array_origin = 0;
-
- score_array = (BlastGapSmallDP*)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
score = -gap_open_extend;
score_array[0].best = 0;
score_array[0].best_gap = -gap_open_extend;
@@ -2733,15 +2574,9 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
for (b_index = first_b_index; b_index < b_size; b_index++) {
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures */
-
- Int4 s_index = b_index - score_array_origin;
-
b_ptr += b_increment;
- score_gap_col = score_array[s_index].best_gap;
- next_score = score_array[s_index].best + matrix_row[ *b_ptr ];
+ score_gap_col = score_array[b_index].best_gap;
+ next_score = score_array[b_index].best + matrix_row[ *b_ptr ];
if (score < score_gap_col)
score = score_gap_col;
@@ -2764,7 +2599,7 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
if (b_index == first_b_index)
first_b_index++;
else
- score_array[s_index].best = MININT;
+ score_array[b_index].best = MININT;
}
else {
last_b_index = b_index;
@@ -2775,16 +2610,16 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
}
/* If starting a gap at this position will improve
- the best row, or column, score, update them to
+ the best row or column score, update them to
reflect that. */
score_gap_row -= gap_extend;
score_gap_col -= gap_extend;
- score_array[s_index].best_gap = MAX(score - gap_open_extend,
+ score_array[b_index].best_gap = MAX(score - gap_open_extend,
score_gap_col);
score_gap_row = MAX(score - gap_open_extend, score_gap_row);
- score_array[s_index].best = score;
+ score_array[b_index].best = score;
}
score = next_score;
@@ -2797,18 +2632,14 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
if (first_b_index == b_size)
break;
- if (last_b_index + num_extra_cells + 3 >=
- score_array_size + score_array_origin) {
- BlastGapSmallDP *new_array;
- score_array_size = 2 * score_array_size;
- new_array = (BlastGapSmallDP *)malloc(score_array_size *
- sizeof(BlastGapSmallDP));
- memcpy(new_array,
- score_array + (first_b_index - score_array_origin),
- (last_b_index - first_b_index + 1) * sizeof(BlastGapSmallDP));
- sfree(score_array);
- score_array = new_array;
- score_array_origin = first_b_index;
+ if (last_b_index + num_extra_cells + 3 >= gap_align->dp_mem_alloc) {
+
+ gap_align->dp_mem_alloc = MAX(last_b_index + num_extra_cells + 100,
+ 2 * gap_align->dp_mem_alloc);
+ score_array = (BlastGapDP *)realloc(score_array,
+ gap_align->dp_mem_alloc *
+ sizeof(BlastGapDP));
+ gap_align->dp_mem = score_array;
}
if (last_b_index < b_size - 1) {
@@ -2825,30 +2656,20 @@ s_BlastAlignPackedNucl(Uint1* B, Uint1* A, Int4 N, Int4 M,
The next inner loop will have larger bounds */
while (score_gap_row >= (best_score - x_dropoff) && b_size <= N) {
-
- /* convert the current B offset into an offset
- suitable for the current array of auxiliary
- structures. */
-
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = score_gap_row;
- score_array[s_index].best_gap = score_gap_row - gap_open_extend;
+ score_array[b_size].best = score_gap_row;
+ score_array[b_size].best_gap = score_gap_row - gap_open_extend;
score_gap_row -= gap_extend;
b_size++;
}
}
if (b_size <= N) {
- Int4 s_index = b_size - score_array_origin;
-
- score_array[s_index].best = MININT;
- score_array[s_index].best_gap = MININT;
+ score_array[b_size].best = MININT;
+ score_array[b_size].best_gap = MININT;
b_size++;
}
}
- sfree(score_array);
return best_score;
}
diff --git a/algo/blast/core/blast_gapalign.h b/algo/blast/core/blast_gapalign.h
index 49525522..9b3767f6 100644
--- a/algo/blast/core/blast_gapalign.h
+++ b/algo/blast/core/blast_gapalign.h
@@ -1,4 +1,4 @@
-/* $Id: blast_gapalign.h,v 1.62 2005/04/27 19:47:57 dondosha Exp $
+/* $Id: blast_gapalign.h,v 1.63 2005/11/30 18:30:00 papadopo Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -53,6 +53,15 @@ extern "C" {
/** Split subject sequences if longer than this */
#define MAX_DBSEQ_LEN 5000000
+/** Auxiliary structure for dynamic programming gapped extension */
+typedef struct {
+ Int4 best; /**< score of best path that ends in a match
+ at this position */
+ Int4 best_gap; /**< score of best path that ends in a gap
+ at this position */
+} BlastGapDP;
+
+
/** Structure supporting the gapped alignment */
typedef struct BlastGapAlignStruct {
Boolean positionBased; /**< Is this PSI-BLAST? */
@@ -63,6 +72,8 @@ typedef struct BlastGapAlignStruct {
GapPrelimEditBlock *rev_prelim_tback; /**< traceback from right extensions */
SGreedyAlignMem* greedy_align_mem;/**< Preallocated memory for the greedy
gapped extension */
+ BlastGapDP* dp_mem; /**< scratch structures for dynamic programming */
+ Int4 dp_mem_alloc; /**< current number of structures allocated */
BlastScoreBlk* sbp; /**< Pointer to the scoring information block */
Int4 gap_x_dropoff; /**< X-dropoff parameter to use */
Int4 query_start; /**< query start offset of current alignment */
diff --git a/algo/blast/core/blast_gapalign_priv.h b/algo/blast/core/blast_gapalign_priv.h
index 83f628d6..e5975689 100644
--- a/algo/blast/core/blast_gapalign_priv.h
+++ b/algo/blast/core/blast_gapalign_priv.h
@@ -1,7 +1,7 @@
#ifndef ALGO_BLAST_CORE___BLAST_GAPALIGN_PRI__H
#define ALGO_BLAST_CORE___BLAST_GAPALIGN_PRI__H
-/* $Id: blast_gapalign_priv.h,v 1.11 2005/05/02 13:07:34 madden Exp $
+/* $Id: blast_gapalign_priv.h,v 1.12 2005/11/30 18:25:03 papadopo Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -39,26 +39,6 @@
extern "C" {
#endif
-/** Auxiliary structure for dynamic programming gapped extension */
-typedef struct BlastGapDP {
- Int4 best; /**< score of best path that ends in a match
- at this position */
- Int4 best_gap; /**< score of best path that ends in a gap
- at this position */
- Int4 best_decline; /**< score of best path that ends in a decline
- at this position */
-} BlastGapDP;
-
-/** Reduced version of BlastGapDP, for alignments that
- * don't use a decline penalty
- */
-typedef struct {
- Int4 best; /**< score of best path that ends in a match
- at this position */
- Int4 best_gap; /**< score of best path that ends in a gap
- at this position */
-} BlastGapSmallDP;
-
Int4
ALIGN_EX(Uint1* A, Uint1* B, Int4 M, Int4 N, Int4* a_offset,
Int4* b_offset, GapPrelimEditBlock *edit_block,
@@ -159,6 +139,9 @@ void RPSPsiMatrixDetach(BlastScoreBlk* sbp);
* ===========================================================================
*
* $Log: blast_gapalign_priv.h,v $
+ * Revision 1.12 2005/11/30 18:25:03 papadopo
+ * move BlastGapDP, remove BlastGapSmallDP
+ *
* Revision 1.11 2005/05/02 13:07:34 madden
* Remove Blast_CheckHSPsForCommonEndpoints
*
diff --git a/algo/blast/core/blast_hits.c b/algo/blast/core/blast_hits.c
index 9480e947..9a72ee04 100644
--- a/algo/blast/core/blast_hits.c
+++ b/algo/blast/core/blast_hits.c
@@ -1,4 +1,4 @@
-/* $Id: blast_hits.c,v 1.169 2005/08/15 16:11:20 dondosha Exp $
+/* $Id: blast_hits.c,v 1.173 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -33,7 +32,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_hits.c,v 1.169 2005/08/15 16:11:20 dondosha Exp $";
+ "$Id: blast_hits.c,v 1.173 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_options.h>
@@ -1599,6 +1598,7 @@ Int2 Blast_HSPListGetEvalues(const BlastQueryInfo* query_info,
ASSERT(hsp != NULL);
ASSERT(scaling_factor != 0.0);
+ ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
/* Divide Lambda by the scaling factor, so e-value is
calculated correctly from a scaled score. This is needed only
@@ -1638,6 +1638,7 @@ Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list,
for (index=0; index<hsp_list->hspcnt; index++) {
hsp = hsp_list->hsp_array[index];
ASSERT(hsp != NULL);
+ ASSERT(sbp->round_down == FALSE || (hsp->score & 1) == 0);
hsp->bit_score =
(hsp->score*kbp[hsp->context]->Lambda - kbp[hsp->context]->logK) /
NCBIMATH_LN2;
@@ -2189,6 +2190,8 @@ Blast_HSPListReevaluateWithAmbiguities(EBlastProgramType program,
/* Sort the HSP array by score (scores may have changed!) */
Blast_HSPListSortByScore(hsp_list);
+ Blast_HSPListAdjustOddBlastnScores(hsp_list, gapped, sbp);
+
return status;
}
@@ -2440,12 +2443,18 @@ void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset)
}
}
-void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list)
+void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list, Boolean gapped_calculation, BlastScoreBlk* sbp)
{
int index;
if (!hsp_list || hsp_list->hspcnt == 0)
return;
+
+ if (gapped_calculation == FALSE)
+ return;
+
+ if (sbp->round_down == FALSE)
+ return;
for (index = 0; index < hsp_list->hspcnt; ++index) {
hsp_list->hsp_array[index]->score -=
@@ -2985,9 +2994,6 @@ Blast_HSPResultsSaveRPSHSPList(EBlastProgramType program,
Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* results,
BlastHSPList* hsp_list, const SBlastHitsParameters* blasthit_params)
{
- Int2 status = 0;
- BlastHSP* hsp;
-
if (!hsp_list)
return 0;
@@ -3002,46 +3008,59 @@ Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* res
/* Rearrange HSPs into multiple hit lists if more than one query */
if (results->num_queries > 1) {
+ BlastHSP* hsp;
BlastHSPList** hsp_list_array;
BlastHSPList* tmp_hsp_list;
Int4 index;
hsp_list_array = calloc(results->num_queries, sizeof(BlastHSPList*));
+ if (hsp_list_array == NULL)
+ return -1;
for (index = 0; index < hsp_list->hspcnt; index++) {
+ Boolean can_insert = TRUE;
Int4 query_index;
hsp = hsp_list->hsp_array[index];
query_index = Blast_GetQueryIndexFromContext(hsp->context, program);
- tmp_hsp_list = hsp_list_array[query_index];
- if (!tmp_hsp_list) {
+ if (!(tmp_hsp_list = hsp_list_array[query_index])) {
hsp_list_array[query_index] = tmp_hsp_list =
Blast_HSPListNew(blasthit_params->options->hsp_num_max);
+ if (tmp_hsp_list == NULL)
+ {
+ sfree(hsp_list_array);
+ return -1;
+ }
tmp_hsp_list->oid = hsp_list->oid;
}
- if (!tmp_hsp_list || tmp_hsp_list->do_not_reallocate) {
- tmp_hsp_list = NULL;
- } else if (tmp_hsp_list->hspcnt >= tmp_hsp_list->allocated) {
- BlastHSP** new_hsp_array;
- Int4 new_size =
- MIN(2*tmp_hsp_list->allocated, tmp_hsp_list->hsp_max);
- if (new_size == tmp_hsp_list->hsp_max)
- tmp_hsp_list->do_not_reallocate = TRUE;
+ if (tmp_hsp_list->hspcnt >= tmp_hsp_list->allocated) {
+ if (tmp_hsp_list->do_not_reallocate == FALSE) {
+ BlastHSP** new_hsp_array;
+ Int4 new_size =
+ MIN(2*tmp_hsp_list->allocated, tmp_hsp_list->hsp_max);
+ if (new_size == tmp_hsp_list->hsp_max)
+ tmp_hsp_list->do_not_reallocate = TRUE;
- new_hsp_array = realloc(tmp_hsp_list->hsp_array,
+ new_hsp_array = realloc(tmp_hsp_list->hsp_array,
new_size*sizeof(BlastHSP*));
- if (!new_hsp_array) {
- tmp_hsp_list->do_not_reallocate = TRUE;
- tmp_hsp_list = NULL;
- } else {
- tmp_hsp_list->hsp_array = new_hsp_array;
- tmp_hsp_list->allocated = new_size;
+ if (!new_hsp_array) {
+ tmp_hsp_list->do_not_reallocate = TRUE;
+ can_insert = FALSE;
+ } else {
+ tmp_hsp_list->hsp_array = new_hsp_array;
+ tmp_hsp_list->allocated = new_size;
+ }
+ }
+ else
+ {
+ can_insert = FALSE;
}
}
- if (tmp_hsp_list) {
+ if (can_insert) {
tmp_hsp_list->hsp_array[tmp_hsp_list->hspcnt++] = hsp;
} else {
+ /* FIXME: what if this is not the least significant HSP?? */
/* Cannot add more HSPs; free the memory */
hsp_list->hsp_array[index] = Blast_HSPFree(hsp);
}
@@ -3077,7 +3096,7 @@ Int2 Blast_HSPResultsSaveHSPList(EBlastProgramType program, BlastHSPResults* res
Blast_HSPListFree(hsp_list);
}
- return status;
+ return 0;
}
Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results,
diff --git a/algo/blast/core/blast_hits.h b/algo/blast/core/blast_hits.h
index d771faf3..f458e6fe 100644
--- a/algo/blast/core/blast_hits.h
+++ b/algo/blast/core/blast_hits.h
@@ -1,4 +1,4 @@
-/* $Id: blast_hits.h,v 1.83 2005/08/15 16:09:58 dondosha Exp $
+/* $Id: blast_hits.h,v 1.84 2005/09/27 14:42:20 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -521,10 +521,12 @@ void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset);
* random alignments are dominated by runs of exact matches, which all have even
* scores. This makes it impossible to estimate statistical parameters correctly
* for odd scores. Hence the raw score formula is adjusted - all scores are
- * rounded down to the nearest even value.
+ * rounded down to the nearest even value in order to provide a conservative estimate.
* @param hsp_list HSP list structure to adjust scores for. [in] [out]
+ * @param gapped_calculation TRUE for a gapped search; scores are left unchanged when FALSE [in]
+ * @param sbp scoring block; scores are left unchanged when its round_down field is FALSE [in]
*/
-void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list);
+void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list, Boolean gapped_calculation, BlastScoreBlk* sbp);
/** Check if HSP list is sorted by score.
* @param hsp_list The list to check [in]
diff --git a/algo/blast/core/blast_inline.h b/algo/blast/core/blast_inline.h
index d3ad23c6..000fc097 100644
--- a/algo/blast/core/blast_inline.h
+++ b/algo/blast/core/blast_inline.h
@@ -1,4 +1,4 @@
-/* $Id: blast_inline.h,v 1.8 2005/06/23 16:18:46 camacho Exp $
+/* $Id: blast_inline.h,v 1.9 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_itree.c b/algo/blast/core/blast_itree.c
index e43e3866..ab0745fa 100644
--- a/algo/blast/core/blast_itree.c
+++ b/algo/blast/core/blast_itree.c
@@ -1,4 +1,4 @@
-/* $Id: blast_itree.c,v 1.10 2005/04/27 14:52:08 papadopo Exp $
+/* $Id: blast_itree.c,v 1.11 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -33,7 +33,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_itree.c,v 1.10 2005/04/27 14:52:08 papadopo Exp $";
+ "$Id: blast_itree.c,v 1.11 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include "blast_itree.h"
diff --git a/algo/blast/core/blast_itree.h b/algo/blast/core/blast_itree.h
index 9e19fcf1..1bec4856 100644
--- a/algo/blast/core/blast_itree.h
+++ b/algo/blast/core/blast_itree.h
@@ -1,4 +1,4 @@
-/* $Id: blast_itree.h,v 1.4 2005/04/27 14:52:08 papadopo Exp $
+/* $Id: blast_itree.h,v 1.5 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_kappa.c b/algo/blast/core/blast_kappa.c
index dddc0313..2af2194e 100644
--- a/algo/blast/core/blast_kappa.c
+++ b/algo/blast/core/blast_kappa.c
@@ -1,4 +1,4 @@
-/* $Id: blast_kappa.c,v 1.59 2005/07/21 13:51:19 camacho Exp $
+/* $Id: blast_kappa.c,v 1.62 2005/12/02 17:16:51 madden Exp $
* ==========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -28,15 +28,16 @@
*/
/** @file blast_kappa.c
- * Utilities for doing Smith-Waterman alignments and adjusting the scoring
+ * Utilities for doing Smith-Waterman alignments and adjusting the scoring
* system for each match in blastpgp
*/
#ifndef SKIP_DOXYGEN_PROCESSING
-static char const rcsid[] =
- "$Id: blast_kappa.c,v 1.59 2005/07/21 13:51:19 camacho Exp $";
+static char const rcsid[] =
+"$Id: blast_kappa.c,v 1.62 2005/12/02 17:16:51 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
+#include <float.h>
#include <algo/blast/core/blast_def.h>
#include <algo/blast/core/blast_hits.h>
#include <algo/blast/core/blast_stat.h>
@@ -52,13 +53,12 @@ static char const rcsid[] =
#include "blast_posit.h"
#include "blast_hits_priv.h"
-/** by what factor might initially reported E-value exceed true Evalue */
-#define EVALUE_STRETCH 5
-
-/** For translated subject sequences, the number of amino acids to
- include before and after the existing aligned segment when
- generating a composition-based scoring system. */
-#define KAPPA_WINDOW_BORDER 200
+#include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
+#include <algo/blast/composition_adjustment/composition_constants.h>
+#include <algo/blast/composition_adjustment/composition_adjustment.h>
+#include <algo/blast/composition_adjustment/compo_heap.h>
+#include <algo/blast/composition_adjustment/smith_waterman.h>
+#include <algo/blast/composition_adjustment/redo_alignment.h>
/**
* Scale the scores in an HSP list and reset the bit scores.
@@ -70,20 +70,21 @@ static char const rcsid[] =
* @todo rename to something which is more intention revealing, merge with
* function of the same name in blast_traceback.c
*/
+/* WHY */
static void
s_HSPListRescaleScores(BlastHSPList * hsp_list,
- double lambda,
- double logK,
- double scoreDivisor)
+ double lambda,
+ double logK,
+ double scoreDivisor)
{
- int hsp_index;
- for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
- BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
-
- hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
- /* Compute the bit score using the newly computed scaled score. */
- hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
- }
+ int hsp_index;
+ for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
+ BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
+
+ hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
+ /* Compute the bit score using the newly computed scaled score. */
+ hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
+ }
}
/**
@@ -97,121 +98,78 @@ s_HSPListRescaleScores(BlastHSPList * hsp_list,
* @param hspcnt length of hsp_array
*/
static void
-HitlistReapContained(
- BlastHSP * hsp_array[],
- Int4 * hspcnt)
+s_HitlistReapContained(
+ BlastHSP * hsp_array[],
+ Int4 * hspcnt)
{
- Int4 iread; /* iteration index used to read the hitlist */
- Int4 iwrite; /* iteration index used to write to the hitlist */
- Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */
-
- old_hspcnt = *hspcnt;
-
- for( iread = 1; iread < *hspcnt; iread++ ) {
- /* for all HSPs in the hitlist */
- Int4 ireadBack; /* iterator over indices less than iread */
- BlastHSP *hsp1; /* an HSP that is a candidate for deletion */
-
- hsp1 = hsp_array[iread];
- for( ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++ ) {
- /* for all HSPs before hsp1 in the hitlist and while hsp1 has not
- been deleted */
- BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array
- * than hsp1 */
- hsp2 = hsp_array[ireadBack];
-
- if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */
- continue;
- }
- if(SIGN(hsp2->query.frame) == SIGN(hsp1->query.frame) &&
- SIGN(hsp2->subject.frame) == SIGN(hsp1->subject.frame)) {
- /* hsp1 and hsp2 are in the same query/subject frame. */
- if(CONTAINED_IN_HSP
- (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
- hsp2->subject.offset, hsp2->subject.end,
- hsp1->subject.offset) &&
- CONTAINED_IN_HSP
- (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
- hsp2->subject.offset, hsp2->subject.end,
- hsp1->subject.end) &&
- hsp1->score <= hsp2->score) {
- hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
- }
- } /* end if hsp1 and hsp2 are in the same query/subject frame */
- } /* end for all HSPs before hsp1 in the hitlist */
- } /* end for all HSPs in the hitlist */
-
- /* Condense the hsp_array, removing any NULL items. */
- iwrite = 0;
- for( iread = 0; iread < *hspcnt; iread++ ) {
- if( hsp_array[iread] != NULL ) {
- hsp_array[iwrite++] = hsp_array[iread];
- }
- }
- *hspcnt = iwrite;
- /* Fill the remaining memory in hsp_array with NULL pointers. */
- for( ; iwrite < old_hspcnt; iwrite++ ) {
- hsp_array[iwrite] = NULL;
- }
+ Int4 iread; /* iteration index used to read the hitlist */
+ Int4 iwrite; /* iteration index used to write to the hitlist */
+ Int4 old_hspcnt; /* number of HSPs in the hitlist on entry */
+
+ old_hspcnt = *hspcnt;
+
+ for (iread = 1; iread < *hspcnt; iread++) {
+ /* for all HSPs in the hitlist */
+ Int4 ireadBack; /* iterator over indices less than iread */
+ BlastHSP *hsp1; /* an HSP that is a candidate for deletion */
+
+ hsp1 = hsp_array[iread];
+ for (ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++) {
+ /* for all HSPs before hsp1 in the hitlist and while hsp1 has not
+ been deleted */
+ BlastHSP *hsp2; /* an HSP that occurs earlier in hsp_array
+ * than hsp1 */
+ hsp2 = hsp_array[ireadBack];
+
+ if( hsp2 == NULL ) { /* hsp2 was deleted in a prior iteration. */
+ continue;
+ }
+ if (SIGN(hsp2->query.frame) == SIGN(hsp1->query.frame) &&
+ SIGN(hsp2->subject.frame) == SIGN(hsp1->subject.frame)) {
+ /* hsp1 and hsp2 are in the same query/subject frame. */
+ if (CONTAINED_IN_HSP
+ (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
+ hsp2->subject.offset, hsp2->subject.end,
+ hsp1->subject.offset) &&
+ CONTAINED_IN_HSP
+ (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
+ hsp2->subject.offset, hsp2->subject.end,
+ hsp1->subject.end) &&
+ hsp1->score <= hsp2->score) {
+ hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
+ }
+ } /* end if hsp1 and hsp2 are in the same query/subject frame */
+ } /* end for all HSPs before hsp1 in the hitlist */
+ } /* end for all HSPs in the hitlist */
+
+ /* Condense the hsp_array, removing any NULL items. */
+ iwrite = 0;
+ for (iread = 0; iread < *hspcnt; iread++) {
+ if (hsp_array[iread] != NULL) {
+ hsp_array[iwrite++] = hsp_array[iread];
+ }
+ }
+ *hspcnt = iwrite;
+ /* Fill the remaining memory in hsp_array with NULL pointers. */
+ for ( ; iwrite < old_hspcnt; iwrite++) {
+ hsp_array[iwrite] = NULL;
+ }
}
-/**
- * An object of type Kappa_DistinctAlignment represents a distinct
- * alignment of the query sequence to the current subject sequence.
- * These objects are typically part of a singly linked list of
- * distinct alignments, stored in the reverse of the order in which
- * they were computed.
- */
-typedef struct Kappa_DistinctAlignment {
- Int4 score; /**< the score of this alignment */
- Int4 queryStart; /**< the start of the alignment in the query */
- Int4 queryEnd; /**< one past the end of the alignment in the query */
- Int4 matchStart; /**< the start of the alignment in the subject */
- Int4 matchEnd; /**< one past the end of the alignment in the
- subject */
- Int4 frame; /**< the subject frame */
- GapEditScript * editScript; /**< the alignment info for a gapped
- alignment */
- struct Kappa_DistinctAlignment * next; /**< the next alignment in the
- list */
-} Kappa_DistinctAlignment;
-
-
-/**
- * Recursively free all alignments in the singly linked list whose
- * head is *palign. Set *palign to NULL.
- *
- * @param palign pointer to the head of a singly linked list
- * of alignments.
- */
-static void
-Kappa_DistinctAlignmentsFree(Kappa_DistinctAlignment ** palign)
+static void s_FreeEditScript(void * edit_script)
{
- Kappa_DistinctAlignment * align; /* represents the current
- alignment in loops */
- align = *palign; *palign = NULL;
- while(align != NULL) {
- /* Save the value of align->next, because align is to be deleted. */
- Kappa_DistinctAlignment * align_next = align->next;
- align_next = align->next;
-
- if(align->editScript) {
- GapEditScriptDelete(align->editScript);
- }
- sfree(align);
-
- align = align_next;
- }
+ if (edit_script != NULL)
+ GapEditScriptDelete(edit_script);
}
/**
- * Converts a list of objects of type Kappa_DistinctAlignment to an
+ * Converts a list of objects of type BlastCompo_Alignment to a
* new object of type BlastHSPList and returns the result. Conversion
* in this direction is lossless. The list passed to this routine is
* freed to ensure that there is no aliasing of fields between the
- * list of Kappa_DistinctAlignments and the new hitlist.
+ * list of BlastCompo_Alignments and the new hitlist.
*
* @param alignments A list of distinct alignments; freed before return [in]
* @param oid Ordinal id of a database sequence [in]
@@ -219,1454 +177,97 @@ Kappa_DistinctAlignmentsFree(Kappa_DistinctAlignment ** palign)
*/
static BlastHSPList *
s_HSPListFromDistinctAlignments(
- Kappa_DistinctAlignment ** alignments,
- int oid)
-{
- const int unknown_value = 0;
- BlastHSPList * hsp_list = Blast_HSPListNew(0);
- Kappa_DistinctAlignment * align;
-
- hsp_list->oid = oid;
-
- for(align = *alignments; NULL != align; align = align->next) {
- BlastHSP * new_hsp = NULL;
-
- Blast_HSPInit(align->queryStart, align->queryEnd,
- align->matchStart, align->matchEnd,
- unknown_value, unknown_value,
- 0, 0, align->frame, align->score,
- &align->editScript, &new_hsp);
-
- /* At this point, the subject and possibly the query sequence have
- * been filtered; since it is not clear that num_ident of the
- * filtered sequences, rather than the original, is desired,
- * explictly leave num_ident blank. */
- new_hsp->num_ident = 0;
-
- Blast_HSPListSaveHSP(hsp_list, new_hsp);
- }
- Kappa_DistinctAlignmentsFree(alignments);
- Blast_HSPListSortByScore(hsp_list);
-
- return hsp_list;
-}
-
-
-/**
- * Given a list of alignments and a new alignment, create a new list
- * of alignments that conditionally includes the new alignment.
- *
- * If there is an equal or higher-scoring alignment in the preexisting
- * list of alignments that shares an endpoint with the new alignment,
- * then preexisting list is returned. Otherwise, a new list is
- * returned with the new alignment as its head and the elements of
- * preexisting list that do not share an endpoint with the new
- * alignment as its tail. The order of elements is preserved.
- *
- * Typically, a list of alignments is built one alignment at a time
- * through a call to withDistinctEnds. All alignments in the resulting
- * list have distinct endpoints. Which items are retained in the list
- * depends on the order in which they were added.
- *
- * Note that an endpoint is a triple, specifying a frame, a location
- * in the query and a location in the subject. In other words,
- * alignments that are not in the same frame never share endpoints.
- *
- * @param p_newAlign on input the alignment that may be added to
- * the list; on output NULL
- * @param p_oldAlignments on input the existing list of alignments;
- * on output the new list
- */
-static void
-withDistinctEnds(
- Kappa_DistinctAlignment **p_newAlign,
- Kappa_DistinctAlignment **p_oldAlignments)
-{
- /* Deference the input parameters. */
- Kappa_DistinctAlignment * newAlign = *p_newAlign;
- Kappa_DistinctAlignment * oldAlignments = *p_oldAlignments;
- Kappa_DistinctAlignment * align; /* represents the current
- alignment in loops */
- Boolean include_new_align; /* true if the new alignment
- may be added to the list */
- *p_newAlign = NULL;
- include_new_align = 1;
-
- for(align = oldAlignments; align != NULL; align = align->next) {
- if(align->frame == newAlign->frame &&
- ( ( align->queryStart == newAlign->queryStart
- && align->matchStart == newAlign->matchStart)
- || ( align->queryEnd == newAlign->queryEnd
- && align->matchEnd == newAlign->matchEnd))) {
- /* At least one of the endpoints of newAlign matches an endpoint
- of align. */
- if( newAlign->score <= align->score ) {
- /* newAlign cannot be added to the list. */
- include_new_align = 0;
- break;
- }
- }
- }
-
- if(include_new_align) {
- Kappa_DistinctAlignment **tail; /* tail of the list being created */
-
- tail = &newAlign->next;
- align = oldAlignments;
- while(align != NULL) {
- /* Save align->next because align may be deleted. */
- Kappa_DistinctAlignment * align_next = align->next;
- align->next = NULL;
- if(align->frame == newAlign->frame &&
- ( ( align->queryStart == newAlign->queryStart
- && align->matchStart == newAlign->matchStart)
- || ( align->queryEnd == newAlign->queryEnd
- && align->matchEnd == newAlign->matchEnd))) {
- /* The alignment shares an end with newAlign; */
- /* delete the alignment. */
- Kappa_DistinctAlignmentsFree(&align);
- } else { /* The alignment does not share an end with newAlign; */
- /* add it to the output list. */
- *tail = align;
- tail = &align->next;
- }
- align = align_next;
- } /* end while align != NULL */
- *p_oldAlignments = newAlign;
- } else { /* do not include_new_align */
- Kappa_DistinctAlignmentsFree(&newAlign);
- } /* end else do not include newAlign */
-}
-
-
-/**
- * The number of bits by which the score of a previously computed
- * alignment must exceed the score of the HSP under consideration for
- * a containment relationship to be reported by the isAlreadyContained
- * routine. */
-#define KAPPA_BIT_TOL 2
-
-
-/**
- * Return true if the HSP is already contained in a
- * previously-computed alignment of sufficiently high score.
- *
- * @param hsp HSP to be tested
- * @param alignments list of alignments
- * @param lambda Karlin-Altschul statistical parameter
- * @param localScalingFactor factor by which scores were scaled to
- * obtain higher precision
- */
-
-static Boolean
-isAlreadyContained(
- BlastHSP * hsp,
- Kappa_DistinctAlignment * alignments,
- double lambda,
- double localScalingFactor)
-{
- Kappa_DistinctAlignment * align; /* represents the current alignment
- in the main loop */
- double scoreTol; /* the amount by which the score of the current
- alignment must exceed the score of the HSP for a
- containment relationship to be reported. */
- scoreTol = KAPPA_BIT_TOL * NCBIMATH_LN2/lambda;
-
- for( align = alignments; align != NULL; align = align->next ) {
- /* for all elements of alignments */
- if(SIGN(hsp->query.frame) == SIGN(align->frame)) {
- /* hsp1 and hsp2 are in the same query/subject frame */
- if(CONTAINED_IN_HSP
- (align->queryStart, align->queryEnd, hsp->query.offset,
- align->matchStart, align->matchEnd, hsp->subject.offset) &&
- CONTAINED_IN_HSP
- (align->queryStart, align->queryEnd, hsp->query.end,
- align->matchStart, align->matchEnd, hsp->subject.end) &&
- hsp->score * localScalingFactor + scoreTol <= align->score) {
- return 1;
- }
- } /* hsp1 and hsp2 are in the same query/subject frame */
- } /* end for all items in alignments */
-
- return 0;
-}
-
-
-/**
- * The struct SWheapRecord data type is used below to define the
- * internal structure of a SWheap (see below). A SWheapRecord
- * represents all alignments of a query sequence to a particular
- * matching sequence.
- */
-typedef struct SWheapRecord {
- double bestEvalue; /**< best (smallest) evalue of all alignments
- in the record */
- Int4 bestScore; /**< best (largest) score; used to break
- ties between records with the same
- e-value */
- Int4 subject_index; /**< index of the subject sequence in
- the database */
- BlastHSPList * theseAlignments; /**< a list of alignments */
-} SWheapRecord;
-
-
-/** Compare two records in the heap. */
-static Boolean
-SWheapRecordCompare(SWheapRecord * place1,
- SWheapRecord * place2)
-{
- int result;
- if(0 == (result = BLAST_CMP(place1->bestEvalue, place2->bestEvalue)) &&
- 0 == (result = BLAST_CMP(place2->bestScore, place1->bestScore))) {
- result = BLAST_CMP(place1->subject_index, place2->subject_index);
- }
- return result > 0;
-}
-
-
-/** swap two records in the heap*/
-static void
-SWheapRecordSwap(SWheapRecord * record1,
- SWheapRecord * record2)
-{
- /* bestEvalue, bestScore, theseAlignments and subject_index are temporary
- * variables used to perform the swap. */
- double bestEvalue;
- Int4 bestScore, subject_index;
- BlastHSPList * theseAlignments;
-
- bestEvalue = record1->bestEvalue;
- record1->bestEvalue = record2->bestEvalue;
- record2->bestEvalue = bestEvalue;
-
- bestScore = record1->bestScore;
- record1->bestScore = record2->bestScore;
- record2->bestScore = bestScore;
-
- subject_index = record1->subject_index;
- record1->subject_index = record2->subject_index;
- record2->subject_index = subject_index;
-
- theseAlignments = record1->theseAlignments;
- record1->theseAlignments = record2->theseAlignments;
- record2->theseAlignments = theseAlignments;
-}
-
-
-/**
- * Verifies that the array heapArray[i] .. heapArray[n] is ordered so
- * as to be a valid heap. This routine checks every element in the array,
- * an so is very time consuming. It is for debugging purposes only.
- */
-static Boolean
-SWheapIsValid(SWheapRecord * heapArray,
- Int4 i,
- Int4 n)
-{
- /* indices of nodes to the left and right of node i */
- Int4 left = 2 * i, right = 2 * i + 1;
-
- if(right <= n) {
- return !SWheapRecordCompare(&(heapArray[right]), &(heapArray[i])) &&
- SWheapIsValid(heapArray, right, n);
- }
- if(left <= n) {
- return !SWheapRecordCompare(&(heapArray[left]), &(heapArray[i])) &&
- SWheapIsValid(heapArray, left, n);
- }
- return TRUE;
-}
-
-/** convenience debugging macro for this module */
-#ifdef KAPPA_INTENSE_DEBUG
-#define KAPPA_ASSERT(expr) ((expr) ? 0 : \
-(fprintf( stderr, "KAPPA_ASSERT failed line %d: %s", __LINE__, #expr ), \
-exit(1)))
-#else
-#define KAPPA_ASSERT(expr) (void)(0)
-#endif
-
-
-/** On entry, all but the first element of the array heapArray[i] to
- * heapArray[n] are in valid heap order. This routine rearranges
- * the elements so that on exit they all are in heap order.
- * @param heapArray holds the heap [in][out]
- * @param i element of heapArray that may be out of order [in]
- * @param n size of heapArray [in]
- */
-static void
-SWheapifyDown(SWheapRecord * heapArray,
- Int4 i,
- Int4 n)
-{
- Boolean moreswap = TRUE; /* is more swapping needed */
- Int4 left, right, largest; /* placeholders for indices in swapping */
- do {
- left = 2 * i;
- right = 2 * i + 1;
- if((left <= n) &&
- (SWheapRecordCompare(&(heapArray[left]), &(heapArray[i]))))
- largest = left;
- else
- largest = i;
- if((right <= n) &&
- (SWheapRecordCompare(&(heapArray[right]), &(heapArray[largest]))))
- largest = right;
- if(largest != i) {
- SWheapRecordSwap(&heapArray[i], &heapArray[largest]);
- /* push largest up the heap */
- i = largest; /* check next level down */
- } else
- moreswap = FALSE;
- } while(moreswap); /* function builds the heap */
- KAPPA_ASSERT(SWheapIsValid(heapArray, i, n));
-}
-
-
-/** On entry, all but the last element of the array heapArray[0] to
- * heapArray[i] are in valid heap order. This routine rearranges
- * the elements so that on exit they all are in heap order.
- *
- * @param heapArray holds the heap [in][out]
- * @param i element in heap array that may be out of order [in]
- * @param n size of heapArray
- */
-static void
-SWheapifyUp(SWheapRecord * heapArray,
- Int4 i,
- Int4 n)
-{
- Int4 parent = i / 2; /* index to the node that is the
- parent of node i */
- while(parent >= 1 &&
- SWheapRecordCompare(&(heapArray[i]), &(heapArray[parent]))){
- SWheapRecordSwap(&heapArray[i], &heapArray[parent]);
-
- i = parent;
- parent /= 2;
- }
- KAPPA_ASSERT(SWheapIsValid(heapArray, 1, n));
-}
-
-/**
- * A SWheap represents a collection of alignments between one query
- * sequence and several matching subject sequences.
- *
- * Each matching sequence is allocated one record in a SWheap. The
- * eValue of a query-subject pair is the best (smallest positive)
- * evalue of all alignments between the two sequences.
- *
- * A match will be inserted in the the SWheap if:
- * - there are fewer that SWheap::heapThreshold elements in the SWheap;
- * - the eValue of the match is <= SWheap::ecutoff; or
- * - the eValue of the match is less than the largest (worst) eValue
- * already in the SWheap.
- *
- * If there are >= SWheap::heapThreshold matches already in the SWheap
- * when a new match is to be inserted, then the match with the largest
- * (worst) eValue is removed, unless the largest eValue <=
- * SWheap::ecutoff. Matches with eValue <= SWheap::ecutoff are never
- * removed by the insertion routine. As a consequence, the SWheap can
- * hold an arbitrarily large number of matches, although it is
- * atypical for the number of matches to be greater than
- * SWheap::heapThreshold.
- *
- * Once all matches have been collected, the SWheapToFlatList routine
- * may be invoked to return a list of all alignments. (see below).
- *
- * While the number of elements in a heap < SWheap::heapThreshold, the
- * SWheap is implemented as an unordered array, rather than a
- * heap-ordered array. The SWheap is converted to a heap-ordered
- * array as soon as it becomes necessary to order the matches by
- * evalue. The routines that operate on a SWheap should behave
- * properly whichever state the SWheap is in.
- */
-typedef struct SWheap {
- Int4 n; /**< The current number of elements */
- Int4 capacity; /**< The maximum number of elements
- that may be inserted before the
- SWheap must be resized, this
- number must be >= heapThreshold */
- Int4 heapThreshold; /**< see above */
- double ecutoff; /**< matches with evalue below ecutoff may
- always be inserted in the SWheap */
- double worstEvalue; /**< the worst (biggest) evalue currently in
- the heap */
-
- SWheapRecord *array; /**< the SWheapRecord array if the SWheap is
- being represented as an unordered array */
- SWheapRecord *heapArray; /**< the SWheapRecord array if the SWheap is
- being represented as an heap-ordered
- array. At least one of (array, heapArray)
- is NULL */
-
-} SWheap;
-
-
-/** Convert a SWheap from a representation as an unordered array to
- * a representation as a heap-ordered array.
- *
- * @param self the SWheap to convert
- */
-static void
-ConvertToHeap(SWheap * self)
+ BlastCompo_Alignment ** alignments,
+ int oid)
{
- if(NULL != self->array) {
- Int4 i; /* heap node index */
- Int4 n; /* number of elements in the heap */
- /* We aren't already a heap */
- self->heapArray = self->array;
- self->array = NULL;
-
- n = self->n;
- for(i = n / 2; i >= 1; --i) {
- SWheapifyDown(self->heapArray, i, n);
+ const int unknown_value = 0;
+ BlastHSPList * hsp_list = Blast_HSPListNew(0);
+ BlastCompo_Alignment * align;
+
+ hsp_list->oid = oid;
+
+ for (align = *alignments; NULL != align; align = align->next) {
+ BlastHSP * new_hsp = NULL;
+ GapEditScript * editScript = align->context;
+ align->context = NULL;
+ Blast_HSPInit(align->queryStart, align->queryEnd,
+ align->matchStart, align->matchEnd,
+ unknown_value, unknown_value,
+ 0, 0, align->frame, align->score,
+ &editScript, &new_hsp);
+
+ /* At this point, the subject and possibly the query sequence have
+ * been filtered; since it is not clear that num_ident of the
+ * filtered sequences, rather than the original, is desired,
+ * explicitly leave num_ident blank. */
+ new_hsp->num_ident = 0;
+
+ Blast_HSPListSaveHSP(hsp_list, new_hsp);
}
- }
- KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n));
-}
+ BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
+ Blast_HSPListSortByScore(hsp_list);
-/** When the heap is about to exceed its capacity, it will be grown by
- * the minimum of a multiplicative factor of SWHEAP_RESIZE_FACTOR
- * and an additive factor of SWHEAP_MIN_RESIZE. The heap never
- * decreases in size */
-#define SWHEAP_RESIZE_FACTOR 1.5
-/** @sa SWHEAP_RESIZE_FACTOR */
-#define SWHEAP_MIN_RESIZE 100
-
-/** Return true if self may insert a match that had the given eValue,
- * score and subject_index.
- *
- * @param self a SWheap
- * @param eValue the evalue to be tested.
- * @param score the score to be tested
- * @param subject_index the subject_index to be tested.
- */
-static Boolean
-SWheapWouldInsert(SWheap * self,
- double eValue,
- Int4 score,
- Int4 subject_index)
-{
- if(self->n < self->heapThreshold ||
- eValue <= self->ecutoff ||
- eValue < self->worstEvalue) {
- return TRUE;
- } else {
- /* self is either currently a heap, or must be converted to one;
- * use SWheapRecordCompare to compare against the worst element in
- * the heap */
- SWheapRecord heapRecord; /* temporary record to compare against */
-
- if(self->heapArray == NULL) ConvertToHeap(self);
-
- heapRecord.bestEvalue = eValue;
- heapRecord.bestScore = score;
- heapRecord.subject_index = subject_index;
- heapRecord.theseAlignments = NULL;
-
- return SWheapRecordCompare(&self->heapArray[1], &heapRecord);
- }
+ return hsp_list;
}
-/**
- * Try to insert matchRecord into the SWheap. The list of SeqAligns
- * passed to this routine is used directly, i.e. the list is not copied,
- * but is rather stored in the SWheap or deleted.
- *
- * @param self the heap
- * @param alignments a list of alignments
- * @param eValue the best evalue among the alignments
- * @param score the best score among the alignments
- * @param subject_index the index of the subject sequence in the database
- */
static void
-SWheapInsert(
- SWheap * self,
- BlastHSPList * alignments,
- double eValue,
- Int4 score,
- Int4 subject_index)
+s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
+ BlastHSPList * hsp_list,
+ int subject_length,
+ EBlastProgramType program_number,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ const BlastHitSavingParameters* hitParams,
+ int do_link_hsps)
{
- if(self->array && self->n >= self->heapThreshold) {
- ConvertToHeap(self);
- }
- if(self->array != NULL) {
- /* "self" is currently a list. Add the new alignments to the end */
- SWheapRecord *heapRecord; /* destination for the new alignments */
- heapRecord = &self->array[++self->n];
- heapRecord->bestEvalue = eValue;
- heapRecord->bestScore = score;
- heapRecord->theseAlignments = alignments;
- heapRecord->subject_index = subject_index;
- if( self->worstEvalue < eValue ) {
- self->worstEvalue = eValue;
- }
- } else { /* "self" is currently a heap */
- if(self->n < self->heapThreshold ||
- (eValue <= self->ecutoff &&
- self->worstEvalue <= self->ecutoff)) {
- SWheapRecord *heapRecord; /* Destination for the new alignments */
- /* The new alignments must be inserted into the heap, and all old
- * alignments retained */
- if(self->n >= self->capacity) {
- /* The heap must be resized */
- Int4 newCapacity; /* capacity the heap will have after
- * it is resized */
- newCapacity = MAX(SWHEAP_MIN_RESIZE + self->capacity,
- (Int4) (SWHEAP_RESIZE_FACTOR * self->capacity));
- self->heapArray = (SWheapRecord *)
- realloc(self->heapArray, (newCapacity + 1) * sizeof(SWheapRecord));
- self->capacity = newCapacity;
- }
- /* end if the heap must be resized */
- heapRecord = &self->heapArray[++self->n];
- heapRecord->bestEvalue = eValue;
- heapRecord->bestScore = score;
- heapRecord->theseAlignments = alignments;
- heapRecord->subject_index = subject_index;
-
- SWheapifyUp(self->heapArray, self->n, self->n);
+ *pbestEvalue = DBL_MAX;
+ *pbestScore = 0;
+ if (do_link_hsps) {
+ BLAST_LinkHsps(program_number, hsp_list,
+ queryInfo, subject_length,
+ sbp, hitParams->link_hsp_params, TRUE);
} else {
- /* Some set of alignments must be discarded; discardedAlignments
- * will hold a pointer to these alignments. */
- BlastHSPList * discardedAlignments = NULL;
- SWheapRecord heapRecord; /* Candidate record for insertion */
-
- heapRecord.bestEvalue = eValue;
- heapRecord.bestScore = score;
- heapRecord.theseAlignments = alignments;
- heapRecord.subject_index = subject_index;
-
- if(SWheapRecordCompare(&self->heapArray[1], &heapRecord)) {
- /* The new record should be inserted, and the largest
- * element currently in the heap may be disgarded */
- discardedAlignments = self->heapArray[1].theseAlignments;
- memcpy(&self->heapArray[1], &heapRecord, sizeof(SWheapRecord));
- } else {
- discardedAlignments = heapRecord.theseAlignments;
- }
- SWheapifyDown(self->heapArray, 1, self->n);
-
- if(discardedAlignments != NULL) {
- Blast_HSPListFree(discardedAlignments);
- }
- /* end while there are discarded alignments that have not been freed */
+ Blast_HSPListGetEvalues(queryInfo, hsp_list, TRUE, sbp,
+ 0.0, /* use a non-zero gap decay only when
+ linking hsps */
+ 1.0); /* Use scaling factor equal to
+ 1, because both scores and
+ Lambda are scaled, so they
+ will cancel each other. */
}
- /* end else some set of alignments must be discarded */
-
- self->worstEvalue = self->heapArray[1].bestEvalue;
- KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n));
- }
- /* end else "self" is currently a heap. */
-}
-
-
-/**
- * Return true if only matches with evalue <= self->ecutoff may be
- * inserted.
- *
- * @param self a SWheap
- */
-static Boolean
-SWheapWillAcceptOnlyBelowCutoff(SWheap * self)
-{
- return self->n >= self->heapThreshold && self->worstEvalue <= self->ecutoff;
-}
-
-
-/** Initialize a new SWheap; parameters to this function correspond
- * directly to fields in the SWheap */
-static void
-SWheapInitialize(SWheap * self,
- Int4 heapThreshold,
- double ecutoff)
-{
- self->n = 0;
- self->heapThreshold = heapThreshold;
- self->ecutoff = ecutoff;
- self->heapArray = NULL;
- self->capacity = heapThreshold;
- self->worstEvalue = 0;
- /* Begin life as a list */
- self->array =
- (SWheapRecord *) malloc((self->capacity + 1) * sizeof(SWheapRecord));
-}
-
-
-/**
- * Release the storage associated with the fields of a SWheap. Don't
- * delete the SWheap structure itself.
- *
- * @param self SWheap whose storage will be released
- */
-static void
-SWheapRelease(SWheap * self)
-{
- if(self->heapArray) sfree(self->heapArray);
- if(self->array) sfree(self->array);
-
- self->n = self->capacity = self->heapThreshold = 0;
- self->heapArray = NULL; self->array = NULL;
-}
-
-
-/**
- * Remove and return the element in the SWheap with largest (worst) evalue
- *
- * @param self a SWheap
- */
-static BlastHSPList *
-SWheapPop(SWheap * self)
-{
- BlastHSPList * results = NULL; /* the list of HSPs to be returned */
-
- ConvertToHeap(self);
- if(self->n > 0) { /* The heap is not empty */
- SWheapRecord *first, *last; /* The first and last elements of the
- * array that represents the heap. */
- first = &self->heapArray[1];
- last = &self->heapArray[self->n];
-
- results = first->theseAlignments;
- if( --self->n > 0 ) {
- /* The heap is still not empty */
- memcpy(first, last, sizeof(SWheapRecord));
-
- SWheapifyDown(self->heapArray, 1, self->n);
+ Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
+ if (hsp_list->hspcnt > 0) {
+ *pbestEvalue = hsp_list->best_evalue;
+ *pbestScore = hsp_list->hsp_array[0]->score;
}
- }
-
- KAPPA_ASSERT(SWheapIsValid(self->heapArray, 1, self->n));
-
- return results;
-}
-
-
-/**
- * Convert a SWheap to a flat list of SeqAligns. Note that there may
- * be more than one alignment per element in the heap. The new list
- * preserves the order of the SeqAligns associated with each
- * HeapRecord. (@todo this function is named as it is for compatibility with
- * kappa.c, rename in the future)
- *
- * @param self a SWheap
- * @param results BLAST core external results structure (pre-SeqAlign)
- * [out]
- * @param hitlist_size size of each list in the results structure above [in]
- */
-static void
-SWheapToFlatList(SWheap * self, BlastHSPResults * results, Int4 hitlist_size)
-{
- BlastHSPList* hsp_list;
- BlastHitList* hitlist =
- results->hitlist_array[0] = Blast_HitListNew(hitlist_size);
-
- hsp_list = NULL;
- while(NULL != (hsp_list = SWheapPop(self))) {
- Blast_HitListUpdate(hitlist, hsp_list);
- }
}
-/** keeps one row of the Smith-Waterman matrix
- */
-typedef struct SWpairs {
- Int4 noGap; /**< @todo document me */
- Int4 gapExists; /**< @todo document me */
-} SWpairs;
-
-
-/**
- * computes Smith-Waterman local alignment score and returns the
- * evalue
- *
- * @param matchSeq is a database sequence matched by this query [in]
- * @param matchSeqLength is the length of matchSeq in amino acids [in]
- * @param query is the input query sequence [in]
- * @param queryLength is the length of query [in]
- * @param matrix is the position-specific matrix associated with
- * query [in]
- * @param gapOpen is the cost of opening a gap [in]
- * @param gapExtend is the cost of extending an existing gap by 1
- * position [in]
- * @param matchSeqEnd returns the final position in the matchSeq of an
- * optimal local alignment [in]
- * @param queryEnd returns the final position in query of an optimal
- * local alignment [in]. matchSeqEnd and queryEnd can
- * be used to run the local alignment
- * in reverse to find optimal starting positions [in]
- * @param score is used to pass back the optimal score [in]
- * @param kbp holds the Karlin-Altschul parameters [in]
- * @param effSearchSpace effective search space [in]
- * @param positionSpecific determines whether matrix is position
- * specific or not [in]
- * @return the expect value of the alignment
- */
static double
-BLbasicSmithWatermanScoreOnly(Uint1 * matchSeq,
- Int4 matchSeqLength, Uint1 *query, Int4 queryLength,
- Int4 **matrix,
- Int4 gapOpen, Int4 gapExtend, Int4 *matchSeqEnd, Int4 *queryEnd,
- Int4 *score,
- Blast_KarlinBlk* kbp, Int8 effSearchSpace, Boolean positionSpecific)
+s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
{
-
- Int4 bestScore; /*best score seen so far*/
- Int4 newScore; /* score of next entry*/
- Int4 bestMatchSeqPos, bestQueryPos; /*position ending best score in
- matchSeq and query sequences*/
- SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix
- overwrite old row with new row*/
- Int4 *matrixRow; /*one row of score matrix*/
- Int4 newGapCost; /*cost to have a gap of one character*/
- Int4 prevScoreNoGapMatchSeq; /*score one row and column up
- with no gaps*/
- Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/
- Int4 continueGapScore; /*score for continuing a gap in matchSeq*/
- Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/
- double returnEvalue; /*e-value to return*/
-
-
- scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs));
- bestMatchSeqPos = 0;
- bestQueryPos = 0;
- bestScore = 0;
- newGapCost = gapOpen + gapExtend;
- for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- scoreVector[matchSeqPos].noGap = 0;
- scoreVector[matchSeqPos].gapExists = -(gapOpen);
- }
- for(queryPos = 0; queryPos < queryLength; queryPos++) {
- if (positionSpecific)
- matrixRow = matrix[queryPos];
- else
- matrixRow = matrix[query[queryPos]];
- newScore = 0;
- prevScoreNoGapMatchSeq = 0;
- prevScoreGapMatchSeq = -(gapOpen);
- for(matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- /*testing scores with a gap in matchSeq, either starting a new
- gap or extending an existing gap*/
- if ((newScore = newScore - newGapCost) >
- (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
- prevScoreGapMatchSeq = newScore;
- /*testing scores with a gap in query, either starting a new
- gap or extending an existing gap*/
- if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
- (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend))
- continueGapScore = newScore;
- /*compute new score extending one position in matchSeq and query*/
- newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
- if (newScore < 0)
- newScore = 0; /*Smith-Waterman locality condition*/
- /*test two alternatives*/
- if (newScore < prevScoreGapMatchSeq)
- newScore = prevScoreGapMatchSeq;
- if (newScore < continueGapScore)
- newScore = continueGapScore;
- prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
- scoreVector[matchSeqPos].noGap = newScore;
- scoreVector[matchSeqPos].gapExists = continueGapScore;
- if (newScore > bestScore) {
- bestScore = newScore;
- bestQueryPos = queryPos;
- bestMatchSeqPos = matchSeqPos;
- }
- }
- }
- sfree(scoreVector);
- if (bestScore < 0)
- bestScore = 0;
- *matchSeqEnd = bestMatchSeqPos;
- *queryEnd = bestQueryPos;
- *score = bestScore;
- returnEvalue = BLAST_KarlinStoE_simple(bestScore,kbp, effSearchSpace);
- return(returnEvalue);
-}
-
-
-/**
- * computes where optimal Smith-Waterman local alignment starts given
- * the ending positions and score matchSeqEnd and queryEnd can be used
- * to run the local alignment in reverse to find optimal starting
- * positions these are passed back in matchSeqStart and queryStart the
- * optimal score is passed in to check when it has been reached going
- * backwards the score is also returned
- * @param matchSeq is a database sequence matched by this query [in]
- * @param matchSeqLength is the length of matchSeq in amino acids [in]
- * @param query is the input query sequence [in]
- * @param matrix is the position-specific matrix associated with
- * query or the standard matrix [in]
- * @param gapOpen is the cost of opening a gap [in]
- * @param gapExtend is the cost of extending an existing gap by 1
- * position [in]
- * @param matchSeqEnd is the final position in the matchSeq of an optimal
- * local alignment [in]
- * @param queryEnd is the final position in query of an optimal
- * local alignment [in]
- * @param score optimal score to be obtained [in]
- * @param matchSeqStart starting point of optimal alignment [out]
- * @param queryStart starting point of optimal alignment [out]
- * @param positionSpecific determines whether matrix is position specific
- * or not
- */
-static Int4
-BLSmithWatermanFindStart(Uint1 * matchSeq,
- Int4 matchSeqLength, Uint1 *query, Int4 **matrix,
- Int4 gapOpen, Int4 gapExtend, Int4 matchSeqEnd, Int4 queryEnd, Int4 score,
- Int4 *matchSeqStart, Int4 *queryStart, Boolean positionSpecific)
-{
-
- Int4 bestScore; /*best score seen so far*/
- Int4 newScore; /* score of next entry*/
- Int4 bestMatchSeqPos, bestQueryPos; /*position starting best score in
- matchSeq and database sequences*/
- SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix
- overwrite old row with new row*/
- Int4 *matrixRow; /*one row of score matrix*/
- Int4 newGapCost; /*cost to have a gap of one character*/
- Int4 prevScoreNoGapMatchSeq; /*score one row and column up
- with no gaps*/
- Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/
- Int4 continueGapScore; /*score for continuing a gap in query*/
- Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/
-
- scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs));
- bestMatchSeqPos = 0;
- bestQueryPos = 0;
- bestScore = 0;
- newGapCost = gapOpen + gapExtend;
- for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- scoreVector[matchSeqPos].noGap = 0;
- scoreVector[matchSeqPos].gapExists = -(gapOpen);
- }
- for(queryPos = queryEnd; queryPos >= 0; queryPos--) {
- if (positionSpecific)
- matrixRow = matrix[queryPos];
- else
- matrixRow = matrix[query[queryPos]];
- newScore = 0;
- prevScoreNoGapMatchSeq = 0;
- prevScoreGapMatchSeq = -(gapOpen);
- for(matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
- /*testing scores with a gap in matchSeq, either starting a new
- gap or extending an existing gap*/
- if ((newScore = newScore - newGapCost) >
- (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
- prevScoreGapMatchSeq = newScore;
- /*testing scores with a gap in query, either starting a new
- gap or extending an existing gap*/
- if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
- (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend))
- continueGapScore = newScore;
- /*compute new score extending one position in matchSeq and query*/
- newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
- if (newScore < 0)
- newScore = 0; /*Smith-Waterman locality condition*/
- /*test two alternatives*/
- if (newScore < prevScoreGapMatchSeq)
- newScore = prevScoreGapMatchSeq;
- if (newScore < continueGapScore)
- newScore = continueGapScore;
- prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
- scoreVector[matchSeqPos].noGap = newScore;
- scoreVector[matchSeqPos].gapExists = continueGapScore;
- if (newScore > bestScore) {
- bestScore = newScore;
- bestQueryPos = queryPos;
- bestMatchSeqPos = matchSeqPos;
- }
- if (bestScore >= score)
- break;
- }
- if (bestScore >= score)
- break;
- }
- sfree(scoreVector);
- if (bestScore < 0)
- bestScore = 0;
- *matchSeqStart = bestMatchSeqPos;
- *queryStart = bestQueryPos;
- return(bestScore);
-}
-
-
-/**
- * computes Smith-Waterman local alignment score and returns the
- * evalue assuming some positions are forbidden matchSeqEnd and query
- * can be used to run the local alignment in reverse to find optimal
- * starting positions
- * @param matchSeq is the matchSeq sequence [in]
- * @param matchSeqLength is the length of matchSeq in amino acids [in]
- * @param query is the input query sequence [in]
- * @param queryLength is the length of query [in]
- * @param matrix is either the position-specific matrix associated
- * with query or the standard matrix [in]
- * @param gapOpen is the cost of opening a gap [in]
- * @param gapExtend is the cost of extending an existing gap by 1
- * position [in]
- * @param matchSeqEnd returns the final position in the matchSeq of an
- * optimal local alignment [out]
- * @param queryEnd returns the final position in query of an optimal
- * local alignment [out]
- * @param score is used to pass back the optimal score [out]
- * @param kbp holds the Karlin-Altschul parameters [in]
- * @param effSearchSpace effective search space [in]
- * @param numForbidden number of forbidden ranges [in]
- * @param forbiddenRanges lists areas that should not be aligned [in]
- * @param positionSpecific determines whether matrix is position specific
- * or not [in]
- */
-static double
-BLspecialSmithWatermanScoreOnly(Uint1 * matchSeq,
- Int4 matchSeqLength, Uint1 *query, Int4 queryLength, Int4 **matrix,
- Int4 gapOpen, Int4 gapExtend,
- Int4 *matchSeqEnd, Int4 *queryEnd, Int4 *score,
- Blast_KarlinBlk* kbp, Int8 effSearchSpace,
- Int4 *numForbidden, Int4 ** forbiddenRanges, Boolean positionSpecific)
-{
-
- Int4 bestScore; /*best score seen so far*/
- Int4 newScore; /* score of next entry*/
- Int4 bestMatchSeqPos, bestQueryPos; /*position ending best score in
- matchSeq and database sequences*/
- SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix
- overwrite old row with new row*/
- Int4 *matrixRow; /*one row of score matrix*/
- Int4 newGapCost; /*cost to have a gap of one character*/
- Int4 prevScoreNoGapMatchSeq; /*score one row and column up
- with no gaps*/
- Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/
- Int4 continueGapScore; /*score for continuing a gap in query*/
- Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/
- double returnEvalue; /*e-value to return*/
- Boolean forbidden; /*is this position forbidden?*/
- Int4 f; /*index over forbidden positions*/
-
-
- scoreVector = (SWpairs *) calloc(1, matchSeqLength * sizeof(SWpairs));
- bestMatchSeqPos = 0;
- bestQueryPos = 0;
- bestScore = 0;
- newGapCost = gapOpen + gapExtend;
- for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- scoreVector[matchSeqPos].noGap = 0;
- scoreVector[matchSeqPos].gapExists = -(gapOpen);
- }
- for(queryPos = 0; queryPos < queryLength; queryPos++) {
- if (positionSpecific)
- matrixRow = matrix[queryPos];
- else
- matrixRow = matrix[query[queryPos]];
- newScore = 0;
- prevScoreNoGapMatchSeq = 0;
- prevScoreGapMatchSeq = -(gapOpen);
- for(matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- /*testing scores with a gap in matchSeq, either starting a new
- gap or extending an existing gap*/
- if ((newScore = newScore - newGapCost) >
- (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
- prevScoreGapMatchSeq = newScore;
- /*testing scores with a gap in query, either starting a new
- gap or extending an existing gap*/
- if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
- (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend))
- continueGapScore = newScore;
- /*compute new score extending one position in matchSeq and query*/
- forbidden = FALSE;
- for(f = 0; f < numForbidden[queryPos]; f++) {
- if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
- (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
- forbidden = TRUE;
- break;
- }
- }
- if (forbidden)
- newScore = BLAST_SCORE_MIN;
- else
- newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
- if (newScore < 0)
- newScore = 0; /*Smith-Waterman locality condition*/
- /*test two alternatives*/
- if (newScore < prevScoreGapMatchSeq)
- newScore = prevScoreGapMatchSeq;
- if (newScore < continueGapScore)
- newScore = continueGapScore;
- prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
- scoreVector[matchSeqPos].noGap = newScore;
- scoreVector[matchSeqPos].gapExists = continueGapScore;
- if (newScore > bestScore) {
- bestScore = newScore;
- bestQueryPos = queryPos;
- bestMatchSeqPos = matchSeqPos;
-
- }
- }
- }
- sfree(scoreVector);
- if (bestScore < 0)
- bestScore = 0;
- *matchSeqEnd = bestMatchSeqPos;
- *queryEnd = bestQueryPos;
- *score = bestScore;
- returnEvalue = BLAST_KarlinStoE_simple(bestScore,kbp, effSearchSpace);
- return(returnEvalue);
-}
-
-
-/**
- * Computes where the optimal Smith-Waterman local alignment starts,
- * given the ending positions. matchSeqEnd and queryEnd are used to run
- * the local alignment in reverse to find the optimal starting positions;
- * these are passed back in matchSeqStart and queryStart. The optimal
- * score is passed in to check when it has been reached going
- * backwards; the score found is also returned.
- * @param matchSeq is the matchSeq sequence [in]
- * @param matchSeqLength is the length of matchSeq in amino acids [in]
- * @param query is the sequence corresponding to some matrix
- * profile [in]
- * @param matrix is the position-specific matrix associated with
- * query [in]
- * @param gapOpen is the cost of opening a gap [in]
- * @param gapExtend is the cost of extending an existing gap by 1
- * position [in]
- * @param matchSeqEnd is the final position in the matchSeq of an optimal
- * local alignment [in]
- * @param queryEnd is the final position in query of an optimal
- * local alignment [in]
- * @param score optimal score is passed in to check when it has
- * been reached going backwards [in]
- * @param matchSeqStart optimal starting point in matchSeq [out]
- * @param queryStart optimal starting point in query [out]
- * @param numForbidden number of forbidden ranges at each query
- * position [in]
- * @param forbiddenRanges regions not to be aligned. [in]
- * @param positionSpecific determines whether matrix is position specific
- * or not
- * @return the score found
- */
-static Int4 BLspecialSmithWatermanFindStart(Uint1 * matchSeq,
- Int4 matchSeqLength, Uint1 *query, Int4 **matrix,
- Int4 gapOpen, Int4 gapExtend, Int4 matchSeqEnd, Int4 queryEnd, Int4 score,
- Int4 *matchSeqStart, Int4 *queryStart, Int4 *numForbidden,
- Int4 ** forbiddenRanges, Boolean positionSpecific)
-{
-
- Int4 bestScore; /*best score seen so far*/
- Int4 newScore; /* score of next entry*/
- Int4 bestMatchSeqPos, bestQueryPos; /*position starting best score in
- matchSeq and database sequences*/
- SWpairs *scoreVector; /*keeps one row of the Smith-Waterman matrix
- overwrite old row with new row*/
- Int4 *matrixRow; /*one row of score matrix*/
- Int4 newGapCost; /*cost to have a gap of one character*/
- Int4 prevScoreNoGapMatchSeq; /*score one row and column up
- with no gaps*/
- Int4 prevScoreGapMatchSeq; /*score if a gap already started in matchSeq*/
- Int4 continueGapScore; /*score for continuing a gap in query*/
- Int4 matchSeqPos, queryPos; /*positions in matchSeq and query*/
- Boolean forbidden; /*is this position forbidden?*/
- Int4 f; /*index over forbidden positions*/
-
- scoreVector = (SWpairs *) calloc(matchSeqLength, sizeof(SWpairs));
- bestMatchSeqPos = 0;
- bestQueryPos = 0;
- bestScore = 0;
- newGapCost = gapOpen + gapExtend;
- for (matchSeqPos = 0; matchSeqPos < matchSeqLength; matchSeqPos++) {
- scoreVector[matchSeqPos].noGap = 0;
- scoreVector[matchSeqPos].gapExists = -(gapOpen);
- }
- for(queryPos = queryEnd; queryPos >= 0; queryPos--) {
- if (positionSpecific)
- matrixRow = matrix[queryPos];
- else
- matrixRow = matrix[query[queryPos]];
- newScore = 0;
- prevScoreNoGapMatchSeq = 0;
- prevScoreGapMatchSeq = -(gapOpen);
- for(matchSeqPos = matchSeqEnd; matchSeqPos >= 0; matchSeqPos--) {
- /*testing scores with a gap in matchSeq, either starting a new
- gap or extending an existing gap*/
- if ((newScore = newScore - newGapCost) >
- (prevScoreGapMatchSeq = prevScoreGapMatchSeq - gapExtend))
- prevScoreGapMatchSeq = newScore;
- /*testing scores with a gap in query, either starting a new
- gap or extending an existing gap*/
- if ((newScore = scoreVector[matchSeqPos].noGap - newGapCost) >
- (continueGapScore = scoreVector[matchSeqPos].gapExists - gapExtend))
- continueGapScore = newScore;
- /*compute new score extending one position in matchSeq and query*/
- forbidden = FALSE;
- for(f = 0; f < numForbidden[queryPos]; f++) {
- if ((matchSeqPos >= forbiddenRanges[queryPos][2 * f]) &&
- (matchSeqPos <= forbiddenRanges[queryPos][2*f + 1])) {
- forbidden = TRUE;
- break;
- }
- }
- if (forbidden)
- newScore = BLAST_SCORE_MIN;
- else
- newScore = prevScoreNoGapMatchSeq + matrixRow[matchSeq[matchSeqPos]];
- if (newScore < 0)
- newScore = 0; /*Smith-Waterman locality condition*/
- /*test two alternatives*/
- if (newScore < prevScoreGapMatchSeq)
- newScore = prevScoreGapMatchSeq;
- if (newScore < continueGapScore)
- newScore = continueGapScore;
- prevScoreNoGapMatchSeq = scoreVector[matchSeqPos].noGap;
- scoreVector[matchSeqPos].noGap = newScore;
- scoreVector[matchSeqPos].gapExists = continueGapScore;
- if (newScore > bestScore) {
- bestScore = newScore;
- bestQueryPos = queryPos;
- bestMatchSeqPos = matchSeqPos;
- }
- if (bestScore >= score)
- break;
- }
- if (bestScore >= score)
- break;
- }
- sfree(scoreVector);
- if (bestScore < 0)
- bestScore = 0;
- *matchSeqStart = bestMatchSeqPos;
- *queryStart = bestQueryPos;
- return(bestScore);
-}
-
-
-/**
- * Kappa_SequenceData - represents a string of amino acids or nucleotides
- */
-typedef struct Kappa_SequenceData {
- Uint1 *data; /**< amino acid or nucleotide data */
- Int4 length; /**< the length of data. For amino acid data
- &data[-1] is a valid address and
- data[-1] == 0. */
- Uint1 *buffer; /**< if non-nil, points to memory that
- must be freed when this instance of
- Kappa_SequenceData is deleted. */
-} Kappa_SequenceData;
-
-
-/** Release the data associated with this object. */
-static void
-Kappa_SequenceDataRelease(Kappa_SequenceData * self)
-{
- if(self->buffer) sfree(self->buffer);
-
- self->data = NULL;
- self->buffer = NULL;
-}
-
-
-/**
- * An instance of Kappa_ForbiddenRanges is used by the Smith-Waterman
- * algorithm to represent ranges in the database that are not to be
- * aligned.
- */
-typedef struct Kappa_ForbiddenRanges {
- Boolean isEmpty; /**< True if there are no forbidden ranges */
- Int4 *numForbidden; /**< how many forbidden ranges at each db
- position */
- Int4 **ranges; /**< forbidden ranges for each database
- position */
- Int4 queryLength; /**< length of the query sequence */
-} Kappa_ForbiddenRanges;
-
-
-/**
- * Initialize a new, empty Kappa_ForbiddenRanges
- *
- * @param self object to be initialized
- * @param queryLength the length of the query
- */
-static void
-Kappa_ForbiddenRangesInitialize(
- Kappa_ForbiddenRanges * self,
- Int4 queryLength)
-{
- Int4 f;
- self->queryLength = queryLength;
- self->numForbidden = (Int4 *) malloc(queryLength * sizeof(Int4));
- self->ranges = (Int4 **) malloc(queryLength * sizeof(Int4 *));
- self->isEmpty = TRUE;
-
- for(f = 0; f < queryLength; f++) {
- self->numForbidden[f] = 0;
- self->ranges[f] = (Int4 *) malloc(2 * sizeof(Int4));
- self->ranges[f][0] = 0;
- self->ranges[f][1] = 0;
- }
-}
-
-
-/** Reset self to be empty */
-static void
-Kappa_ForbiddenRangesClear(Kappa_ForbiddenRanges * self)
-{
- Int4 f;
- for(f = 0; f < self->queryLength; f++) {
- self->numForbidden[f] = 0;
- }
- self->isEmpty = TRUE;
-}
-
-
-/** Add some ranges to self
- * @param self an instance of Kappa_ForbiddenRanges [in][out]
- * @param queryStart start of the alignment in the query sequence
- * @param queryAlignmentExtent length of the alignment in the query sequence
- * @param matchStart start of the alignment in the subject sequence
- * @param matchAlignmentExtent length of the alignment in the
- * subject sequence
- */
-static void
-Kappa_ForbiddenRangesPush(
- Kappa_ForbiddenRanges * self,
- Int4 queryStart,
- Int4 queryAlignmentExtent,
- Int4 matchStart,
- Int4 matchAlignmentExtent)
-{
- Int4 f;
- for(f = queryStart; f < (queryStart + queryAlignmentExtent); f++) {
- Int4 last = 2 * self->numForbidden[f];
- if(0 != last) { /* we must resize the array */
- self->ranges[f] =
- (Int4 *) realloc(self->ranges[f], (last + 2) * sizeof(Int4));
+ int i, n;
+ double avg;
+ Blast_ScoreFreq freq;
+
+ n = max_score - min_score + 1;
+ avg = 0.0;
+ for (i = 0; i < n; i++) {
+ avg += (min_score + i) * probs[i];
}
- self->ranges[f][last] = matchStart;
- self->ranges[f][last + 1] = matchStart + matchAlignmentExtent;
-
- self->numForbidden[f]++;
- }
- self->isEmpty = FALSE;
-}
-
-
-/**
- * Release the storage associated with the fields of self, but do not
- * delete self
- *
- * @param self an instance of Kappa_ForbiddenRanges [in][out]
- */
-static void
-Kappa_ForbiddenRangesRelease(Kappa_ForbiddenRanges * self)
-{
- Int4 f;
- for(f = 0; f < self->queryLength; f++) sfree(self->ranges[f]);
-
- sfree(self->ranges); self->ranges = NULL;
- sfree(self->numForbidden); self->numForbidden = NULL;
+ freq.score_min = min_score;
+ freq.score_max = max_score;
+ freq.obs_min = min_score;
+ freq.obs_max = max_score;
+ freq.sprob0 = probs;
+ freq.sprob = &probs[-min_score];
+ freq.score_avg = avg;
+
+ return Blast_KarlinLambdaNR(&freq, lambda0);
}
-/**
- * Calls BLbasicSmithWatermanScoreOnly if forbiddenRanges is empty and
- * calls BLspecialSmithWatermanScoreOnly otherwise. This routine has
- * the same parameters and return value as
- * BLspecialSmithWatermanScoreOnly.
- */
-static double
-SmithWatermanScoreOnly(Kappa_SequenceData * subject,
- Kappa_SequenceData * query,
- Int4 **matrix,
- Int4 gapOpen,
- Int4 gapExtend,
- Int4 *matchSeqEnd,
- Int4 *queryEnd,
- Int4 *score,
- Blast_KarlinBlk * kbp,
- Int8 effSearchSpace,
- Boolean positionSpecific,
- Kappa_ForbiddenRanges * forbiddenRanges )
-{
- if( forbiddenRanges->isEmpty ) {
- return
- BLbasicSmithWatermanScoreOnly(subject->data, subject->length,
- query ->data, query ->length,
- matrix, gapOpen, gapExtend, matchSeqEnd,
- queryEnd, score, kbp, effSearchSpace,
- positionSpecific);
- } else {
- return
- BLspecialSmithWatermanScoreOnly(subject->data, subject->length,
- query ->data, query ->length,
- matrix, gapOpen, gapExtend, matchSeqEnd,
- queryEnd, score, kbp, effSearchSpace,
- forbiddenRanges->numForbidden,
- forbiddenRanges->ranges,
- positionSpecific);
- }
-}
-
-
-/**
- * Calls BLSmithWatermanFindStart if forbiddenRanges is empty and
- * calls BLspecialSmithWatermanFindStart otherwise. This routine has
- * the same parameters and return value as
- * BLspecialSmithWatermanFindStart.
- */
-static Int4
-SmithWatermanFindStart(Kappa_SequenceData * subject,
- Kappa_SequenceData * query,
- Int4 **matrix,
- Int4 gapOpen,
- Int4 gapExtend,
- Int4 matchSeqEnd,
- Int4 queryEnd,
- Int4 score,
- Int4 *matchSeqStart,
- Int4 *queryStart,
- Boolean positionSpecific,
- Kappa_ForbiddenRanges * forbiddenRanges)
-{
- if( forbiddenRanges->isEmpty ) {
- return
- BLSmithWatermanFindStart(subject->data, subject->length,
- query ->data,
- matrix, gapOpen, gapExtend,
- matchSeqEnd, queryEnd, score,
- matchSeqStart, queryStart,
- positionSpecific);
- } else {
- return
- BLspecialSmithWatermanFindStart(subject->data, subject->length,
- query ->data,
- matrix, gapOpen, gapExtend,
- matchSeqEnd, queryEnd, score,
- matchSeqStart, queryStart,
- forbiddenRanges->numForbidden,
- forbiddenRanges->ranges,
- positionSpecific);
- }
-}
-
-
-/**
- * @param matrix is a score matrix (not position-specific) with
- * BLASTAA_SIZE rows
- * @param subjectProbArray is an array containing the probability of
- * occurrence of each residue in the subject
- * @param queryProbArray is an array containing the probability of
- * occurrence of each residue in the query
- * @param scoreArray is an array of probabilities for each score that is
- * to be used as a field in return_sfp
- * @param return_sfp is the structure to be filled in and returned
- * @param range is the size of scoreArray and is an upper bound on
- * the difference between maximum score and minimum
- * score in the matrix
- * The routine notposfillSfp computes the probability of each score,
- * weighted by the probabilities of the query and subject residues,
- * fills those probabilities into scoreArray, and stores scoreArray as
- * a field in the structure that is returned. For indexing convenience,
- * the field storing scoreArray points to the entry for score 0, so
- * that referring to the -k index corresponds to score -k.
- */
-static Blast_ScoreFreq* notposfillSfp(Int4 **matrix, double *subjectProbArray, double *queryProbArray, double *scoreArray, Blast_ScoreFreq* return_sfp, Int4 range)
-{
- Int4 minScore, maxScore; /*observed minimum and maximum scores*/
- Int4 i,j,k; /* indices */
-
- minScore = maxScore = 0;
-
- for(i = 0; i < BLASTAA_SIZE; i++) {
- for(j = 0 ; j < PRO_TRUE_ALPHABET_SIZE; j++) {
- k = trueCharPositions[j];
- if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore))
- minScore = matrix[i][k];
- if (matrix[i][k] > maxScore)
- maxScore = matrix[i][k];
- }
- }
- return_sfp->obs_min = minScore;
- return_sfp->obs_max = maxScore;
- for (i = 0; i < range; i++)
- scoreArray[i] = 0.0;
- return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
- for(i = 0; i < BLASTAA_SIZE; i++) {
- for (j = 0; j < PRO_TRUE_ALPHABET_SIZE; j++) {
- k = trueCharPositions[j];
- if(matrix[i][k] >= minScore) {
- return_sfp->sprob[matrix[i][k]] += (queryProbArray[i] * subjectProbArray[k]);
- }
- }
- }
- return_sfp->score_avg = 0;
- for(i = minScore; i <= maxScore; i++)
- return_sfp->score_avg += i * return_sfp->sprob[i];
- return(return_sfp);
-}
-
-
-/**
- *
- * @param matrix is a position-specific score matrix with
- * matrixLength positions
- * @param matrixLength length of the position-specific matrix above
- * @param subjectProbArray is an array containing the probability of
- * occurrence of each residue in the matching
- * sequence often called the subject
- * @param scoreArray is an array of probabilities for each score
- * that is to be used as a field in return_sfp
- * @param return_sfp is the structure to be filled in and returned;
- * this routine sets its obs_min, obs_max, sprob
- * and score_avg fields, with sprob pointing into
- * scoreArray
- * @param range is the size of scoreArray and is an upper bound on
- * the difference between maximum score and minimum
- * score in the matrix
- * The routine posfillSfp computes the probability of each score,
- * weighting every matrix position equally and each subject residue by
- * its probability, fills those probabilities into scoreArray, and
- * stores scoreArray as a field in the structure that is returned. For
- * indexing convenience, the field storing scoreArray points to the entry
- * for score 0, so that referring to the -k index corresponds to score -k.
- */
-static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *subjectProbArray, double *scoreArray, Blast_ScoreFreq* return_sfp, Int4 range)
-{
- Int4 minScore, maxScore; /*observed minimum and maximum scores*/
- Int4 i,j,k; /* indices */
- double onePosFrac; /*1/matrix length as a double*/
-
- minScore = maxScore = 0;
-
- for(i = 0; i < matrixLength; i++) {
- for(j = 0 ; j < PRO_TRUE_ALPHABET_SIZE; j++) {
- k = trueCharPositions[j];
- if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore))
- minScore = matrix[i][k];
- if (matrix[i][k] > maxScore)
- maxScore = matrix[i][k];
- }
- }
- return_sfp->obs_min = minScore;
- return_sfp->obs_max = maxScore;
- for (i = 0; i < range; i++)
- scoreArray[i] = 0.0;
- return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
- onePosFrac = 1.0/ ((double) matrixLength);
- for(i = 0; i < matrixLength; i++) {
- for (j = 0; j < PRO_TRUE_ALPHABET_SIZE; j++) {
- k = trueCharPositions[j];
- if(matrix[i][k] >= minScore) {
- return_sfp->sprob[matrix[i][k]] += (onePosFrac * subjectProbArray[k]);
- }
- }
- }
- return_sfp->score_avg = 0;
- for(i = minScore; i <= maxScore; i++)
- return_sfp->score_avg += i * return_sfp->sprob[i];
- return(return_sfp);
-}
-
/** Return a matrix of the frequency ratios that underlie the
* score matrix being used on this pass. The returned matrix
* is position-specific, so if we are in the first pass, use
@@ -1675,7 +276,7 @@ static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *sub
* score matrix used. numPositions is the length of the query;
* startNumerator is the matrix of frequency ratios as stored
* in posit.h. It needs to be divided by the frequency of the
- * second character to get the intended ratio
+ * second character to get the intended ratio
* @param sbp statistical information for blast [in]
* @param query the query sequence [in]
* @param matrixName name of the underlying matrix [in]
@@ -1684,82 +285,51 @@ static Blast_ScoreFreq* posfillSfp(Int4 **matrix, Int4 matrixLength, double *sub
* second character to get the intended ratio [in]
* @param numPositions length of the query [in]
*/
-static double **getStartFreqRatios(BlastScoreBlk* sbp,
- Uint1* query,
- const char *matrixName,
- double **startNumerator,
- Int4 numPositions)
-{
- double** returnRatios; /*frequency ratios to start investigating each pair*/
- double *standardProb; /*probabilities of each letter*/
- Int4 i,j; /* Loop indices. */
- SFreqRatios* freqRatios=NULL; /* frequency ratio container for given matrix */
- const double kPosEpsilon = 0.0001;
-
- returnRatios = (double**) _PSIAllocateMatrix(numPositions,
- BLASTAA_SIZE,
- sizeof(double));
-
- freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
- if (freqRatios == NULL)
- return NULL;
-
- for(i = 0; i < numPositions; i++) {
- for(j = 0; j < BLASTAA_SIZE; j++) {
- returnRatios[i][j] = freqRatios->data[query[i]][j];
- }
- }
-
- freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios);
-
- standardProb = BLAST_GetStandardAaProbabilities();
-
- /*reverse multiplication done in posit.c*/
- for(i = 0; i < numPositions; i++)
- for(j = 0; j < BLASTAA_SIZE; j++)
- if ((standardProb[query[i]] > kPosEpsilon) && (standardProb[j] > kPosEpsilon) &&
- (j != AMINOACID_TO_NCBISTDAA['X']) && (j != AMINOACID_TO_NCBISTDAA['*'])
- && (startNumerator[i][j] > kPosEpsilon))
- returnRatios[i][j] = startNumerator[i][j]/standardProb[j];
-
- sfree(standardProb);
-
- return(returnRatios);
-}
-
-
-/**
- * take every entry of startFreqRatios that is not corresponding to a
- * score of BLAST_SCORE_MIN and take its log, divide by Lambda and
- * multiply by LambdaRatio then round to the nearest integer and put
- * the result in the corresponding entry of matrix. startMatrix and
- * matrix have dimensions numPositions X BLASTAA_SIZE
- *
- * @param matrix preallocated matrix to be filled in [out]
- * @param startFreqRatios frequency ratios of starting matrix [in]
- * @param numPositions length of query [in]
- * @param Lambda A Karlin-Altschul parameter. [in]
- * @param LambdaRatio ratio of correct Lambda to its original value [in]
-*/
-static void scaleMatrix(Int4 **matrix,
- double **startFreqRatios, Int4 numPositions,
- double Lambda, double LambdaRatio)
+static void
+s_GetStartFreqRatios(double ** returnRatios,
+ Uint1 * query,
+ const char *matrixName,
+ double **startNumerator,
+ Int4 numPositions,
+ Boolean positionSpecific)
{
- Int4 p, c; /*indices over positions and characters*/
- double temp; /*intermediate term in computation*/
-
- for (p = 0; p < numPositions; p++) {
- for (c = 0; c < BLASTAA_SIZE; c++) {
- if (0.0 == startFreqRatios[p][c]) {
- matrix[p][c] = BLAST_SCORE_MIN;
- } else {
- temp = log(startFreqRatios[p][c]);
- temp = temp/Lambda;
- temp = temp * LambdaRatio;
- matrix[p][c] = BLAST_Nint(temp);
- }
- }
- }
+ Int4 i,j;
+ SFreqRatios * stdFreqRatios = NULL;
+ const double kPosEpsilon = 0.0001;
+
+ stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
+ if (positionSpecific) {
+ for (i = 0; i < numPositions; i++) {
+ for (j = 0; j < BLASTAA_SIZE; j++) {
+ returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
+ }
+ }
+ } else {
+ for (i = 0; i < BLASTAA_SIZE; i++) {
+ for (j = 0; j < BLASTAA_SIZE; j++) {
+ returnRatios[i][j] = stdFreqRatios->data[i][j];
+ }
+ }
+ }
+ stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
+
+ if (positionSpecific) {
+ double *standardProb; /*probabilities of each letter*/
+ standardProb = BLAST_GetStandardAaProbabilities();
+
+ /*reverse multiplication done in posit.c*/
+ for (i = 0; i < numPositions; i++) {
+ for (j = 0; j < BLASTAA_SIZE; j++) {
+ if ((standardProb[query[i]] > kPosEpsilon) &&
+ (standardProb[j] > kPosEpsilon) &&
+ (j != eStopChar) && (j != eXchar) &&
+ (startNumerator[i][j] > kPosEpsilon)) {
+ returnRatios[i][j] = startNumerator[i][j]/standardProb[j];
+ }
+ }
+ }
+ sfree(standardProb);
+ }
}
@@ -1771,42 +341,6 @@ static void scaleMatrix(Int4 **matrix,
/**
- * Compute a scaled up version of the standard matrix encoded by
- * matrix name. Standard matrices are in half-bit units.
- *
- * @param matrix preallocated matrix [in][out]
- * @param matrixName name of matrix (e.g., BLOSUM62, PAM30). [in]
- * @param Lambda A Karlin-Altschul parameter. [in]
- */
-static void
-computeScaledStandardMatrix(
- Int4 **matrix,
- char *matrixName,
- double Lambda)
-{
- int i,j; /*loop indices*/
- SFreqRatios* freqRatios=NULL; /* frequency ratios for the matrix */
-
- freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
- ASSERT(freqRatios);
- if (freqRatios == NULL)
- return;
-
- for(i = 0; i < BLASTAA_SIZE; i++)
- for(j = 0; j < BLASTAA_SIZE; j++) {
- if(0.0 == freqRatios->data[i][j])
- matrix[i][j] = BLAST_SCORE_MIN;
- else {
- double temp = log(freqRatios->data[i][j])/Lambda;
- matrix[i][j] = BLAST_Nint(temp);
- }
- }
-
- freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios);
-}
-
-
-/**
* produce a scaled-up version of the position-specific matrix
* starting from posFreqs
*
@@ -1821,20 +355,20 @@ computeScaledStandardMatrix(
* @param queryLength Length of the query sequence above [in]
*/
static int
-scalePosMatrix(int **fillPosMatrix,
- int **nonposMatrix,
- const char *matrixName,
- double **posFreqs,
- Uint1 *query,
- int queryLength,
- BlastScoreBlk* sbp)
+s_ScalePosMatrix(int **fillPosMatrix,
+ int **nonposMatrix,
+ const char *matrixName,
+ double **posFreqs,
+ Uint1 *query,
+ int queryLength,
+ BlastScoreBlk* sbp)
{
Kappa_posSearchItems *posSearch = NULL;
Kappa_compactSearchItems *compactSearch = NULL;
_PSIInternalPssmData* internal_pssm = NULL;
int status = PSI_SUCCESS;
- posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
+ posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
fillPosMatrix, posFreqs);
compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
@@ -1842,11 +376,13 @@ scalePosMatrix(int **fillPosMatrix,
internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
_PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
internal_pssm->ncols, internal_pssm->nrows);
- _PSICopyMatrix_int(internal_pssm->scaled_pssm, posSearch->posPrivateMatrix,
+ _PSICopyMatrix_int(internal_pssm->scaled_pssm,
+ posSearch->posPrivateMatrix,
internal_pssm->ncols, internal_pssm->nrows);
- _PSICopyMatrix_double(internal_pssm->freq_ratios, posSearch->posFreqs,
- internal_pssm->ncols, internal_pssm->nrows);
- status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
+ _PSICopyMatrix_double(internal_pssm->freq_ratios,
+ posSearch->posFreqs, internal_pssm->ncols,
+ internal_pssm->nrows);
+ status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
compactSearch->standardProb);
if (status != PSI_SUCCESS) {
internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
@@ -1854,26 +390,23 @@ scalePosMatrix(int **fillPosMatrix,
compactSearch = Kappa_compactSearchItemsFree(compactSearch);
return status;
}
-
/* Copy data from new structures to posSearchItems */
_PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
internal_pssm->ncols, internal_pssm->nrows);
- _PSICopyMatrix_int(posSearch->posPrivateMatrix, internal_pssm->scaled_pssm,
+ _PSICopyMatrix_int(posSearch->posPrivateMatrix,
+ internal_pssm->scaled_pssm,
internal_pssm->ncols, internal_pssm->nrows);
- _PSICopyMatrix_double(posSearch->posFreqs, internal_pssm->freq_ratios,
+ _PSICopyMatrix_double(posSearch->posFreqs,
+ internal_pssm->freq_ratios,
internal_pssm->ncols, internal_pssm->nrows);
- status = Kappa_impalaScaling(posSearch,
- compactSearch,
- (double) SCALING_FACTOR,
- FALSE,
- sbp);
+ status = Kappa_impalaScaling(posSearch, compactSearch, (double)
+ SCALING_FACTOR, FALSE, sbp);
if (status != 0) {
internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
posSearch = Kappa_posSearchItemsFree(posSearch);
compactSearch = Kappa_compactSearchItemsFree(compactSearch);
return status;
}
-
internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
posSearch = Kappa_posSearchItemsFree(posSearch);
compactSearch = Kappa_compactSearchItemsFree(compactSearch);
@@ -1881,167 +414,49 @@ scalePosMatrix(int **fillPosMatrix,
}
-/**
- * Kappa_WindowInfo - a struct whose instances represent a range
- * of data in a sequence. */
-typedef struct Kappa_WindowInfo
+static BlastCompo_Alignment *
+s_ResultHspToDistinctAlign(BlastQueryInfo* queryInfo,
+ BlastHSP * hsp_array[], Int4 hspcnt,
+ double localScalingFactor)
{
- Int4 begin; /**< the starting index of the range */
- Int4 end; /**< one beyond the last item in the range */
- Int4 frame; /**< the translation frame of this window */
- Int4 hspcnt; /**< the number of HSPs aligned to a subset of the data
- in this window's range. */
-} Kappa_WindowInfo;
-
-
-/**
- * A datatype used solely to enable a list of windows and of indices
- * to be simultaneously sorted in the WindowsFromHSPs routine.
- */
-typedef struct Kappa_WindowIndexPair {
- Kappa_WindowInfo * window; /**< a window */
- Int4 index; /**< an index associated with
- "window," typically the index of
- the window in a list, before the
- list is sorted. */
-} Kappa_WindowIndexPair;
-
-/**
- * A comparison routine used to sort a list of Kappa_WindowIndexPair
- * objects first by frame and then by location.
- */
-static int
-location_compare_windows(const void * vp1, const void *vp2)
-{
- /* w1 and w2 are the windows being compared */
- Kappa_WindowInfo * w1 = ((Kappa_WindowIndexPair *) vp1)->window;
- Kappa_WindowInfo * w2 = ((Kappa_WindowIndexPair *) vp2)->window;
-
- Int4 result; /* result of the comparison */
- if(0 == (result = BLAST_CMP(w1->frame, w2->frame)) &&
- 0 == (result = BLAST_CMP(w1->begin, w2->begin))) {
- result = BLAST_CMP(w1->end, w2->end);
- }
- return (int) result;
-}
-
-
-/**
- * Reads a array of HSPs and creates a new array of pointers to
- * Kappa_WindowInfo so that each element in the array of HSPs is
- * contained in exactly one window
- *
- * @param hsp_array hsp array to be read [in]
- * @param hspcnt length of hsp_array [in]
- * @param border Number of extra amino acids to include
- * at the start and end of each HSP.
- * @param sequence_length length of the sequence containing these
- * HSPs, in nucleotide coordinates.
- * @param pwindows a pointer to an array of windows;
- * the array may be resized by this routine. [in][out]
- * @param nWindows the number of windows in *pwindows [in][out]
- * @param lWindows the allocated length of *pwindows [in][out]
- * @param window_of_hsp HSP i is contained in the bounds of
- * window_of_hsp[i] [in][out]
- */
-static void
-WindowsFromHSPs(
- BlastHSP * hsp_array[],
- Int4 hspcnt,
- Int4 border,
- Int4 sequence_length,
- Kappa_WindowInfo ***pwindows,
- Int4 * nWindows,
- Int4 * lWindows,
- Int4 * window_of_hsp)
-{
- Int4 k, ell;
- Kappa_WindowIndexPair * window_and_index; /* an array of windows
- * paired with the index
- * of the HSP that
- * generated them */
- Kappa_WindowInfo ** windows; /* the output list of windows */
- Int4 start_cluster; /* start of a cluster of windows to be joined */
- Int4 length_joined; /* the current length of the list of joined windows */
-
- windows = *pwindows;
- /* Make the window list have exactly hspcnt windows. */
- if( *lWindows < hspcnt ) {
- *lWindows = 2 * hspcnt;
- windows = *pwindows =
- realloc(*pwindows, *lWindows * sizeof(Kappa_WindowInfo*));
- }
- for( k = *nWindows; k < hspcnt; k++ ) {
- windows[k] = malloc(sizeof(Kappa_WindowInfo));
- }
- for( k = hspcnt; k < *nWindows; k++ ) {
- sfree(windows[k]);
- }
- *nWindows = hspcnt;
-
- window_and_index = calloc(hspcnt, sizeof(Kappa_WindowIndexPair));
-
- for( k = 0; k < hspcnt; k++ ) { /* for all HSPs */
- /* length of the translation of the nucleotide sequence in this frame */
- Int4 translated_length;
-
- windows[k]->frame = hsp_array[k]->subject.frame;
-
- if( windows[k]->frame > 0 ) {
- translated_length = (sequence_length - windows[k]->frame + 1)/3;
- } else {
- translated_length = (sequence_length + windows[k]->frame - 1)/3;
+ BlastCompo_Alignment *aligns = NULL, *tail = NULL, *new_align = NULL;
+ int hsp_index;
+ for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) {
+ int queryIndex, queryEnd, matchEnd;
+ BlastHSP * hsp = hsp_array[hsp_index];
+ queryEnd = hsp->query.end;
+ matchEnd = hsp->subject.end;
+ /* TODO: multiple queries are not handled yet; queryIndex is always 0 */
+ /*
+ if(search->mult_queries != NULL) {
+ queryIndex =
+ GetQueryNum(search->mult_queries,
+ hsp->query_offset, queryEnd - 1, 0);
+ } else {
+ queryIndex = 0;
+ }
+ */
+ queryIndex = 0;
+ new_align =
+ BlastCompo_AlignmentNew(hsp->score * localScalingFactor,
+ eNoCompositionAdjustment,
+ hsp->query.offset, queryEnd, queryIndex,
+ hsp->subject.offset, matchEnd,
+ hsp->subject.frame, hsp);
+ if (new_align == NULL) /* out of memory */
+ goto error_return;
+ if (tail == NULL) {
+ aligns = new_align;
+ } else {
+ tail->next = new_align;
+ }
+ tail = new_align;
}
- windows[k]->begin = MAX(0, hsp_array[k]->subject.offset - border);
- windows[k]->end = MIN(translated_length,
- hsp_array[k]->subject.end + border);
- windows[k]->hspcnt = 1;
-
- window_and_index[k].index = k;
- window_and_index[k].window = windows[k];
- }
- qsort(window_and_index, hspcnt, sizeof(Kappa_WindowIndexPair),
- location_compare_windows);
-
- /* Join windows that overlap or are too close together. */
- start_cluster = 0;
- length_joined = 0;
- for( k = 0; k < hspcnt; k++ ) { /* for all windows in the
- original list */
- Kappa_WindowInfo * window; /* window at this value of k */
- Kappa_WindowInfo * nextWindow; /* window at the next value of k, or
- NULL if no such window exists */
- window = window_and_index[k].window;
- nextWindow = ( k + 1 < hspcnt ) ? window_and_index[k+1].window : NULL;
-
- if(nextWindow != NULL && /* there is a next window; and */
- window->frame == nextWindow->frame && /* it is in the same frame; and
- it is very near this one */
- window->end >= nextWindow->begin) {
- /* Join the current window with the next window. Do not add the
- current window to the output list. */
- nextWindow->begin = MIN(window->begin, nextWindow->begin);
- nextWindow->end = MAX(window->end, nextWindow->end );
-
- sfree(window);
- window_and_index[k].window = NULL; /* Set the now dangling
- pointer to NULL */
- } else {
- /* Don't join the current window with the next window. Add the
- current window to the output list instead */
- windows[length_joined] = window;
- for( ell = start_cluster; ell <= k; ell++ ) {
- window_of_hsp[window_and_index[ell].index] = length_joined;
- }
- length_joined++;
- start_cluster = k + 1;
- } /* end else don't join the current window with the next window */
- } /* end for all windows in the original list */
- *nWindows = length_joined;
- for( k = length_joined; k < hspcnt; k++ ) {
- windows[k] = NULL;
- }
- sfree(window_and_index);
+ goto normal_return;
+ error_return:
+ BlastCompo_AlignmentsFree(&aligns, NULL);
+ normal_return:
+ return aligns;
}
@@ -2075,45 +490,45 @@ WindowsFromHSPs(
* algorithm
*/
static void
-Kappa_SWFindFinalEndsUsingXdrop(
- Kappa_SequenceData * query,
- Int4 queryStart,
- Int4 queryEnd,
- Kappa_SequenceData * subject,
- Int4 matchStart,
- Int4 matchEnd,
- BlastGapAlignStruct* gap_align,
- const BlastScoringParameters* scoringParams,
- Int4 score,
- double localScalingFactor,
- Int4 * queryAlignmentExtent,
- Int4 * matchAlignmentExtent,
- Int4 * newScore)
+s_SWFindFinalEndsUsingXdrop(
+ BlastCompo_SequenceData * query,
+ Int4 queryStart,
+ Int4 queryEnd,
+ BlastCompo_SequenceData * subject,
+ Int4 matchStart,
+ Int4 matchEnd,
+ BlastGapAlignStruct* gap_align,
+ const BlastScoringParameters* scoringParams,
+ Int4 score,
+ double localScalingFactor,
+ Int4 * queryAlignmentExtent,
+ Int4 * matchAlignmentExtent,
+ Int4 * newScore)
{
- Int4 XdropAlignScore; /* alignment score obtained using X-dropoff
- * method rather than Smith-Waterman */
- Int4 doublingCount = 0; /* number of times X-dropoff had to be
- * doubled */
-
- GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
- GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
- do {
- XdropAlignScore =
- ALIGN_EX(&(query->data[queryStart]) - 1,
- &(subject->data[matchStart]) - 1,
- queryEnd - queryStart + 1, matchEnd - matchStart + 1,
- queryAlignmentExtent,
- matchAlignmentExtent, gap_align->fwd_prelim_tback,
- gap_align, scoringParams, queryStart - 1, FALSE, FALSE);
-
- gap_align->gap_x_dropoff *= 2;
- doublingCount++;
- if((XdropAlignScore < score) && (doublingCount < 3)) {
- GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
- }
- } while((XdropAlignScore < score) && (doublingCount < 3));
+ Int4 XdropAlignScore; /* alignment score obtained using X-dropoff
+ * method rather than Smith-Waterman */
+ Int4 doublingCount = 0; /* number of times X-dropoff had to be
+ * doubled */
+
+ GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
+ GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
+ do {
+ XdropAlignScore =
+ ALIGN_EX(&(query->data[queryStart]) - 1,
+ &(subject->data[matchStart]) - 1,
+ queryEnd - queryStart + 1, matchEnd - matchStart + 1,
+ queryAlignmentExtent,
+ matchAlignmentExtent, gap_align->fwd_prelim_tback,
+ gap_align, scoringParams, queryStart - 1, FALSE, FALSE);
+
+ gap_align->gap_x_dropoff *= 2;
+ doublingCount++;
+ if((XdropAlignScore < score) && (doublingCount < 3)) {
+ GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
+ }
+ } while((XdropAlignScore < score) && (doublingCount < 3));
- *newScore = XdropAlignScore;
+ *newScore = XdropAlignScore;
}
@@ -2127,24 +542,23 @@ Kappa_SWFindFinalEndsUsingXdrop(
* We draw a distinction between a sequence itself, and strings of
* data that may be obtained from the sequence. The amino
* acid/nucleotide data is represented by an object of type
- * Kappa_SequenceData. There may be more than one instance of
- * Kappa_SequenceData per Kappa_MatchingSequence, each representing a
+ * BlastCompo_SequenceData. There may be more than one instance of
+ * BlastCompo_SequenceData per BlastCompo_MatchingSequence, each representing a
* different range in the sequence, or a different translation frame.
*/
-typedef struct Kappa_MatchingSequence {
- Int4 length; /**< length of this matching sequence */
- Int4 index; /**< index of this sequence in the database */
- EBlastProgramType prog_number; /**< identifies the type of blast search being
- performed. The type of search determines
- how sequence data should be obtained. */
- const Uint1* genetic_code; /**< genetic code for translated searches */
- const BlastSeqSrc* seq_src; /**< BLAST sequence data source */
- BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method of the
- BlastSeqSrc (@todo this structure was
- designed to be allocated on the stack, i.e.:
- in Kappa_MatchingSequenceInitialize)
- */
-} Kappa_MatchingSequence;
+typedef struct Kappa_SequenceLocalData {
+ EBlastProgramType prog_number; /**< identifies the type of blast
+ search being performed. The type
+ of search determines how sequence
+ data should be obtained. */
+ const Uint1* genetic_code; /**< genetic code for translated searches */
+ const BlastSeqSrc* seq_src; /**< BLAST sequence data source */
+ BlastSeqSrcGetSeqArg seq_arg; /**< argument to GetSequence method
+ of the BlastSeqSrc (@todo this
+ structure was designed to be
+ allocated on the stack, i.e.: in
+ s_MatchingSequenceInitialize) */
+} Kappa_SequenceLocalData;
/**
@@ -2155,47 +569,59 @@ typedef struct Kappa_MatchingSequence {
* @param seqSrc A pointer to a source from which sequence data
* may be obtained
* @param program_number identifies the type of blast search being
- performed.
+ * performed.
* @param gen_code_string genetic code for translated queries
* @param subject_index index of the matching sequence in the database
*/
static void
-Kappa_MatchingSequenceInitialize(
- Kappa_MatchingSequence * self,
- EBlastProgramType program_number,
- const BlastSeqSrc* seqSrc,
- const Uint1* gen_code_string,
- Int4 subject_index)
+s_MatchingSequenceInitialize(
+ BlastCompo_MatchingSequence * self,
+ EBlastProgramType program_number,
+ const BlastSeqSrc* seqSrc,
+ const Uint1* gen_code_string,
+ Int4 subject_index)
{
- self->seq_src = seqSrc;
- self->prog_number = program_number;
- self->genetic_code = gen_code_string;
-
- memset((void*) &self->seq_arg, 0, sizeof(self->seq_arg));
- self->seq_arg.oid = self->index = subject_index;
-
- if( program_number == eBlastTypeTblastn ) {
- self->seq_arg.encoding = eBlastEncodingNcbi4na;
- } else {
- self->seq_arg.encoding = eBlastEncodingProtein;
- }
-
- if (BlastSeqSrcGetSequence(seqSrc, (void*) &self->seq_arg) < 0)
- return;
- self->length = BlastSeqSrcGetSeqLen(seqSrc, (void*) &self->seq_arg);
+ Kappa_SequenceLocalData * local_data =
+ malloc(sizeof(Kappa_SequenceLocalData));
+ self->local_data = local_data;
+
+ local_data->seq_src = seqSrc;
+ local_data->prog_number = program_number;
+ local_data->genetic_code = gen_code_string;
+
+ memset((void*) &local_data->seq_arg, 0, sizeof(local_data ->seq_arg));
+ local_data->seq_arg.oid = self->index = subject_index;
+
+ if( program_number == eBlastTypeTblastn ) {
+ local_data->seq_arg.encoding = eBlastEncodingNcbi4na;
+ } else {
+ local_data->seq_arg.encoding = eBlastEncodingProtein;
+ }
+ if (BlastSeqSrcGetSequence(seqSrc, (void*) &local_data->seq_arg) >= 0) {
+ self->length =
+ BlastSeqSrcGetSeqLen(seqSrc, (void*) &local_data->seq_arg);
+ } else {
+ self->length = 0;
+ }
}
/** Release the resources associated with a matching sequence. */
static void
-Kappa_MatchingSequenceRelease(Kappa_MatchingSequence * self)
+s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
{
- BlastSeqSrcReleaseSequence(self->seq_src, (void*)&self->seq_arg);
- BlastSequenceBlkFree(self->seq_arg.seq);
+ if (self != NULL) {
+ Kappa_SequenceLocalData * local_data = self->local_data;
+ BlastSeqSrcReleaseSequence(local_data->seq_src,
+ (void*)&local_data->seq_arg);
+ BlastSequenceBlkFree(local_data->seq_arg.seq);
+ free(self->local_data);
+ self->local_data = NULL;
+ }
}
-/** NCBIstdaa encoding for 'X' character (@todo is this really needed?) */
+/** NCBIstdaa encoding for 'X' character */
#define BLASTP_MASK_RESIDUE 21
/** Default instructions and mask residue for SEG filtering */
#define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
@@ -2205,86 +631,87 @@ Kappa_MatchingSequenceRelease(Kappa_MatchingSequence * self)
* Obtain a string of translated data
*
* @param self the sequence from which to obtain the data [in]
- * @param window the range and tranlation frame to get [in]
+ * @param range the range and translation frame to get [in]
* @param seqData the resulting data [out]
*/
static void
-Kappa_SequenceGetTranslatedWindow(Kappa_MatchingSequence * self,
- Kappa_WindowInfo * window,
- Kappa_SequenceData * seqData )
+s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
+ const BlastCompo_SequenceRange * range,
+ BlastCompo_SequenceData * seqData )
{
- ASSERT( 0 && "Not implemented" );
+ ASSERT( 0 && "Not implemented" );
}
/**
- * Obtain the sequence data that lies within the given window.
+ * Obtain the sequence data that lies within the given range.
*
* @param self sequence information [in]
- * @param window window specifying the range of data [in]
+ * @param range the range of data to obtain [in]
* @param seqData the sequence data obtained [out]
*/
-static void
-Kappa_SequenceGetWindow(
- Kappa_MatchingSequence * self,
- Kappa_WindowInfo * window,
- Kappa_SequenceData * seqData )
+static int
+s_SequenceGetRange(
+ const BlastCompo_MatchingSequence * self,
+ const BlastCompo_SequenceRange * range,
+ BlastCompo_SequenceData * seqData )
{
- if(self->prog_number == eBlastTypeTblastn) {
- /* The sequence must be translated. */
- Kappa_SequenceGetTranslatedWindow(self, window, seqData);
- } else {
- /* The sequence does not need to be translated. */
- Int4 idx;
- Uint1 *origData; /* the unfiltered data for the sequence */
-
- /* Copy the entire sequence (necessary for SEG filtering.) */
- seqData->buffer = calloc((self->length + 2), sizeof(Uint1));
- /* First and last characters of the buffer MUST be '\0', which is
- * true here because the buffer was allocated using calloc. */
- seqData->data = seqData->buffer + 1;
- seqData->length = self->length;
-
- origData = self->seq_arg.seq->sequence;
- for( idx = 0; idx < seqData->length; idx++ ) {
- /* Copy the sequence data, replacing occurrences of amino acid
- * number 24 (Selenocysteine) with number 21 (Undetermined or
- * atypical). */
- if(origData[idx] != 24) {
- seqData->data[idx] = origData[idx];
- } else {
- seqData->data[idx] = 21;
- fprintf(stderr, "Selenocysteine (U) at position %ld"
- " replaced by X\n",
- (long) idx + 1);
- }
- }
+ Kappa_SequenceLocalData * local_data = self->local_data;
+ if (local_data->prog_number == eBlastTypeTblastn) {
+ /* The sequence must be translated. */
+ s_SequenceGetTranslatedRange(self, range, seqData);
+ } else {
+ /* The sequence does not need to be translated. */
+ Int4 idx;
+ Uint1 *origData; /* the unfiltered data for the sequence */
+
+ /* Copy the entire sequence (necessary for SEG filtering.) */
+ seqData->buffer = calloc((self->length + 2), sizeof(Uint1));
+ /* First and last characters of the buffer MUST be '\0', which is
+ * true here because the buffer was allocated using calloc. */
+ seqData->data = seqData->buffer + 1;
+ seqData->length = self->length;
+
+ origData = local_data->seq_arg.seq->sequence;
+ for (idx = 0; idx < seqData->length; idx++) {
+ /* Copy the sequence data, replacing occurrences of amino acid
+ * number 24 (Selenocysteine) with number 21 (Undetermined or
+ * atypical). */
+ if (origData[idx] != 24) {
+ seqData->data[idx] = origData[idx];
+ } else {
+ seqData->data[idx] = 21;
+ fprintf(stderr, "Selenocysteine (U) at position %ld"
+ " replaced by X\n",
+ (long) idx + 1);
+ }
+ }
#ifndef KAPPA_NO_SEG_SEQUENCE
- /*take as input an amino acid string and its length; compute a filtered
- amino acid string and return the filtered string*/
- {{
- BlastSeqLoc* mask_seqloc;
- const EBlastProgramType k_program_name = eBlastTypeBlastp;
- SBlastFilterOptions* filter_options;
-
- BlastFilteringOptionsFromString(k_program_name, BLASTP_MASK_INSTRUCTIONS, &filter_options, NULL);
-
- BlastSetUp_Filter(k_program_name, seqData->data, seqData->length,
- 0, filter_options, &mask_seqloc, NULL);
-
- filter_options = SBlastFilterOptionsFree(filter_options);
-
- Blast_MaskTheResidues(seqData->data, seqData->length,
- FALSE, mask_seqloc, FALSE, 0);
-
- mask_seqloc = BlastSeqLocFree(mask_seqloc);
- }}
+ /* take as input an amino acid string and its length; compute
+ * a filtered amino acid string and return the filtered string */
+ {{
+ BlastSeqLoc* mask_seqloc;
+ const EBlastProgramType k_program_name = eBlastTypeBlastp;
+ SBlastFilterOptions* filter_options;
+
+ BlastFilteringOptionsFromString(k_program_name,
+ BLASTP_MASK_INSTRUCTIONS,
+ &filter_options, NULL);
+ BlastSetUp_Filter(k_program_name, seqData->data, seqData->length,
+ 0, filter_options, &mask_seqloc, NULL);
+ filter_options = SBlastFilterOptionsFree(filter_options);
+
+ Blast_MaskTheResidues(seqData->data, seqData->length,
+ FALSE, mask_seqloc, FALSE, 0);
+ mask_seqloc = BlastSeqLocFree(mask_seqloc);
+ }}
#endif
- /* Fit the data to the window. */
- seqData ->data = &seqData->data[window->begin - 1];
- *seqData->data++ = '\0';
- seqData ->length = window->end - window->begin;
- } /* end else the sequence does not need to be translated */
+ /* Fit the data to the range. */
+ seqData ->data = &seqData->data[range->begin - 1];
+ *seqData->data++ = '\0';
+ seqData ->length = range->end - range->begin;
+ } /* end else the sequence does not need to be translated */
+ return 0;
}
@@ -2303,49 +730,95 @@ Kappa_SequenceGetWindow(
* @param query the query data [in]
* @param subject the subject data [in]
*/
+/* WHY */
static void
-StartingPointForHit(
- Int4 * q_start,
- Int4 * s_start,
- const BlastScoreBlk* sbp,
- Boolean positionBased,
- BlastHSP * hsp,
- Kappa_WindowInfo * window,
- Kappa_SequenceData * query,
- Kappa_SequenceData * subject)
+s_StartingPointForHit(Int4 * q_start,
+ Int4 * s_start,
+ const BlastScoreBlk* sbp,
+ Boolean positionBased,
+ BlastHSP * hsp,
+ BlastCompo_SequenceRange * range,
+ BlastCompo_SequenceData * query,
+ BlastCompo_SequenceData * subject)
+{
+ hsp->subject.offset -= range->begin;
+ hsp->subject.gapped_start -= range->begin;
+
+ if(BLAST_CheckStartForGappedAlignment(hsp, query->data,
+ subject->data, sbp)) {
+ /* We may use the starting point supplied by the HSP. */
+ *q_start = hsp->query.gapped_start;
+ *s_start = hsp->subject.gapped_start;
+ } else {
+ /* We must recompute the start for the gapped alignment, as the
+ one in the HSP was unacceptable.*/
+ *q_start =
+ BlastGetStartForGappedAlignment(query->data,
+ subject->data, sbp,
+ hsp->query.offset,
+ hsp->query.end -
+ hsp->query.offset,
+ hsp->subject.offset,
+ hsp->subject.end -
+ hsp->subject.offset);
+ *s_start =
+ (hsp->subject.offset - hsp->query.offset) + *q_start;
+ }
+}
+
+
+struct Blast_GappingParamsContext {
+ BlastGapAlignStruct * gap_align;
+ const BlastScoringParameters* scoringParams;
+ BlastScoreBlk* sbp;
+ double localScalingFactor;
+ Int4 prog_number;
+};
+typedef struct Blast_GappingParamsContext Blast_GappingParamsContext;
+
+
+/**
+ * Reads a GapAlignBlk that has been used to compute a traceback, and
+ * return a BlastCompo_Alignment representing the alignment.
+ *
+ * @param gap_align the GapAlignBlk
+ * @param subject_range the subject range used to compute the traceback
+ */
+static BlastCompo_Alignment *
+s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
+ BlastCompo_SequenceRange * query_range,
+ BlastCompo_SequenceRange * subject_range,
+ int whichMode)
{
- hsp->subject.offset -= window->begin;
- hsp->subject.gapped_start -= window->begin;
-
- if(BLAST_CheckStartForGappedAlignment(hsp, query->data,
- subject->data, sbp)) {
- /* We may use the starting point supplied by the HSP. */
- *q_start = hsp->query.gapped_start;
- *s_start = hsp->subject.gapped_start;
- } else {
- /* We must recompute the start for the gapped alignment, as the
- one in the HSP was unacceptable.*/
- *q_start =
- BlastGetStartForGappedAlignment(query->data, subject->data, sbp,
- hsp->query.offset, hsp->query.end - hsp->query.offset,
- hsp->subject.offset, hsp->subject.end - hsp->subject.offset);
-
- *s_start =
- (hsp->subject.offset - hsp->query.offset) + *q_start;
- }
+ int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
+ BlastCompo_Alignment * obj; /* the new alignment */
+
+ queryStart = gap_align->query_start + query_range->begin;
+ queryEnd = gap_align->query_stop + query_range->begin;
+ queryIndex = query_range->context;
+ matchStart = gap_align->subject_start + subject_range->begin;
+ matchEnd = gap_align->subject_stop + subject_range->begin;
+ frame = subject_range->context;
+
+ obj = BlastCompo_AlignmentNew(gap_align->score, whichMode,
+ queryStart, queryEnd, queryIndex,
+ matchStart, matchEnd, frame,
+ gap_align->edit_script);
+ gap_align->edit_script = NULL;
+ return obj;
}
/**
- * Create a new Kappa_DistinctAlignment and append the list of
+ * Create a new BlastCompo_Alignment and append the list of
* alignments represented by "next."
*
* @param query query sequence data
* @param queryStart the start of the alignment in the query
* @param queryEnd the end of the alignment in the query
* @param subject subject sequence data
- * @param matchStart the start of the alignment in the subject window
- * @param matchEnd the end of the alignment in the subject window
+ * @param matchStart the start of the alignment in the subject range
+ * @param matchEnd the end of the alignment in the subject range
* @param score the score of this alignment
* @param window the subject window of this alignment
* @param gap_align alignment info for gapped alignments
@@ -2356,210 +829,163 @@ StartingPointForHit(
* @param prog_number the type of alignment being performed
* @param next preexisting list of alignments [out]
*/
-static Kappa_DistinctAlignment *
-NewAlignmentUsingXdrop(
- Kappa_SequenceData * query,
- Int4 queryStart,
- Int4 queryEnd,
- Kappa_SequenceData * subject,
- Int4 matchStart,
- Int4 matchEnd,
- Int4 score,
- Kappa_WindowInfo * window,
- BlastGapAlignStruct * gap_align,
- const BlastScoringParameters* scoringParams,
- double localScalingFactor,
- Int4 prog_number,
- Kappa_DistinctAlignment * next)
+static int
+s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
+ Int4 * pqueryEnd, Int4 *pmatchEnd,
+ Int4 queryStart, Int4 matchStart, Int4 score,
+ BlastCompo_SequenceData * query,
+ BlastCompo_SequenceRange * query_range,
+ Int4 queryLength,
+ BlastCompo_SequenceData * subject,
+ BlastCompo_SequenceRange * subject_range,
+ Int4 subjectLength,
+ BlastCompo_GappingParams * gapping_params,
+ ECompoAdjustModes whichMode)
{
- Int4 newScore;
- /* Extent of the alignment as computed by an x-drop alignment
- * (usually the same as (queryEnd - queryStart) and (matchEnd -
- * matchStart)) */
- Int4 queryExtent, matchExtent;
- Kappa_DistinctAlignment * obj; /* the new object */
-
- Kappa_SWFindFinalEndsUsingXdrop(query, queryStart, queryEnd,
- subject, matchStart, matchEnd,
- gap_align, scoringParams,
- score, localScalingFactor,
- &queryExtent, &matchExtent,
- &newScore);
- obj = malloc(sizeof(Kappa_DistinctAlignment));
- obj->editScript =
- Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
- gap_align->fwd_prelim_tback);
-
- obj->score = newScore;
- obj->queryStart = queryStart;
- obj->queryEnd = obj->queryStart + queryExtent;
- obj->matchStart = matchStart + window->begin;
- obj->matchEnd = obj->matchStart + matchExtent;
- obj->frame = window->frame;
-
- obj->next = next;
-
- return obj;
+ Int4 newScore;
+ /* Extent of the alignment as computed by an x-drop alignment
+ * (usually the same as (queryEnd - queryStart) and (matchEnd -
+ * matchStart)) */
+ Int4 queryExtent, matchExtent;
+ BlastCompo_Alignment * obj; /* the new object */
+ Blast_GappingParamsContext * context = gapping_params->context;
+ BlastGapAlignStruct * gap_align = context->gap_align;
+ const BlastScoringParameters* scoringParams = context->scoringParams;
+ double localScalingFactor = context->localScalingFactor;
+ GapEditScript* editScript;
+
+ s_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd,
+ subject, matchStart, *pmatchEnd,
+ gap_align, scoringParams,
+ score, localScalingFactor,
+ &queryExtent, &matchExtent,
+ &newScore);
+ *pqueryEnd = queryStart + queryExtent;
+ *pmatchEnd = matchStart + matchExtent;
+
+ editScript =
+ Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
+ gap_align->fwd_prelim_tback);
+ obj = BlastCompo_AlignmentNew(newScore, whichMode,
+ queryStart, *pqueryEnd,
+ query_range->context,
+ matchStart, *pmatchEnd,
+ subject_range->context, editScript);
+ *pnewAlign = obj;
+
+ return 0;
}
-/**
- * Reads a GapAlignBlk that has been used to compute a traceback, and
- * return a Kappa_DistinctAlignment representing the alignment.
- *
- * @param gap_align the GapAlignBlk
- * @param window the window used to compute the traceback
- */
-static Kappa_DistinctAlignment *
-NewAlignmentFromGapAlign(
- BlastGapAlignStruct * gap_align,
- Kappa_WindowInfo * window)
+static BlastCompo_Alignment *
+s_RedoOneAlignment(BlastCompo_Alignment * in_align,
+ ECompoAdjustModes whichMode,
+ BlastCompo_SequenceData * query_data,
+ BlastCompo_SequenceRange * query_range,
+ int ccat_query_length,
+ BlastCompo_SequenceData * subject_data,
+ BlastCompo_SequenceRange * subject_range,
+ int full_subject_length,
+ BlastCompo_GappingParams * gapping_params)
{
- Kappa_DistinctAlignment * obj; /* the new alignment */
- obj = malloc(sizeof(Kappa_DistinctAlignment));
-
- obj->score = gap_align->score;
- obj->queryStart = gap_align->query_start;
- obj->queryEnd = gap_align->query_stop;
- obj->matchStart = gap_align->subject_start + window->begin;
- obj->matchEnd = gap_align->subject_stop + window->begin;
- obj->frame = window->frame;
-
- obj->editScript = gap_align->edit_script;
- gap_align->edit_script = NULL; /* set to NULL to avoid aliasing */
- obj->next = NULL;
-
- return obj;
+ Int4 q_start, s_start;
+ Blast_GappingParamsContext * context = gapping_params->context;
+ BlastScoreBlk* sbp = context->sbp;
+ BlastGapAlignStruct* gapAlign = context->gap_align;
+ Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE);
+ BlastHSP * hsp = in_align->context;
+
+ s_StartingPointForHit(&q_start, &s_start, sbp, positionBased,
+ hsp, subject_range, query_data, subject_data);
+ gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
+
+ BLAST_GappedAlignmentWithTraceback(context->prog_number,
+ query_data->data,
+ subject_data->data, gapAlign,
+ context->scoringParams,
+ q_start, s_start,
+ query_data->length,
+ subject_data->length);
+ return s_NewAlignmentFromGapAlign(gapAlign, query_range, subject_range,
+ whichMode);
}
/**
- * A Kappa_SearchParameters represents the data needed by
+ * A s_SearchParameters represents the data needed by
* RedoAlignmentCore to adjust the parameters of a search, including
* the original value of these parameters
*/
-typedef struct Kappa_SearchParameters {
- Int4 gapOpen; /**< a penalty for the existence of a gap */
- Int4 gapExtend; /**< a penalty for each residue (or
- nucleotide) in the gap */
- Int4 gapDecline; /**< a penalty for declining to align a pair
- of residues */
- Int4 mRows; /**< the number of rows in a scoring matrix. */
- Int4 nCols; /**< the number of columns in a scoring
- matrix */
-
- double scaledUngappedLambda; /**< The value of Karlin-Altschul
- parameter lambda, rescaled
- to allow scores to have
- greater precision */
- Int4 **origMatrix; /**< The original matrix values */
- Int4 **startMatrix; /**< Rescaled values of the original matrix */
-
- double **startFreqRatios; /**< frequency ratios to start
- investigating each pair */
- double *scoreArray; /**< array of score probabilities */
- double *resProb; /**< array of probabilities for each residue
- in a matching sequence */
- double *queryProb; /**< array of probabilities for each residue
- in the query */
- Boolean adjustParameters; /**< Use composition-based statistics
- if true. */
-
- Blast_ScoreFreq* return_sfp; /**< score frequency pointers to
- compute lambda */
- Blast_KarlinBlk *kbp_gap_orig; /**< copy of the original gapped
- Karlin-Altschul block corresponding to
- the first context */
- Blast_KarlinBlk **orig_kbp_gap_array; /**< pointer to the array of gapped
- Karlin-Altschul block for all
- contexts (@todo is this really
- needed?) */
- double scale_factor; /**< The original scale factor (to be restored). */
-} Kappa_SearchParameters;
+typedef struct s_SearchParameters {
+ Int4 gap_open; /**< a penalty for the existence of a gap */
+ Int4 gapExtend; /**< a penalty for each residue in the
+ gap */
+ Int4 gapDecline; /**< a penalty for declining to align a pair
+ of residues */
+ double scale_factor; /**< the original scale factor */
+ Int4 **origMatrix; /**< The original matrix values */
+ double original_expect_value; /**< expect value on entry */
+ /** copy of the original gapped Karlin-Altschul block
+ * corresponding to the first context */
+ Blast_KarlinBlk* kbp_gap_orig;
+ /** pointer to the array of gapped Karlin-Altschul block for all
+ * contexts; needed to restore the search to its original
+ * configuration. */
+ Blast_KarlinBlk** orig_kbp_gap_array;
+} s_SearchParameters;
/**
- * Release the data associated with a Kappa_SearchParameters and
+ * Release the data associated with a s_SearchParameters and
* delete the object
* @param searchParams the object to be deleted [in][out]
*/
static void
-Kappa_SearchParametersFree(Kappa_SearchParameters ** searchParams)
+s_SearchParametersFree(s_SearchParameters ** searchParams)
{
- /* for convenience, remove one level of indirection from searchParams */
- Kappa_SearchParameters *sp = *searchParams;
+ /* for convenience, remove one level of indirection from searchParams */
+ s_SearchParameters *sp = *searchParams;
- if(sp->kbp_gap_orig) Blast_KarlinBlkFree(sp->kbp_gap_orig);
+ if(sp->kbp_gap_orig) Blast_KarlinBlkFree(sp->kbp_gap_orig);
- if(sp->startMatrix)
- _PSIDeallocateMatrix((void**) sp->startMatrix, sp->mRows);
- if(sp->origMatrix)
- _PSIDeallocateMatrix((void**) sp->origMatrix, sp->mRows);
- if(sp->startFreqRatios)
- _PSIDeallocateMatrix((void**) sp->startFreqRatios, sp->mRows);
+ Nlm_Int4MatrixFree(&sp->origMatrix);
- if(sp->return_sfp) sfree(sp->return_sfp);
- if(sp->scoreArray) sfree(sp->scoreArray);
- if(sp->resProb) sfree(sp->resProb);
- if(sp->queryProb) sfree(sp->queryProb);
-
- sfree(*searchParams);
- *searchParams = NULL;
+ sfree(*searchParams);
+ *searchParams = NULL;
}
/**
- * Create a new instance of Kappa_SearchParameters
+ * Create a new instance of s_SearchParameters
*
* @param rows number of rows in the scoring matrix
- * @param adjustParameters if true, use composition-based statistics
+ * @param adjustParameters if >0, use composition-based statistics
+ * @param numQueries the number of queries in the concatenated
+ * query
* @param positionBased if true, the search is position-based
*/
-static Kappa_SearchParameters *
-Kappa_SearchParametersNew(
- Int4 rows,
- Boolean adjustParameters,
- Boolean positionBased)
+static s_SearchParameters *
+s_SearchParametersNew(
+ Int4 rows,
+ Int4 adjustParameters,
+ Boolean positionBased)
{
- Kappa_SearchParameters *sp; /* the new object */
- sp = malloc(sizeof(Kappa_SearchParameters));
-
- sp->orig_kbp_gap_array = NULL;
-
- sp->mRows = positionBased ? rows : BLASTAA_SIZE;
- sp->nCols = BLASTAA_SIZE;
-
- sp->kbp_gap_orig = NULL;
- sp->startMatrix = NULL;
- sp->origMatrix = NULL;
- sp->startFreqRatios = NULL;
- sp->return_sfp = NULL;
- sp->scoreArray = NULL;
- sp->resProb = NULL;
- sp->queryProb = NULL;
- sp->adjustParameters = adjustParameters;
-
- if(adjustParameters) {
+ s_SearchParameters *sp; /* the new object */
+ sp = malloc(sizeof(s_SearchParameters));
+
+ sp->orig_kbp_gap_array = NULL;
+ sp->kbp_gap_orig = NULL;
+ sp->origMatrix = NULL;
+
sp->kbp_gap_orig = Blast_KarlinBlkNew();
- sp->startMatrix = (Int4**) _PSIAllocateMatrix(sp->mRows, sp->nCols,
- sizeof(Int4));
- sp->origMatrix = (Int4**) _PSIAllocateMatrix(sp->mRows, sp->nCols,
- sizeof(Int4));
- sp->resProb =
- (double *) calloc(BLASTAA_SIZE, sizeof(double));
- sp->scoreArray =
- (double *) calloc(kScoreMatrixScoreRange, sizeof(double));
- sp->return_sfp =
- (Blast_ScoreFreq*) calloc(1, sizeof(Blast_ScoreFreq));
-
- if(!positionBased) {
- sp->queryProb =
- (double *) calloc(BLASTAA_SIZE, sizeof(double));
+ if (adjustParameters) {
+ if (positionBased) {
+ sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
+ } else {
+ sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
+ }
}
- }
- /* end if(adjustParameters) */
-
- return sp;
+ return sp;
}
@@ -2567,677 +993,629 @@ Kappa_SearchParametersNew(
* Record the initial value of the search parameters that are to be
* adjusted.
*
- * @param searchParams the object to be filled in [in|out]
- * @param queryBlk query sequence [in]
- * @param queryInfo query sequence information [in]
- * @param sbp Scoring Blk (contains Karlin-Altschul parameters) [in]
- * @param scoring gap-open/extend/decline_align information [in]
- * @param positionBased is this search position-specific? [in]
- * @todo instead of hard coding 0 for context we should use queryInfo
+ * @param searchParams holds the recorded values [out]
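+ * @param queryBlk query sequence; not read by this routine [in]
+ * @param queryInfo query sequence information; not read by this
+ * routine [in]
+ * @param sbp scoring block that supplies the gapped Karlin-Altschul
+ * parameters and the score matrix to be recorded [in]
+ * @param scoring gap open/extend/decline penalties and scale factor
+ * to be recorded [in]
+ * @param query_length number of matrix rows recorded when the search
+ * is position-based [in]
+ * @param adjustParameters if true, record a copy of the score
+ * matrix [in]
+ * @param positionBased if true, the matrix recorded is the
+ * position-specific one [in]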
*/
static void
-Kappa_RecordInitialSearch(Kappa_SearchParameters * searchParams,
- BLAST_SequenceBlk * queryBlk,
- BlastQueryInfo* queryInfo,
- BlastScoreBlk* sbp,
- const BlastScoringParameters* scoring,
- Boolean positionBased)
+s_RecordInitialSearch(s_SearchParameters * searchParams,
+ BLAST_SequenceBlk * queryBlk,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ const BlastScoringParameters* scoring,
+ int query_length,
+ Boolean adjustParameters,
+ Boolean positionBased)
{
- Uint1* query; /* the query sequence */
- Int4 queryLength; /* the length of the query sequence */
- const Int4 kContextOffset = queryInfo->contexts[0].query_offset; /* offset in buffer of start of query. */
-
- query = &queryBlk->sequence[kContextOffset];
- queryLength = queryInfo->contexts[0].query_length;
- ASSERT((0 == queryInfo->first_context) &&
- (queryInfo->first_context == queryInfo->last_context));
-
- if(searchParams->adjustParameters) {
- Int4 i, j;
Blast_KarlinBlk* kbp; /* statistical parameters used to evaluate a
- * query-subject pair */
- Int4 **matrix; /* matrix used to score a local
- query-subject alignment */
-
- if(positionBased) {
- matrix = sbp->psi_matrix->pssm->data;
- ASSERT(queryLength == searchParams->mRows);
- ASSERT(queryLength == (Int4)sbp->psi_matrix->pssm->ncols);
- } else {
- matrix = sbp->matrix->data;
- Blast_FillResidueProbability(query, queryLength, searchParams->queryProb);
- }
- kbp = sbp->kbp_gap[0];
- searchParams->gapOpen = scoring->gap_open;
+ * query-subject pair */
+ /* YIKES! How do I get these! */
+ /*
+ searchParams->original_expect_value = search->pbp->cutoff_e;
+ */
+ searchParams->gap_open = scoring->gap_open;
searchParams->gapExtend = scoring->gap_extend;
searchParams->gapDecline = scoring->decline_align;
- searchParams->scale_factor = scoring->scale_factor;
+ searchParams->scale_factor = scoring->scale_factor;
searchParams->orig_kbp_gap_array = sbp->kbp_gap;
-
+ kbp = sbp->kbp_gap[0];
Blast_KarlinBlkCopy(searchParams->kbp_gap_orig, kbp);
- for(i = 0; i < searchParams->mRows; i++) {
- for(j = 0; j < BLASTAA_SIZE; j++) {
- searchParams->origMatrix[i][j] = matrix[i][j];
- }
+ if (adjustParameters) {
+ Int4 **matrix;
+ Int4 i, j; /* iteration indices */
+ int rows;
+ if (positionBased) {
+ matrix = sbp->psi_matrix->pssm->data;
+ rows = query_length;
+ } else {
+ matrix = sbp->matrix->data;
+ rows = BLASTAA_SIZE;
+ }
+ for (i = 0; i < rows; i++) {
+ for (j = 0; j < BLASTAA_SIZE; j++) {
+ searchParams->origMatrix[i][j] = matrix[i][j];
+ }
+ }
}
- }
}
/**
* Rescale the search parameters in the search object and options
* object to obtain more precision.
- *
- * @param sp record of parameters used and frequencies [in|out]
- * @param queryBlk query sequence [in]
- * @param queryInfo query sequence information [in]
- * @param sbp Scoring Blk (contains Karlin-Altschul parameters) [in]
- * @param scoringParams gap-open/extend/decline_align information [in]
- * @param positionBased is this search position-specific? [in]
- * @return scaling-factor to be used.
*/
-static double
-Kappa_RescaleSearch(Kappa_SearchParameters * sp,
- BLAST_SequenceBlk* queryBlk,
- BlastQueryInfo* queryInfo,
- BlastScoreBlk* sbp,
- BlastScoringParameters* scoringParams,
- Boolean positionBased)
+static void
+s_RescaleSearch(s_SearchParameters * sp,
+ BLAST_SequenceBlk* queryBlk,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ BlastScoringParameters* scoringParams,
+ double localScalingFactor,
+ Boolean positionBased)
{
- double localScalingFactor; /* the factor by which to
- * scale the scoring system in
- * order to obtain greater
- * precision */
-
- if(!sp->adjustParameters) {
- localScalingFactor = 1.0;
- } else {
- double initialUngappedLambda; /* initial value of the
- * statistical parameter
- * lambda used to evaluate
- * ungapped alignments */
- Blast_KarlinBlk* kbp; /* the statistical parameters used to
- * evaluate alignments of a
- * query-subject pair */
- Uint1* query; /* the query sequence */
- Int4 queryLength; /* the length of the query sequence */
-
- if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
- localScalingFactor = SCALING_FACTOR / 10;
- } else {
- localScalingFactor = SCALING_FACTOR;
- }
+ Blast_KarlinBlk* kbp; /* the statistical parameters used to
+ * evaluate alignments of a
+ * query-subject pair */
+ kbp = sbp->kbp_gap[0];
+ kbp->Lambda /= localScalingFactor;
+ kbp->logK = log(kbp->K);
+ /* YIKES! and what about the cutoff_e */
+ /*
+ search->pbp->cutoff_e = options->kappa_expect_value;
+ */
+ scoringParams->gap_open = BLAST_Nint(sp->gap_open * localScalingFactor);
+ scoringParams->gap_extend = BLAST_Nint(sp->gapExtend * localScalingFactor);
scoringParams->scale_factor = localScalingFactor;
+ if (sp->gapDecline != INT2_MAX) {
+ scoringParams->decline_align =
+ BLAST_Nint(sp->gapDecline * localScalingFactor);
+ }
+}
- scoringParams->gap_open = BLAST_Nint(sp->gapOpen * localScalingFactor);
- scoringParams->gap_extend = BLAST_Nint(sp->gapExtend * localScalingFactor);
- if(sp->gapDecline != INT2_MAX) {
- scoringParams->decline_align =
- BLAST_Nint(sp->gapDecline * localScalingFactor);
+
+/**
+ * Restore the parameters that were adjusted to their original values
+ * @param searchParams a record of the original values [in]
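+ * @param sbp scoring block whose first gapped Karlin-Altschul block
+ * is restored [out]
+ * @param matrix the scoring matrix to be restored [out]
+ * @param query_length number of matrix rows restored when the search
+ * is position-based [in]
+ * @param scoring the scoring parameters; not modified by this routine
+ * @param positionBased is this search position-specific? [in]
+ * @param adjustParameters if true, the scoring matrix is restored
+ * from the recorded copy [in]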
+ */
+static void
+s_RestoreSearch(s_SearchParameters * searchParams,
+ BlastScoreBlk* sbp,
+ Int4 ** matrix,
+ int query_length,
+ BlastScoringParameters* scoring,
+ Boolean positionBased,
+ Boolean adjustParameters)
+{
+ Blast_KarlinBlk* kbp; /* statistical parameters used to
+ evaluate the significance of
+ alignment of a query-subject
+ pair */
+ Int4 i, j;
+ /* YIKES! More stuff I don't know how to deal with */
+ /*
+ search->pbp->gap_x_dropoff_final = searchParams->gap_x_dropoff_final;
+ search->pbp->cutoff_e = searchParams->original_expect_value;
+ search->pbp->gap_open = searchParams->gap_open;
+ search->pbp->gap_extend = searchParams->gapExtend;
+ search->pbp->decline_align = searchParams->gapDecline;
+ GapAlignBlkDelete(search->gap_align);
+ search->gap_align = searchParams->orig_gap_align;
+ search->sbp->kbp_gap = searchParams->orig_kbp_gap_array;
+ */
+ kbp = sbp->kbp_gap[0];
+ Blast_KarlinBlkCopy(kbp, searchParams->kbp_gap_orig);
+
+ if(adjustParameters) {
+ int rows;
+ if (positionBased) {
+ rows = query_length;
+ } else {
+ rows = BLASTAA_SIZE;
+ }
+ for(i = 0; i < rows; i++) {
+ for(j = 0; j < BLASTAA_SIZE; j++) {
+ matrix[i][j] = searchParams->origMatrix[i][j];
+ }
+ }
}
+}
+
+static void
+s_MatrixInfoInit(Blast_MatrixInfo * self,
+ double localScalingFactor,
+ BLAST_SequenceBlk* queryBlk,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ BlastScoringParameters* scoringParams,
+ Boolean positionBased,
+ const char * matrixName)
+{
+ Uint1 * query; /* the query sequence */
+ int queryLength;
+ /* Int4 queryLength; */ /* the length of the query sequence */
+ double initialUngappedLambda;
+
+ /* YIKES! */
+ /*
+ query = search->context[0].query->sequence;
+ queryLength = search->context[0].query->length;
+ */
query = &queryBlk->sequence[0];
queryLength = queryInfo->contexts[0].query_length;
- if(positionBased) {
- int status = 0;
- ASSERT(queryLength == sp->mRows);
- ASSERT(queryLength == (Int4)sbp->psi_matrix->pssm->ncols);
- sp->startFreqRatios =
- getStartFreqRatios(sbp, query, scoringParams->options->matrix,
- sbp->psi_matrix->freq_ratios, queryLength);
- status = scalePosMatrix(sp->startMatrix, sbp->matrix->data,
- scoringParams->options->matrix,
- sbp->psi_matrix->freq_ratios, query,
- queryLength, sbp);
- if (status) {
- return 0.0; /* return incorrect value for scalingFactor */
- }
- initialUngappedLambda = sbp->kbp_psi[0]->Lambda;
+ if (self->positionBased) {
+ /* YIKES!
+ if(sbp->posFreqs == NULL) {
+ sbp->posFreqs =
+ allocatePosFreqs(queryLength, BLASTAA_SIZE);
+ }
+ */
+ s_GetStartFreqRatios(self->startFreqRatios, query, matrixName,
+ sbp->psi_matrix->freq_ratios, queryLength,
+ TRUE);
+ s_ScalePosMatrix(self->startMatrix, sbp->matrix->data,
+ matrixName,sbp->psi_matrix->freq_ratios, query,
+ queryInfo->max_length, sbp);
+ initialUngappedLambda = sbp->kbp_psi[0]->Lambda;
} else {
- SFreqRatios* freqRatios =
- _PSIMatrixFrequencyRatiosNew(scoringParams->options->matrix);
- sp->startFreqRatios = (double**) _PSIAllocateMatrix(sp->mRows,
- sp->nCols,
- sizeof(double));
- ASSERT(sp->startFreqRatios);
- _PSICopyMatrix_double(sp->startFreqRatios, freqRatios->data,
- sp->mRows, sp->nCols);
- freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios);
- initialUngappedLambda = sbp->kbp_ideal->Lambda;
+ s_GetStartFreqRatios(self->startFreqRatios, query, matrixName,
+ NULL, BLASTAA_SIZE, FALSE);
+ initialUngappedLambda = sbp->kbp_ideal->Lambda;
}
- sp->scaledUngappedLambda = initialUngappedLambda / localScalingFactor;
- if(!positionBased) {
- computeScaledStandardMatrix(sp->startMatrix,
- scoringParams->options->matrix,
- sp->scaledUngappedLambda);
+ self->ungappedLambda = initialUngappedLambda / localScalingFactor;
+ if ( !positionBased ) {
+ SFreqRatios * freqRatios; /* frequency ratios for the matrix */
+
+ freqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
+ /*
+ if (freqRatios == NULL) {
+ ErrPostEx(SEV_FATAL, 1, 0, "blastpgp: Cannot adjust parameters "
+ "for matrix %s\n", matrixName);
+ }
+ */
+ Blast_Int4MatrixFromFreq(self->startMatrix, BLASTAA_SIZE,
+ freqRatios->data, self->ungappedLambda);
+ freqRatios = _PSIMatrixFrequencyRatiosFree(freqRatios);
}
- kbp = sbp->kbp_gap[0];
- kbp->Lambda /= localScalingFactor;
- kbp->logK = log(kbp->K);
- }
-
- return localScalingFactor;
+ self->matrixName = strdup(matrixName);
}
-/** LambdaRatioLowerBound is used when the expected score is too large
- * causing impalaKarlinLambdaNR to give a Lambda estimate that
- * is too small, or to fail entirely returning -1 */
-#define LambdaRatioLowerBound 0.5
-/**
- * Adjust the search parameters
- *
- * @param sp a record of the initial search parameters [in|out]
- * @param queryLength length of query sequence [in]
- * @param subject data from the subject sequence [in]
- * @param matrix a scoring matrix to be adjusted [out]
- * @param positionBased is this search position-specific? [in]
- * @return scaling-factor to be used.
- */
-static Int4
-Kappa_AdjustSearch(
- Kappa_SearchParameters * sp,
- Int4 queryLength,
- Kappa_SequenceData * subject,
- Int4 ** matrix,
- Boolean positionBased)
+static void
+s_GetQueryInfo(BlastCompo_QueryInfo **pquery, int * pnumQueries,
+ Uint1 * ccat_query, BlastQueryInfo* queryInfo)
{
- double LambdaRatio; /* the ratio of the corrected lambda to the
- * original lambda */
- if(!sp->adjustParameters) {
- LambdaRatio = 1.0;
- } else {
- /* do adjust the parameters */
- Blast_ScoreFreq* this_sfp;
- double correctUngappedLambda; /* new value of ungapped lambda */
-
- /* compute and plug in new matrix here */
- Blast_FillResidueProbability(subject->data, subject->length, sp->resProb);
-
- if(positionBased) {
- ASSERT(queryLength == sp->mRows);
- this_sfp =
- posfillSfp(sp->startMatrix, queryLength, sp->resProb, sp->scoreArray,
- sp->return_sfp, kScoreMatrixScoreRange);
- } else {
- this_sfp =
- notposfillSfp(sp->startMatrix, sp->resProb, sp->queryProb,
- sp->scoreArray, sp->return_sfp, kScoreMatrixScoreRange);
+ int query_index;
+ int numQueries = queryInfo->num_queries;
+ BlastCompo_QueryInfo * query = calloc(numQueries,
+ sizeof(BlastCompo_QueryInfo));
+ *pnumQueries = numQueries;
+ *pquery = query;
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ query[query_index].eff_search_space =
+ queryInfo->contexts[query_index].eff_searchsp;
}
- correctUngappedLambda =
- Blast_KarlinLambdaNR(this_sfp, sp->scaledUngappedLambda);
-
- /* impalaKarlinLambdaNR will return -1 in the case where the
- * expected score is >=0; however, because of the MAX statement 3
- * lines below, LambdaRatio should always be > 0; the succeeding
- * test is retained as a vestige, in case one wishes to remove the
- * MAX statement and allow LambdaRatio to take on the error value
- * -1 */
-
- LambdaRatio = correctUngappedLambda / sp->scaledUngappedLambda;
- LambdaRatio = MIN(1, LambdaRatio);
- LambdaRatio = MAX(LambdaRatio, LambdaRatioLowerBound);
-
- if(LambdaRatio > 0) {
- scaleMatrix(matrix, sp->startFreqRatios, sp->mRows,
- sp->scaledUngappedLambda, LambdaRatio);
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ query[query_index].origin =
+ queryInfo->contexts[query_index].query_offset;
+ query[query_index].seq.data = &ccat_query[query[query_index].origin];
+ query[query_index].seq.length =
+ queryInfo->contexts[query_index].query_length;
+ }
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ Blast_ReadAaComposition(&query[query_index].composition,
+ query[query_index].seq.data,
+ query[query_index].seq.length);
}
- }
- /* end else do adjust the parameters */
-
- return LambdaRatio > 0 ? 0 : 1;
}
-/**
- * Restore the parameters that were adjusted to their original values
- * @param searchParams a record of the original values [in]
- * @param sbp Karlin-Altschul parameters to be restored. [out]
- * @param matrix the scoring matrix to be restored [out]
- * @param scoring the scoring parameters to be restored [out]
- * @param positionBased is this search position-specific? [in]
- */
static void
-Kappa_RestoreSearch(
- Kappa_SearchParameters * searchParams,
- BlastScoreBlk* sbp,
- Int4 ** matrix,
- BlastScoringParameters* scoring,
- Boolean positionBased)
+s_GappingParamsInit(Blast_GappingParamsContext * context,
+ BlastCompo_GappingParams * gapping_params,
+ BlastGapAlignStruct * gap_align,
+ const BlastScoringParameters* scoring,
+ BlastScoreBlk* sbp,
+ double localScalingFactor,
+ Int4 program_number,
+ double Lambda)
{
- if(searchParams->adjustParameters) {
- Blast_KarlinBlk* kbp; /* statistical parameters used to
- evaluate the significance of
- alignment of a query-subject
- pair */
- Int4 i, j; /* loop variables. */
-
- scoring->gap_open = searchParams->gapOpen;
- scoring->gap_extend = searchParams->gapExtend;
- scoring->decline_align = searchParams->gapDecline;
- scoring->scale_factor = searchParams->scale_factor;
+ context->gap_align = gap_align;
+ context->scoringParams = scoring;
+ context->sbp = sbp;
+ context->localScalingFactor = localScalingFactor;
+ context->prog_number = program_number;
+
+ gapping_params->gap_open = scoring->gap_open;
+ gapping_params->gap_extend = scoring->gap_extend;
+ gapping_params->decline_align = scoring->decline_align;
+ /* YIKES! different x-dropoff due to different pass through the
+ blast code */
+ gapping_params->x_dropoff = gap_align->gap_x_dropoff;
+ gapping_params->context = context;
+}
- sbp->kbp_gap = searchParams->orig_kbp_gap_array;
+static const Blast_RedoAlignCallbacks
+redo_align_callbacks = {
+ s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
+ s_NewAlignmentUsingXdrop
+};
+
+
+static Blast_RedoAlignParams *
+s_GetAlignParams(Blast_GappingParamsContext * context,
+ EBlastProgramType program_number,
+ BlastGapAlignStruct * gap_align,
+ BLAST_SequenceBlk * queryBlk,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ BlastScoringParameters* scoringParams,
+ const BlastExtensionParameters* extendParams,
+ const BlastHitSavingParameters* hitParams,
+ const PSIBlastOptions* psiOptions,
+ const char * matrixName,
+ double localScalingFactor,
+ int adjustParameters)
+{
+ int rows;
+ int cutoff_s;
+ double cutoff_e;
+ BlastCompo_GappingParams * gapping_params = NULL;
+ Blast_MatrixInfo * scaledMatrixInfo;
+ Blast_KarlinBlk* kbp;
+ int subject_is_translated = program_number == eBlastTypeTblastn;
+ /* YIKES! wrong test for do_link_hsps */
+ int do_link_hsps = program_number == eBlastTypeTblastn;
+ Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE);
+
+ if (do_link_hsps) {
+ ASSERT( 0 && "Which cutoff needed here?" );
+ /* cutoff_s = search->pbp->cutoff_s2 * localScalingFactor; */
+ } else {
+ /* There is no cutoff score; we consider e-values instead */
+ cutoff_s = 0;
+ }
+ cutoff_e = hitParams->options->expect_value;
+ rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
+ scaledMatrixInfo = Blast_MatrixInfoNew(rows, positionBased);
+ s_MatrixInfoInit(scaledMatrixInfo, localScalingFactor,
+ queryBlk, queryInfo, sbp, scoringParams,
+ positionBased, matrixName);
kbp = sbp->kbp_gap[0];
- Blast_KarlinBlkCopy(kbp, searchParams->kbp_gap_orig);
+ gapping_params = malloc(sizeof(BlastCompo_GappingParams));
+ s_GappingParamsInit(context, gapping_params, gap_align, scoringParams,
+ sbp, localScalingFactor, program_number,
+ kbp->Lambda);
+ return
+ Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
+ adjustParameters, positionBased,
+ subject_is_translated,
+ queryInfo->max_length, cutoff_s, cutoff_e,
+ do_link_hsps, kbp->Lambda, kbp->logK,
+ &redo_align_callbacks);
+}
+
- for(i = 0; i < searchParams->mRows; i++) {
- for(j = 0; j < BLASTAA_SIZE; j++) {
- matrix[i][j] = searchParams->origMatrix[i][j];
- }
+/**
+ * Convert a BlastCompo_Heap to a flat list of SeqAligns. Note that
+ * there may be more than one alignment per element in the heap. The
+ * new list preserves the order of the SeqAligns associated with each
+ * HeapRecord. (@todo this function is named as it is for
+ * compatibility with kappa.c, rename in the future)
+ *
+ * @param self a BlastCompo_Heap
+ * @param results BLAST core external results structure (pre-SeqAlign)
+ * [out]
+ * @param hitlist_size size of each list in the results structure above [in]
+ */
+static void
+s_HeapToFlatList(BlastCompo_Heap * self, BlastHSPResults * results,
+ Int4 hitlist_size)
+{
+ BlastHSPList* hsp_list;
+ BlastHitList* hitlist =
+ results->hitlist_array[0] = Blast_HitListNew(hitlist_size);
+
+ hsp_list = NULL;
+ while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
+ Blast_HitListUpdate(hitlist, hsp_list);
}
- }
}
+
+/**
+ * Top level routine to recompute alignments for each
+ * match found by the gapped BLAST algorithm
+ *
+ * @param search is the structure with all the information about
+ * the search
+ * @param options is used to pass certain command line options
+ * taken in by BLAST
+ * @param hitlist_count is the number of old matches
+ * @param adjustParameters determines whether we are to adjust the
+ * Karlin-Altschul parameters and score matrix
+ * @param SmithWaterman determines whether the new local alignments
+ * should be computed by the optimal Smith-Waterman
+ * algorithm; SmithWaterman false means that
+ * alignments will be recomputed by the current
+ * X-drop algorithm as implemented in the procedure
+ * ALIGN.
+ * @return an array of lists of SeqAlign; each element
+ * in the array is a list of SeqAligns for
+ * one query in the concatenated query.
+ * It is assumed that at least one of adjustParameters and
+ * SmithWaterman is >0 or true when this procedure is called. A linked list
+ * of alignments is returned; the alignments are sorted according to
+ * the lowest E-value of the best alignment for each matching
+ * sequence; alignments for the same matching sequence are in the
+ * list consecutively regardless of the E-value of the secondary
+ * alignments. Ties in sorted order are much rarer than for the
+ * standard BLAST method, but are broken deterministically based on
+ * the index of the matching sequences in the database.
+ */
Int2
-Kappa_RedoAlignmentCore(EBlastProgramType program_number,
- BLAST_SequenceBlk * queryBlk,
- BlastQueryInfo* queryInfo,
- BlastScoreBlk* sbp,
- BlastHSPStream* hsp_stream,
- const BlastSeqSrc* seqSrc,
- const Uint1* gen_code_string,
- BlastScoringParameters* scoringParams,
- const BlastExtensionParameters* extendParams,
- const BlastHitSavingParameters* hitParams,
- const PSIBlastOptions* psiOptions,
- BlastHSPResults* results)
+Blast_RedoAlignmentCore(EBlastProgramType program_number,
+ BLAST_SequenceBlk * queryBlk,
+ BlastQueryInfo* queryInfo,
+ BlastScoreBlk* sbp,
+ BlastHSPStream* hsp_stream,
+ const BlastSeqSrc* seqSrc,
+ const Uint1* gen_code_string,
+ BlastScoringParameters* scoringParams,
+ const BlastExtensionParameters* extendParams,
+ const BlastHitSavingParameters* hitParams,
+ const PSIBlastOptions* psiOptions,
+ BlastHSPResults* results)
{
- Int4 cutoff_s = 0; /* minimum score that must be achieved
- by a newly-computed alignment */
- Boolean do_link_hsps; /* if true, use BlastLinkHsps to
- compute e-values */
- Kappa_SequenceData query; /* data for the query sequence */
- double localScalingFactor; /* the factor by which to
- * scale the scoring system in
- * order to obtain greater
- * precision */
-
- Int4** matrix = NULL; /* score matrix */
- Blast_KarlinBlk* kbp; /* stores Karlin-Altschul parameters */
- Kappa_SearchParameters *searchParams; /* the values of the search
- * parameters that will be
- * recorded, altered in the
- * search structure in this
- * routine, and then restored
- * before the routine
- * exits. */
- Kappa_ForbiddenRanges forbidden; /* forbidden ranges for each
- * database position (used in
- * Smith-Waterman alignments) */
- SWheap significantMatches; /* a collection of alignments of the
- * query sequence with sequences from
- * the database */
- Kappa_WindowInfo ** windows; /* windows containing HSPs for
- * a single query-subject pair */
- Int4 nWindows; /* number of windows in the array
- * "windows" */
- Int4 lWindows; /* allocated size of "windows" */
- Int4 window_index; /* window index for use in loops */
- int status_code; /* status code of any routine that
- returns one */
-
- BlastGapAlignStruct* gapAlign; /* keeps track of gapped
- alignment params */
- Boolean SmithWaterman; /* Perform Smith-Waterman alignments? */
- /* is this search position-specific? */
- Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE);
- Boolean adjustParameters; /* Use composition based statistics? */
- BlastHSPList* thisMatch = NULL; /* alignment data for the
- * current query-subject
- * match */
-
- double inclusion_ethresh; /* All alignments above this value will be
- reported, no matter how many. */
-
- if (program_number != eBlastTypeBlastp &&
- program_number != eBlastTypePsiBlast &&
- program_number != eBlastTypePhiBlastp) { /* tblastn ported but not fully
- implemented */
- return BLASTERR_REDOALIGNMENTCORE_NOTSUPPORTED;
- }
-
- inclusion_ethresh =
- (psiOptions != NULL) ? psiOptions->inclusion_ethresh : 0;
-
- adjustParameters = extendParams->options->compositionBasedStats;
-
- if (extendParams->options->eTbackExt == eSmithWatermanTbck)
- SmithWaterman = TRUE;
- else
- SmithWaterman = FALSE;
-
- if ((status_code=BLAST_GapAlignStructNew(scoringParams, extendParams,
- BlastSeqSrcGetMaxSeqLen(seqSrc), sbp, &gapAlign)) != 0)
- return status_code;
-
- /* Initialize the window list to have a single window -- the most
- common case */
- lWindows = 1; nWindows = 1;
- windows = calloc(lWindows, sizeof(Kappa_WindowInfo *));
- windows[0] = malloc(sizeof(Kappa_WindowInfo));
-
- SWheapInitialize(&significantMatches, hitParams->options->hitlist_size,
- inclusion_ethresh);
-
- /**** Validate parameters *************/
- if(0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
- !adjustParameters) {
- return 0; /* BLOSUM62_20 only makes sense if
- * adjustParameters is on */
- }
- /*****************/
- query.data = &queryBlk->sequence[0];
- query.length = queryInfo->contexts[0].query_length;
-
- if(SmithWaterman) {
- Kappa_ForbiddenRangesInitialize(&forbidden, query.length);
- }
-
- if(positionBased) {
- ASSERT(program_number == eBlastTypePsiBlast);
- matrix = sbp->psi_matrix->pssm->data;
- ASSERT( matrix != NULL );
-
- if(sbp->psi_matrix->freq_ratios == NULL) {
- sbp->psi_matrix->freq_ratios =
- (double**) _PSIAllocateMatrix(query.length, BLASTAA_SIZE,
- sizeof(double));
+ double localScalingFactor; /* the factor by which to
+ * scale the scoring system in
+ * order to obtain greater
+ * precision */
+ Int4 **matrix; /* score matrix */
+ s_SearchParameters *searchParams; /* the values of the search
+ * parameters that will be
+ * recorded, altered in the
+ * search structure in this
+ * routine, and then restored
+ * before the routine
+ * exits. */
+ Blast_ForbiddenRanges forbidden; /* forbidden ranges for each
+ * database position (used
+ * in Smith-Waterman
+ * alignments)
+ */
+ BlastCompo_Heap * redoneMatches; /* a collection of alignments
+ * for each query sequence with
+ * sequences from the
+ * database */
+ Blast_CompositionWorkspace
+ *NRrecord = NULL; /* stores all fields needed for
+ * computing a compositionally adjusted
+ * score matrix using Newton's method */
+ Int4 query_index; /* loop index */
+ Int4 numQueries; /* number of queries in the
+ concatenated query */
+ BlastGapAlignStruct* gapAlign; /* keeps track of gapped
+ alignment params */
+ double inclusion_ethresh; /* All alignments above this value will be
+ reported, no matter how many. */
+ BlastCompo_QueryInfo * query_info = NULL;
+ Blast_RedoAlignParams * redo_align_params;
+ Boolean positionBased = (sbp->psi_matrix ? TRUE : FALSE);
+ Boolean adjustParameters = extendParams->options->compositionBasedStats;
+ Boolean SmithWaterman;
+ int status_code;
+ BlastHSPList* thisMatch = NULL; /* alignment data for the
+ * current query-subject
+ * match */
+ BlastCompo_Alignment * incoming_aligns; /* existing alignments
+ for a match */
+ Blast_GappingParamsContext gapping_params_context;
+ int do_link_hsps;
+
+ /**** Validate parameters *************/
+ if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
+ !adjustParameters) {
+ return 0; /* BLOSUM62_20 only makes sense if
+ * adjustParameters is on */
}
- } else {
- matrix = sbp->matrix->data;
- }
- kbp = sbp->kbp_gap[0];
-
- /* Initialize searchParams */
- searchParams =
- Kappa_SearchParametersNew(query.length, adjustParameters, positionBased);
- Kappa_RecordInitialSearch(searchParams, queryBlk, queryInfo, sbp,
- scoringParams, positionBased);
- localScalingFactor = Kappa_RescaleSearch(searchParams, queryBlk, queryInfo,
- sbp, scoringParams, positionBased);
- ASSERT(localScalingFactor != 0.0);
-
-
- do_link_hsps = program_number == eBlastTypeTblastn;
- if(do_link_hsps) {
- ASSERT( 0 && "Which cutoff needed here?" );
- /* cutoff_s = search->pbp->cutoff_s2 * localScalingFactor; */
- } else {
- /* There is no cutoff score; we consider e-values instead */
- cutoff_s = 0;
- }
- while (BlastHSPStreamRead(hsp_stream, &thisMatch) != kBlastHSPStream_Eof) {
- /* for all matching sequences */
- Kappa_MatchingSequence matchingSeq; /* the data for a matching
- * database sequence */
- Int4 * window_of_hsp; /* index of each HSP in the
- * array "windows" */
- Kappa_WindowInfo * window; /* current window in the
- * subject sequence */
- Kappa_DistinctAlignment * alignments; /* list of alignments for this
- * query-subject pair */
- alignments = NULL;
-
- if(thisMatch->hsp_array == NULL) {
- continue;
+ if (positionBased) {
+ adjustParameters = adjustParameters ? 1 : 0;
}
-
- if(SWheapWillAcceptOnlyBelowCutoff(&significantMatches)) {
- /* Only matches with evalue <= options->ethresh will be saved */
-
- /* e-value for a sequence is the smallest e-value among the HSPs
- * matching a region of the sequence to the query */
- double minEvalue = thisMatch->best_evalue;
- if(minEvalue > (EVALUE_STRETCH * inclusion_ethresh)) {
- /* This match is likely to have an evalue > options->ethresh
- * and therefore, we assume that all other matches with higher
- * input e-values are also unlikely to get sufficient
- * improvement in a redone alignment */
- break;
- }
+ if (extendParams->options->eTbackExt == eSmithWatermanTbck) {
+ SmithWaterman = TRUE;
+ } else {
+ SmithWaterman = FALSE;
}
- /* Get the sequence for this match */
- Kappa_MatchingSequenceInitialize(&matchingSeq, program_number,
- seqSrc, gen_code_string, thisMatch->oid);
-
- window_of_hsp = calloc(thisMatch->hspcnt, sizeof(Int4));
- if(program_number == eBlastTypeTblastn) {
- /* Find the multiple translation windows used by tblastn queries. */
- WindowsFromHSPs(thisMatch->hsp_array, thisMatch->hspcnt,
- KAPPA_WINDOW_BORDER, matchingSeq.length,
- &windows, &nWindows, &lWindows, window_of_hsp);
- } else { /* the program is not tblastn, i.e. it is blastp */
- /* Initialize the single window used by blastp queries. */
- windows[0]->frame = 0;
- windows[0]->hspcnt = thisMatch->hspcnt;
- windows[0]->begin = 0;
- windows[0]->end = matchingSeq.length;
- } /* else the program is blastp */
- if(SmithWaterman) {
- /* We are performing a Smith-Waterman alignment */
- for(window_index = 0; window_index < nWindows; window_index++) {
- /* for all window */
- Kappa_SequenceData subject; /* sequence data for this window */
-
- window = windows[window_index];
- Kappa_SequenceGetWindow( &matchingSeq, window, &subject );
-
- if(0 ==
- Kappa_AdjustSearch(searchParams, query.length, &subject, matrix,
- positionBased)) {
- /* Kappa_AdjustSearch ran without error; compute the new
- alignments. */
- Int4 aSwScore; /* score computed by the
- * Smith-Waterman algorithm. */
- Boolean alignment_is_significant; /* True if the score/evalue of
- * the Smith-Waterman alignment
- * is significant. */
- Kappa_ForbiddenRangesClear(&forbidden);
- do {
- double newSwEvalue; /* evalue as computed by the
- * Smith-Waterman algorithm */
- Int4 matchEnd, queryEnd; /* end points of the alignments
- * computed by the Smith-Waterman
- * algorithm. */
- newSwEvalue =
- SmithWatermanScoreOnly(&subject, &query, matrix,
- scoringParams->gap_open,
- scoringParams->gap_extend,
- &matchEnd, &queryEnd, &aSwScore, kbp,
- queryInfo->contexts[0].eff_searchsp,
- positionBased,
- &forbidden);
- if( do_link_hsps ) {
- alignment_is_significant = aSwScore >= cutoff_s;
- } else {
- alignment_is_significant =
- newSwEvalue < hitParams->options->expect_value;
- if( alignments == NULL ) {
- /* this is the most significant alignment; if it will not
- * be accepted, no alignments from this match will */
- alignment_is_significant =
- alignment_is_significant &&
- SWheapWouldInsert(&significantMatches, newSwEvalue,
- aSwScore, thisMatch->oid);
- }
- }
-
- if(alignment_is_significant) {
- Int4 matchStart, queryStart; /* the start of the
- * alignment in the
- * match/query sequence */
-
- SmithWatermanFindStart(&subject, &query, matrix,
- scoringParams->gap_open,
- scoringParams->gap_extend,
- matchEnd, queryEnd, aSwScore,
- &matchStart, &queryStart,
- positionBased, &forbidden);
-
- gapAlign->gap_x_dropoff =
- (Int4) (extendParams->gap_x_dropoff_final *
- NCBIMATH_LN2 / kbp->Lambda);
-
- alignments =
- NewAlignmentUsingXdrop(&query, queryStart, queryEnd,
- &subject, matchStart, matchEnd,
- aSwScore, window,
- gapAlign, scoringParams,
- localScalingFactor,
- program_number, alignments);
-
- Kappa_ForbiddenRangesPush(&forbidden,
- queryStart,
- alignments->queryEnd - queryStart,
- matchStart,
- alignments->matchEnd - matchStart);
- }
- /* end if the next local alignment is significant */
- } while(alignment_is_significant && window->hspcnt > 1);
- /* end do..while the next local alignment is significant, and
- * the original blast search found more than one alignment. */
- } /* end if Kappa_AdjustSearch ran without error. */
- Kappa_SequenceDataRelease(&subject);
- } /* end for all windows */
+ inclusion_ethresh =
+ (psiOptions != NULL) ? psiOptions->inclusion_ethresh : 0;
+
+ /*****************/
+ /* Initialize searchParams */
+ searchParams =
+ s_SearchParametersNew(queryInfo->max_length, adjustParameters,
+ positionBased);
+ s_RecordInitialSearch(searchParams, queryBlk, queryInfo, sbp,
+ scoringParams, queryInfo->max_length,
+ adjustParameters, positionBased);
+ if (adjustParameters) {
+ if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
+ localScalingFactor = SCALING_FACTOR / 10;
+ } else {
+ localScalingFactor = SCALING_FACTOR;
+ }
} else {
- /* else we are not performing a Smith-Waterman alignment */
- Int4 hsp_index;
- /* data for the current window */
- Kappa_SequenceData subject = {NULL,0,NULL};
- window_index = -1; /* -1 indicates that sequence data has
- * not been obtained for any window in
- * the list. */
- window = NULL;
-
- for(hsp_index = 0; hsp_index < thisMatch->hspcnt; hsp_index++) {
- /* for all HSPs in thisMatch */
- if(!isAlreadyContained(thisMatch->hsp_array[hsp_index], alignments,
- kbp->Lambda, localScalingFactor)) {
- Kappa_DistinctAlignment * newAlign; /* the new alignment */
- Boolean adjust_search_failed = FALSE; /* if true, AdjustSearch was
- * called and failed. */
- if( window_index != window_of_hsp[hsp_index] ) {
- /* The current window doesn't contain this HSP. */
- Kappa_SequenceDataRelease(&subject);
-
- window_index = window_of_hsp[hsp_index];
- window = windows[window_index];
- Kappa_SequenceGetWindow(&matchingSeq, window, &subject);
-
- adjust_search_failed =
- Kappa_AdjustSearch(searchParams, query.length, &subject, matrix,
- positionBased);
- } /* end if the current window doesn't contain this HSP */
- if(!adjust_search_failed) {
- Int4 q_start, s_start;
-
- StartingPointForHit(&q_start, &s_start, sbp, positionBased,
- thisMatch->hsp_array[hsp_index],
- window, &query, &subject);
-
- if (positionBased) {
- /* We don't use the scaled Lambda because we loose precision */
- gapAlign->gap_x_dropoff =
- (Int4) (extendParams->options->gap_x_dropoff_final *
- NCBIMATH_LN2 /
- searchParams->kbp_gap_orig->Lambda*localScalingFactor);
- } else {
- /* Lambda is already scaled */
- gapAlign->gap_x_dropoff =
- (Int4) (extendParams->options->gap_x_dropoff_final *
- NCBIMATH_LN2 / kbp->Lambda);
- }
- BLAST_GappedAlignmentWithTraceback(program_number,
- query.data, subject.data,
- gapAlign, scoringParams,
- q_start, s_start,
- query.length, subject.length);
-
- newAlign = NewAlignmentFromGapAlign(gapAlign, window);
- withDistinctEnds(&newAlign, &alignments);
- } /* end if adjust search failed */
- } /* end if not isAlreadyContained */
- } /* for all HSPs in thisMatch */
- Kappa_SequenceDataRelease(&subject);
- } /* end else we are not performing a Smith-Waterman alignment */
- sfree(window_of_hsp);
-
- if( alignments != NULL) { /* alignments were found */
- BlastHSPList * hsp_list; /* a hitlist containing the newly-computed
- * alignments */
- double bestEvalue; /* best evalue among alignments in the hitlist */
- Int4 bestScore; /* best score among alignments in the hitlist */
-
- hsp_list = s_HSPListFromDistinctAlignments(&alignments,
- matchingSeq.index);
-
- if(hsp_list->hspcnt > 1) { /* if there is more than one HSP, */
- /* then eliminate HSPs that are contained in a higher-scoring HSP. */
- if(!SmithWaterman || nWindows > 1) {
- /* For SmithWaterman alignments in a single window, the
- * forbidden ranges rule does not allow one alignment to be
- * contained in another, so the call to HitlistReapContained
- * is not needed. */
- qsort(hsp_list->hsp_array, hsp_list->hspcnt, sizeof(BlastHSP *),
- ScoreCompareHSPs);
- HitlistReapContained(hsp_list->hsp_array, &hsp_list->hspcnt);
+ localScalingFactor = 1.0;
+ }
+ s_RescaleSearch(searchParams, queryBlk, queryInfo, sbp, scoringParams,
+ localScalingFactor, positionBased);
+ /********/
+ if (positionBased) {
+ matrix = sbp->psi_matrix->pssm->data;
+ if ( !matrix ) {
+ /* YIKES! error return
+ Char* msg =
+ "Cannot perform position-specific search without a PSSM";
+ BlastConstructErrorMessage("RedoAlignmentCore", msg, 3,
+ &(search->error_return));
+ return NULL;
+ */
}
- }
-
- if(do_link_hsps) {
- BLAST_LinkHsps(program_number, hsp_list,
- queryInfo, matchingSeq.length,
- sbp, hitParams->link_hsp_params, TRUE);
- } else {
- Blast_HSPListGetEvalues(queryInfo, hsp_list, TRUE, sbp,
- 0.0, /* use a non-zero gap decay only when
- linking hsps */
- 1.0); /* Use scaling factor equal to 1, because
- both scores and Lambda are scaled, so
- they will cancel each other. */
- }
- bestEvalue = hsp_list->best_evalue;
- bestScore = hsp_list->hsp_array[0]->score;
-
- if(bestEvalue <= hitParams->options->expect_value &&
- SWheapWouldInsert(&significantMatches, bestEvalue,
- bestScore, thisMatch->oid)) {
- /* If the best alignment is significant, then save the current list */
-
- Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
-
- s_HSPListRescaleScores(hsp_list, kbp->Lambda, kbp->logK,
- localScalingFactor);
-
- SWheapInsert(&significantMatches, hsp_list, bestEvalue, bestScore,
- thisMatch->oid);
- } else { /* the best alignment is not significant */
- Blast_HSPListFree(hsp_list);
- } /* end else the best alignment is not significant */
- } /* end if any alignments were found */
-
- Kappa_MatchingSequenceRelease(&matchingSeq);
- thisMatch = Blast_HSPListFree(thisMatch);
- }
- /* end for all matching sequences */
- SWheapToFlatList( &significantMatches, results,
- hitParams->options->hitlist_size );
- /* Clean up */
- for( window_index = 0; window_index < nWindows; window_index++ ) {
- sfree(windows[window_index]);
- }
- sfree(windows);
- SWheapRelease(&significantMatches);
- if(SmithWaterman) Kappa_ForbiddenRangesRelease(&forbidden);
- gapAlign = BLAST_GapAlignStructFree(gapAlign);
-
- Kappa_RestoreSearch(searchParams, sbp, matrix, scoringParams, positionBased);
- Kappa_SearchParametersFree(&searchParams);
-
- return 0;
+ } else {
+ matrix = sbp->matrix->data;
+ }
+ if ((status_code=BLAST_GapAlignStructNew(scoringParams,
+ extendParams,
+ BlastSeqSrcGetMaxSeqLen(seqSrc),
+ sbp, &gapAlign)) != 0) {
+ return status_code;
+ }
+ gapAlign->gap_x_dropoff =
+ extendParams->gap_x_dropoff_final * localScalingFactor;
+ redo_align_params =
+ s_GetAlignParams(&gapping_params_context, program_number,
+ gapAlign, queryBlk, queryInfo,
+ sbp, scoringParams, extendParams, hitParams,
+ psiOptions, scoringParams->options->matrix,
+ localScalingFactor, adjustParameters);
+ do_link_hsps = redo_align_params->do_link_hsps;
+
+ s_GetQueryInfo(&query_info, &numQueries, queryBlk->sequence, queryInfo);
+ if(SmithWaterman) {
+ Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
+ }
+ redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ BlastCompo_HeapInitialize(&redoneMatches[query_index],
+ hitParams->options->hitlist_size,
+ inclusion_ethresh);
+ }
+ if( adjustParameters > 1 && !positionBased ) {
+ NRrecord = Blast_CompositionWorkspaceNew();
+ Blast_CompositionWorkspaceInit(NRrecord,
+ scoringParams->options->matrix);
+ }
+ while (BlastHSPStreamRead(hsp_stream, &thisMatch) != kBlastHSPStream_Eof) {
+ /* for all matching sequences */
+ BlastCompo_MatchingSequence matchingSeq; /* the data for a matching
+ * database sequence */
+ BlastCompo_Alignment ** alignments; /* array of lists of
+ * alignments for each
+ * query to this subject */
+ alignments = calloc(numQueries, sizeof(BlastCompo_Alignment *));
+
+ if(thisMatch->hsp_array == NULL) {
+ continue;
+ }
+ if (BlastCompo_EarlyTermination(thisMatch->best_evalue,
+ redoneMatches, numQueries)) {
+ break;
+ }
+ /* Get the sequence for this match */
+ s_MatchingSequenceInitialize(&matchingSeq, program_number,
+ seqSrc, gen_code_string, thisMatch->oid);
+ incoming_aligns =
+ s_ResultHspToDistinctAlign(queryInfo, thisMatch->hsp_array,
+ thisMatch->hspcnt, localScalingFactor);
+ if (SmithWaterman) {
+ Blast_RedoOneMatchSmithWaterman(alignments,
+ redo_align_params,
+ incoming_aligns,
+ thisMatch->hspcnt,
+ &matchingSeq, query_info,
+ numQueries, matrix,
+ NRrecord, &forbidden,
+ redoneMatches);
+ } else {
+ Blast_RedoOneMatch(alignments, redo_align_params,
+ incoming_aligns, thisMatch->hspcnt,
+ &matchingSeq, queryInfo->max_length,
+ query_info, numQueries, matrix,
+ NRrecord);
+ }
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ /* Loop over queries */
+ if( alignments[query_index] != NULL) { /* alignments were found */
+ double bestEvalue; /* best evalue among alignments in the
+ hitlist */
+ Int4 bestScore; /* best score among alignments in
+ the hitlist */
+ BlastHSPList * hsp_list; /* a hitlist containing the
+ * newly-computed alignments */
+ void * discardedAligns;
+ hsp_list =
+ s_HSPListFromDistinctAlignments(&alignments[query_index],
+ matchingSeq.index);
+ if (hsp_list->hspcnt > 1) {
+ s_HitlistReapContained(hsp_list->hsp_array,
+ &hsp_list->hspcnt);
+ }
+ s_HitlistEvaluateAndPurge(&bestScore, &bestEvalue,
+ hsp_list,
+ matchingSeq.length,
+ program_number, queryInfo,
+ sbp, hitParams,
+ do_link_hsps);
+ if (bestEvalue <= hitParams->options->expect_value &&
+ BlastCompo_HeapWouldInsert(&redoneMatches[query_index],
+ bestEvalue, bestScore,
+ thisMatch->oid)) {
+ s_HSPListRescaleScores(hsp_list, redo_align_params->Lambda,
+ redo_align_params->logK,
+ localScalingFactor);
+
+ BlastCompo_HeapInsert(&redoneMatches[query_index],
+ hsp_list, bestEvalue,
+ bestScore, thisMatch->oid,
+ &discardedAligns);
+ if (discardedAligns != NULL) {
+ Blast_HSPListFree(discardedAligns);
+ }
+ } else { /* the best alignment is not significant */
+ Blast_HSPListFree(hsp_list);
+ } /* end if the best alignment is significant */
+ } /* end if any alignments were found */
+ } /* end loop over queries */
+ s_MatchingSequenceRelease(&matchingSeq);
+ thisMatch = Blast_HSPListFree(thisMatch);
+ sfree(alignments);
+ BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
+ }
+ /* end for all matching sequences */
+ /* YIKES! handle multiple queries
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ results[query_index] =
+ BlastCompo_HeapToFlatList(&redoneMatches[query_index]);
+ }
+ */
+ s_HeapToFlatList(&redoneMatches[0], results,
+ hitParams->options->hitlist_size);
+ /* Clean up */
+ free(query_info);
+ Blast_RedoAlignParamsFree(&redo_align_params);
+ for (query_index = 0; query_index < numQueries; query_index++) {
+ BlastCompo_HeapRelease(&redoneMatches[query_index]);
+ }
+ sfree(redoneMatches); redoneMatches = NULL;
+ if(SmithWaterman) {
+ Blast_ForbiddenRangesRelease(&forbidden);
+ }
+ gapAlign = BLAST_GapAlignStructFree(gapAlign);
+ s_RestoreSearch(searchParams, sbp, matrix, queryInfo->max_length,
+ scoringParams, positionBased, adjustParameters);
+ s_SearchParametersFree(&searchParams);
+ if (NULL != NRrecord) {
+ Blast_CompositionWorkspaceFree(&NRrecord);
+ }
+ return 0;
}
diff --git a/algo/blast/core/blast_kappa.h b/algo/blast/core/blast_kappa.h
index d60f509e..3b6a79c1 100644
--- a/algo/blast/core/blast_kappa.h
+++ b/algo/blast/core/blast_kappa.h
@@ -1,4 +1,4 @@
-/* $Id: blast_kappa.h,v 1.9 2004/11/23 21:46:03 camacho Exp $
+/* $Id: blast_kappa.h,v 1.10 2005/12/01 14:47:40 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -68,7 +68,7 @@ extern "C" {
*/
Int2
-Kappa_RedoAlignmentCore(EBlastProgramType program_number,
+Blast_RedoAlignmentCore(EBlastProgramType program_number,
BLAST_SequenceBlk * queryBlk,
BlastQueryInfo* query_info,
BlastScoreBlk* sbp,
@@ -90,6 +90,9 @@ Kappa_RedoAlignmentCore(EBlastProgramType program_number,
* ===========================================================================
*
* $Log: blast_kappa.h,v $
+ * Revision 1.10 2005/12/01 14:47:40 madden
+ * Renamed Kappa_RedoAlignmentCore as Blast_RedoAlignmentCore
+ *
* Revision 1.9 2004/11/23 21:46:03 camacho
* Brought up to date with current version of kappa.c [by Mike Gertz]
*
diff --git a/algo/blast/core/blast_lookup.c b/algo/blast/core/blast_lookup.c
index 61a24cf7..6a71a036 100644
--- a/algo/blast/core/blast_lookup.c
+++ b/algo/blast/core/blast_lookup.c
@@ -1,4 +1,4 @@
-/* $Id: blast_lookup.c,v 1.43 2005/08/02 21:20:26 coulouri Exp $
+/* $Id: blast_lookup.c,v 1.44 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -43,7 +43,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_lookup.c,v 1.43 2005/08/02 21:20:26 coulouri Exp $";
+ "$Id: blast_lookup.c,v 1.44 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
/** Structure containing information needed for adding neighboring words.
diff --git a/algo/blast/core/blast_lookup.h b/algo/blast/core/blast_lookup.h
index 28d71edd..a1c283ca 100644
--- a/algo/blast/core/blast_lookup.h
+++ b/algo/blast/core/blast_lookup.h
@@ -1,4 +1,4 @@
-/* $Id: blast_lookup.h,v 1.25 2005/07/27 19:11:33 camacho Exp $
+/* $Id: blast_lookup.h,v 1.26 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_message.c b/algo/blast/core/blast_message.c
index e43dfeba..92e10de6 100644
--- a/algo/blast/core/blast_message.c
+++ b/algo/blast/core/blast_message.c
@@ -1,4 +1,4 @@
-/* $Id: blast_message.c,v 1.18 2005/06/20 13:09:36 madden Exp $
+/* $Id: blast_message.c,v 1.19 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -31,7 +31,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_message.c,v 1.18 2005/06/20 13:09:36 madden Exp $";
+ "$Id: blast_message.c,v 1.19 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_message.h>
@@ -113,6 +113,9 @@ Blast_Perror(Int2 error_code)
* ===========================================================================
*
* $Log: blast_message.c,v $
+ * Revision 1.19 2005/11/16 14:27:03 madden
+ * Fix spelling in CRN
+ *
* Revision 1.18 2005/06/20 13:09:36 madden
* Rename BlastSeverity enums in line with C++ tookit convention
*
diff --git a/algo/blast/core/blast_options.c b/algo/blast/core/blast_options.c
index 66ef1897..2bb5d938 100644
--- a/algo/blast/core/blast_options.c
+++ b/algo/blast/core/blast_options.c
@@ -1,4 +1,4 @@
-/* $Id: blast_options.c,v 1.171 2005/06/24 12:15:40 madden Exp $
+/* $Id: blast_options.c,v 1.175 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_options.c,v 1.171 2005/06/24 12:15:40 madden Exp $";
+ "$Id: blast_options.c,v 1.175 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_options.h>
@@ -552,9 +552,9 @@ BLAST_FillScoringOptions(BlastScoringOptions* options,
options->gap_extend = BLAST_GAP_EXTN_NUCL;
}
}
- if (gap_open)
+ if (gap_open >= 0)
options->gap_open = gap_open;
- if (gap_extend)
+ if (gap_extend >= 0)
options->gap_extend = gap_extend;
return 0;
@@ -589,7 +589,7 @@ BlastScoringOptionsValidate(EBlastProgramType program_number,
"BLASTN penalty must be negative");
return (Int2) code;
}
- if (options->gap_open > 0 && options->gap_extend == 0)
+ if (options->gapped_calculation && options->gap_open > 0 && options->gap_extend == 0)
{
Int4 code=2;
Int4 subcode=1;
@@ -601,12 +601,13 @@ BlastScoringOptionsValidate(EBlastProgramType program_number,
}
else
{
- Int2 status=0;
-
- if ((status=Blast_KarlinBlkGappedLoadFromTables(NULL, options->gap_open,
- options->gap_extend, options->decline_align,
+ if (options->gapped_calculation && !Blast_ProgramIsRpsBlast(program_number))
+ {
+ Int2 status=0;
+ if ((status=Blast_KarlinBlkGappedLoadFromTables(NULL, options->gap_open,
+ options->gap_extend, options->decline_align,
options->matrix)) != 0)
- {
+ {
if (status == 1)
{
char* buffer;
@@ -634,8 +635,8 @@ BlastScoringOptionsValidate(EBlastProgramType program_number,
sfree(buffer);
return (Int2) code;
}
- }
-
+ }
+ }
}
if (program_number != eBlastTypeBlastx &&
@@ -832,6 +833,66 @@ BLAST_FillLookupTableOptions(LookupTableOptions* options,
return 0;
}
+Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number, const char* matrixName, Int4* threshold)
+{
+ /* Store in *threshold the suggested word threshold for matrixName, adjusted for translated programs. */
+ const Int4 kB62_threshold = 11; /* BLOSUM62 value; also the fallback for unlisted matrices */
+
+ if (program_number == eBlastTypeBlastn)
+ return 0; /* blastn: *threshold is left untouched */
+
+ if (matrixName == NULL)
+ return -1; /* a matrix name is required for every non-blastn program */
+
+ if(strcasecmp(matrixName, "BLOSUM62") == 0) /* names are matched case-insensitively */
+ *threshold = kB62_threshold;
+ else if(strcasecmp(matrixName, "BLOSUM45") == 0)
+ *threshold = 14;
+ else if(strcasecmp(matrixName, "BLOSUM62_20") == 0)
+ *threshold = 100;
+ else if(strcasecmp(matrixName, "BLOSUM80") == 0)
+ *threshold = 12;
+ else if(strcasecmp(matrixName, "PAM30") == 0)
+ *threshold = 16;
+ else if(strcasecmp(matrixName, "PAM70") == 0)
+ *threshold = 14;
+ else
+ *threshold = kB62_threshold; /* any other matrix name gets the BLOSUM62 value */
+
+ if (Blast_SubjectIsTranslated(program_number) == TRUE)
+ *threshold += 2; /* Covers tblastn, tblastx, psi-tblastn, rpstblastn. */
+ else if (Blast_QueryIsTranslated(program_number) == TRUE)
+ *threshold += 1; /* reached only when the subject is not translated but the query is */
+
+ return 0;
+}
+
+Int2 BLAST_GetSuggestedWindowSize(EBlastProgramType program_number, const char* matrixName, Int4* window_size)
+{
+ const Int4 kB62_windowsize = 40; /* BLOSUM62 value; also the fallback for unlisted matrices */
+
+ if (program_number == eBlastTypeBlastn)
+ return 0; /* blastn: *window_size is left untouched */
+
+ if (matrixName == NULL)
+ return -1; /* a matrix name is required for every non-blastn program */
+
+ if(strcasecmp(matrixName, "BLOSUM62") == 0) /* names are matched case-insensitively */
+ *window_size = kB62_windowsize;
+ else if(strcasecmp(matrixName, "BLOSUM45") == 0)
+ *window_size = 60;
+ else if(strcasecmp(matrixName, "BLOSUM80") == 0)
+ *window_size = 25;
+ else if(strcasecmp(matrixName, "PAM30") == 0)
+ *window_size = 15;
+ else if(strcasecmp(matrixName, "PAM70") == 0)
+ *window_size = 20;
+ else
+ *window_size = kB62_windowsize; /* any other matrix name gets the BLOSUM62 size */
+
+ return 0;
+}
+
/** Validate options for the discontiguous word megablast
* Word size must be 11 or 12; template length 16, 18 or 21;
* template type 0, 1 or 2.
@@ -1204,6 +1265,18 @@ Int2 BLAST_ValidateOptions(EBlastProgramType program_number,
* ===========================================================================
*
* $Log: blast_options.c,v $
+ * Revision 1.175 2005/11/16 14:27:03 madden
+ * Fix spelling in CRN
+ *
+ * Revision 1.174 2005/10/18 15:19:04 madden
+ * Exclude rpsblast from validation of gap parameters
+ *
+ * Revision 1.173 2005/10/17 14:03:34 madden
+ * Change convention for unset gap parameters from zero to negative number
+ *
+ * Revision 1.172 2005/08/29 13:51:44 madden
+ * Add functions BLAST_GetSuggestedThreshold and BLAST_GetSuggestedWindowSize
+ *
* Revision 1.171 2005/06/24 12:15:40 madden
* Add protection against NULL pointers in options free functons
*
diff --git a/algo/blast/core/blast_options.h b/algo/blast/core/blast_options.h
index a68b58e4..96abb605 100644
--- a/algo/blast/core/blast_options.h
+++ b/algo/blast/core/blast_options.h
@@ -1,4 +1,4 @@
-/* $Id: blast_options.h,v 1.121 2005/06/02 16:18:05 camacho Exp $
+/* $Id: blast_options.h,v 1.125 2005/11/29 17:27:40 camacho Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -316,11 +316,6 @@ typedef struct BlastHitSavingOptions {
Int4 culling_limit; /**< If the query range of an HSP is contained in
at least this many higher-scoring HSPs, throw
away the HSP as redundant (turned off if zero) */
- /* PSI-BLAST Hit saving options */
- Int4 required_start; /**< Start of the region required to be part of the
- alignment */
- Int4 required_end; /**< End of the region required to be part of the
- alignment */
/********************************************************************/
/* Merge all these in a structure for clarity? */
@@ -870,6 +865,30 @@ Int2 BLAST_ValidateOptions(EBlastProgramType program_number,
Blast_Message* *blast_msg);
+
+/** Get thresholds for word-finding suggested by Stephen Altschul.
+ *
+ * @param program_number Type of blast program (blastn, blastp, blastx,
+ * tblastn, tblastx) [in]
+ * @param matrixName matrix, e.g., BLOSUM62 [in]
+ * @param threshold receives the suggested value; not modified for blastn [in|out]
+ * @return zero on success, -1 if matrixName is NULL for a non-blastn program
+ */
+Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number,
+ const char* matrixName,
+ Int4* threshold);
+
+/** Get window sizes for two hit algorithm suggested by Stephen Altschul.
+ *
+ * @param program_number Type of blast program (blastn, blastp, blastx,
+ * tblastn, tblastx) [in]
+ * @param matrixName matrix, e.g., BLOSUM62 [in]
+ * @param window_size receives the suggested value; not modified for blastn [in|out]
+ * @return zero on success, -1 if matrixName is NULL for a non-blastn program
+ */
+Int2 BLAST_GetSuggestedWindowSize(EBlastProgramType program_number,
+ const char* matrixName,
+ Int4* window_size);
#ifdef __cplusplus
}
#endif
diff --git a/algo/blast/core/blast_parameters.c b/algo/blast/core/blast_parameters.c
index 8fd4279e..33ebe347 100644
--- a/algo/blast/core/blast_parameters.c
+++ b/algo/blast/core/blast_parameters.c
@@ -1,4 +1,4 @@
-/* $Id: blast_parameters.c,v 1.10 2005/06/08 17:27:53 madden Exp $
+/* $Id: blast_parameters.c,v 1.12 2005/11/16 14:27:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -30,7 +30,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_parameters.c,v 1.10 2005/06/08 17:27:53 madden Exp $";
+ "$Id: blast_parameters.c,v 1.12 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_parameters.h>
@@ -83,23 +83,27 @@ s_BlastFindValidKarlinBlk(Blast_KarlinBlk** kbp_in, const BlastQueryInfo* query_
* @param kbp_in array of Karlin blocks to be searched [in]
* @param query_info information on number of queries (specifies number of
* elements in above array) [in]
+ * @param kbp_out Karlin blocks with smallest lambda [out]
* @return The smallest lambda value
*/
static double
s_BlastFindSmallestLambda(Blast_KarlinBlk** kbp_in,
- const BlastQueryInfo* query_info)
+ const BlastQueryInfo* query_info,
+ Blast_KarlinBlk** kbp_out)
{
Int4 i;
- double min_lambda = 0.0;
+ double min_lambda = (double) INT4_MAX;
ASSERT(kbp_in && query_info);
for (i=query_info->first_context; i<=query_info->last_context; i++) {
if (s_BlastKarlinBlkIsValid(kbp_in[i])) {
- if (min_lambda == 0.0)
+ if (min_lambda > kbp_in[i]->Lambda)
+ {
min_lambda = kbp_in[i]->Lambda;
- else
- min_lambda = MIN(min_lambda, kbp_in[i]->Lambda);
+ if (kbp_out)
+ *kbp_out = kbp_in[i];
+ }
}
}
@@ -218,7 +222,7 @@ BlastInitialWordParametersNew(EBlastProgramType program_number,
(*parameters)->x_dropoff_init = (Int4)
ceil(sbp->scale_factor * word_options->x_dropoff * NCBIMATH_LN2/
- s_BlastFindSmallestLambda(sbp->kbp_std, query_info));
+ s_BlastFindSmallestLambda(sbp->kbp_std, query_info, NULL));
if (program_number == eBlastTypeBlastn &&
(query_info->contexts[query_info->last_context].query_offset +
@@ -376,7 +380,7 @@ Int2 BlastExtensionParametersNew(EBlastProgramType program_number,
/* Set gapped X-dropoffs only if it is a gapped search. */
if (sbp->kbp_gap) {
- double min_lambda = s_BlastFindSmallestLambda(sbp->kbp_gap, query_info);
+ double min_lambda = s_BlastFindSmallestLambda(sbp->kbp_gap, query_info, NULL);
params->gap_x_dropoff = (Int4)
(options->gap_x_dropoff*NCBIMATH_LN2 / min_lambda);
/* Note that this conversion from bits to raw score is done prematurely
@@ -732,9 +736,6 @@ BlastHitSavingParametersUpdate(EBlastProgramType program_number,
return 0;
}
-/** machine epsilon assumed by CalculateLinkHSPCutoffs */
-#define MY_EPS 1.0e-9
-
/* FIXME, move to blast_engine.c and make private? */
void
CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info,
@@ -742,34 +743,34 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info,
const BlastInitialWordParameters* word_params,
Int8 db_length, Int4 subject_length)
{
- double gap_prob, gap_decay_rate, x_variable, y_variable;
Blast_KarlinBlk* kbp;
+ double gap_prob, gap_decay_rate, x_variable, y_variable;
Int4 expected_length, window_size, query_length;
Int8 search_sp;
- Int4 concat_qlen;
+ const double kEpsilon = 1.0e-9;
if (!link_hsp_params)
return;
- /* Do this for the first context, should this be changed?? */
- kbp = sbp->kbp[query_info->first_context];
+ /* Get KarlinBlk for context with smallest lambda (still greater than zero) */
+ s_BlastFindSmallestLambda(sbp->kbp, query_info, &kbp);
window_size
= link_hsp_params->gap_size + link_hsp_params->overlap_size + 1;
gap_prob = link_hsp_params->gap_prob = BLAST_GAP_PROB;
gap_decay_rate = link_hsp_params->gap_decay_rate;
/* Use average query length */
- concat_qlen =
- query_info->contexts[query_info->last_context].query_offset +
- query_info->contexts[query_info->last_context].query_length - 1;
-
- query_length = concat_qlen / (query_info->last_context + 1);
+ query_length =
+ (query_info->contexts[query_info->last_context].query_offset +
+ query_info->contexts[query_info->last_context].query_length - 1)
+ / (query_info->last_context + 1);
if (Blast_SubjectIsTranslated(program) || program == eBlastTypeRpsTblastn) {
/* Lengths in subsequent calculations should be on the protein scale */
subject_length /= CODON_LENGTH;
db_length /= CODON_LENGTH;
}
+
/* Subtract off the expected score. */
expected_length = BLAST_Nint(log(kbp->K*((double) query_length)*
@@ -789,6 +790,7 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info,
y_variable = log((double) (subject_length + expected_length)/
(double) subject_length)*(kbp->K)/(gap_decay_rate);
}
+
search_sp = ((Int8) query_length)* ((Int8) subject_length);
x_variable = 0.25*y_variable*((double) search_sp);
@@ -798,11 +800,11 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info,
are being checked for. */
if (search_sp > 8*window_size*window_size) {
- x_variable /= (1.0 - gap_prob + MY_EPS);
+ x_variable /= (1.0 - gap_prob + kEpsilon);
link_hsp_params->cutoff_big_gap =
(Int4) floor((log(x_variable)/kbp->Lambda)) + 1;
x_variable = y_variable*(window_size*window_size);
- x_variable /= (gap_prob + MY_EPS);
+ x_variable /= (gap_prob + kEpsilon);
link_hsp_params->cutoff_small_gap =
MAX(word_params->cutoff_score,
(Int4) floor((log(x_variable)/kbp->Lambda)) + 1);
@@ -824,6 +826,12 @@ CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo* query_info,
* ===========================================================================
*
* $Log: blast_parameters.c,v $
+ * Revision 1.12 2005/11/16 14:27:03 madden
+ * Fix spelling in CRN
+ *
+ * Revision 1.11 2005/11/04 13:26:20 madden
+ * Fixes to CalculateLinkHSPCutoffs so that invalid KarlinBlk is not used
+ *
* Revision 1.10 2005/06/08 17:27:53 madden
* Use functions from blast_program.c
*
diff --git a/algo/blast/core/blast_parameters.h b/algo/blast/core/blast_parameters.h
index df09213f..3c683ce4 100644
--- a/algo/blast/core/blast_parameters.h
+++ b/algo/blast/core/blast_parameters.h
@@ -1,4 +1,4 @@
-/* $Id: blast_parameters.h,v 1.5 2005/02/08 14:45:55 madden Exp $
+/* $Id: blast_parameters.h,v 1.6 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_program.c b/algo/blast/core/blast_program.c
index c9edd3ad..009a5fb2 100644
--- a/algo/blast/core/blast_program.c
+++ b/algo/blast/core/blast_program.c
@@ -1,6 +1,6 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_program.c,v 1.2 2005/06/08 19:30:51 camacho Exp $";
+ "$Id: blast_program.c,v 1.3 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
/* ===========================================================================
*
@@ -9,7 +9,7 @@ static char const rcsid[] =
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_program.h b/algo/blast/core/blast_program.h
index cd747bcc..6223cfca 100644
--- a/algo/blast/core/blast_program.h
+++ b/algo/blast/core/blast_program.h
@@ -1,7 +1,7 @@
#ifndef ALGO_BLAST_CORE___BLAST_PROGRAM__H
#define ALGO_BLAST_CORE___BLAST_PROGRAM__H
-/* $Id: blast_program.h,v 1.1 2005/06/08 17:25:37 madden Exp $
+/* $Id: blast_program.h,v 1.2 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -9,7 +9,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_psi.c b/algo/blast/core/blast_psi.c
index d89fae97..88b18416 100644
--- a/algo/blast/core/blast_psi.c
+++ b/algo/blast/core/blast_psi.c
@@ -1,6 +1,6 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_psi.c,v 1.29 2005/05/23 15:32:56 camacho Exp $";
+ "$Id: blast_psi.c,v 1.30 2005/11/16 14:27:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
/* ===========================================================================
*
@@ -9,7 +9,7 @@ static char const rcsid[] =
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_psi.h b/algo/blast/core/blast_psi.h
index 40e59b32..e5227c51 100644
--- a/algo/blast/core/blast_psi.h
+++ b/algo/blast/core/blast_psi.h
@@ -1,7 +1,7 @@
#ifndef ALGO_BLAST_CORE___BLAST_PSI__H
#define ALGO_BLAST_CORE___BLAST_PSI__H
-/* $Id: blast_psi.h,v 1.15 2005/05/20 18:18:31 camacho Exp $
+/* $Id: blast_psi.h,v 1.16 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -9,7 +9,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_psi_priv.c b/algo/blast/core/blast_psi_priv.c
index 92dfb1a7..1062ac8a 100644
--- a/algo/blast/core/blast_psi_priv.c
+++ b/algo/blast/core/blast_psi_priv.c
@@ -1,6 +1,6 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_psi_priv.c,v 1.53 2005/04/21 20:26:57 camacho Exp $";
+ "$Id: blast_psi_priv.c,v 1.57 2005/11/18 20:09:45 camacho Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
/* ===========================================================================
*
@@ -454,7 +454,7 @@ _PSISequenceWeightsFree(_PSISequenceWeights* seq_weights)
}
#ifdef _DEBUG
-static char getRes(char input)
+char GetResidue(char input)
{
switch (input) {
case 0: return ('-');
@@ -502,7 +502,7 @@ __printMsa(const char* filename, const _PSIMsa* msa)
/*fprintf(fp, "%3d\t", i);*/
for (j = 0; j < msa->dimensions->query_length; j++) {
if (msa->cell[i][j].is_aligned) {
- fprintf(fp, "%c", getRes(msa->cell[i][j].letter));
+ fprintf(fp, "%c", GetResidue(msa->cell[i][j].letter));
} else {
fprintf(fp, ".");
}
@@ -1637,6 +1637,15 @@ _PSISpreadGapWeights(const _PSIMsa* msa,
}
}
+/** The following define enables/disables the check at the end of
+ * _PSICheckSequenceWeights for the case in which no sequence weights were
+ * verified. When it is enabled (non-zero), an assertion fails if none of the
+ * sequence weights were checked to be in the proper range. The C toolkit code
+ * silently ignores this situation, so the check is disabled here for
+ * backwards compatibility.
+ */
+#define SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 0
+
/* Verifies that each column of the match_weights field in the seq_weights
* structure adds up to 1. */
static int
@@ -1646,9 +1655,12 @@ _PSICheckSequenceWeights(const _PSIMsa* msa,
{
const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
Uint4 pos = 0; /* residue position (ie: column number) */
- Boolean check_performed = FALSE; /* were there any sequences checked? */
const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
+#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
+ Boolean check_performed = FALSE; /* were there any sequences checked? */
+#endif
+
ASSERT(msa);
ASSERT(seq_weights);
@@ -1659,6 +1671,9 @@ _PSICheckSequenceWeights(const _PSIMsa* msa,
if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs ||
msa->cell[kQueryIndex][pos].letter == kXResidue) {
+ /* N.B.: the following statement allows for the sequence weights to
+ * go unchecked. To allow more strict checking, enable the
+ * SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE #define above */
continue;
}
@@ -1669,15 +1684,19 @@ _PSICheckSequenceWeights(const _PSIMsa* msa,
if (running_total < 0.99 || running_total > 1.01) {
return PSIERR_BADSEQWEIGHTS;
}
+#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
check_performed = TRUE;
+#endif
}
+#if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
/* This condition should never happen because it means that no sequences
* were selected to calculate the sequence weights! */
if ( !check_performed &&
!nsg_compatibility_mode ) { /* old code didn't check for this... */
assert(!"Did not perform sequence weights check");
}
+#endif
return PSI_SUCCESS;
}
@@ -1923,7 +1942,7 @@ _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData* internal_pssm,
internal_pssm->pssm[i][j] = sbp->matrix->data[kResidue][j];
- if (sbp->matrix->data[kResidue][j] != BLAST_SCORE_MIN) {
+ if (freq_ratios->data[kResidue][j] != 0.0) {
double tmp =
kPSIScaleFactor * freq_ratios->bit_scale_factor *
log(freq_ratios->data[kResidue][j])/NCBIMATH_LN2;
@@ -1959,8 +1978,8 @@ _PSIScaleMatrix(const Uint1* query,
int** scaled_pssm = NULL;
int** pssm = NULL;
double factor;
- double factor_low = 0.0;
- double factor_high = 0.0;
+ double factor_low = 1.0;
+ double factor_high = 1.0;
double ideal_lambda = 0.0; /* ideal value of ungapped lambda for
underlying scoring matrix */
double new_lambda = 0.0; /* Karlin-Altschul parameter calculated
@@ -2004,6 +2023,7 @@ _PSIScaleMatrix(const Uint1* query,
if (first_time) {
factor_high = 1.0 + kPositScalingPercent;
factor = factor_high;
+ factor_low = 1.0;
too_high = TRUE;
first_time = FALSE;
} else {
@@ -2188,7 +2208,7 @@ _PSIComputeScoreProbabilities(const int** pssm, /* [in] */
}
ASSERT(score_freqs->score_avg == 0.0);
- for (s = min_score; s < max_score; s++) {
+ for (s = min_score; s <= max_score; s++) {
score_freqs->score_avg += (s * score_freqs->sprob[s]);
}
@@ -2358,6 +2378,19 @@ _PSISaveDiagnostics(const _PSIMsa* msa,
/*
* ===========================================================================
* $Log: blast_psi_priv.c,v $
+ * Revision 1.57 2005/11/18 20:09:45 camacho
+ * Fixes for backwards compatibility with C toolkit PSSM engine for certain corner
+ * cases.
+ *
+ * Revision 1.56 2005/10/17 18:34:54 camacho
+ * Remove abort() call when sequence weights are not checked
+ *
+ * Revision 1.55 2005/10/05 14:09:30 camacho
+ * Port change in revision 6.76 of posit.c
+ *
+ * Revision 1.54 2005/10/03 20:42:41 camacho
+ * Minor
+ *
* Revision 1.53 2005/04/21 20:26:57 camacho
* Relax validation in s_PSIValidateAlignedColumns so that query sequence can be
* the only aligned sequence for a given column of the multiple sequence
diff --git a/algo/blast/core/blast_rps.h b/algo/blast/core/blast_rps.h
index 39e8d5e9..9731679b 100644
--- a/algo/blast/core/blast_rps.h
+++ b/algo/blast/core/blast_rps.h
@@ -1,4 +1,4 @@
-/* $Id: blast_rps.h,v 1.8 2004/11/04 15:52:14 papadopo Exp $
+/* $Id: blast_rps.h,v 1.9 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_seg.h b/algo/blast/core/blast_seg.h
index 152d50e2..7f29a78f 100644
--- a/algo/blast/core/blast_seg.h
+++ b/algo/blast/core/blast_seg.h
@@ -1,4 +1,4 @@
-/* $Id: blast_seg.h,v 1.15 2004/11/29 19:54:00 dondosha Exp $
+/* $Id: blast_seg.h,v 1.16 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/blast_setup.c b/algo/blast/core/blast_setup.c
index 969385f9..ac4da0e2 100644
--- a/algo/blast/core/blast_setup.c
+++ b/algo/blast/core/blast_setup.c
@@ -1,4 +1,4 @@
-/* $Id: blast_setup.c,v 1.123 2005/08/15 16:11:43 dondosha Exp $
+/* $Id: blast_setup.c,v 1.127 2005/10/03 12:57:03 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_setup.c,v 1.123 2005/08/15 16:11:43 dondosha Exp $";
+ "$Id: blast_setup.c,v 1.127 2005/10/03 12:57:03 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_setup.h>
@@ -69,7 +69,7 @@ Blast_ScoreBlkKbpGappedCalc(BlastScoreBlk * sbp,
Blast_KarlinBlkNuclGappedCalc(sbp->kbp_gap_std[index],
scoring_options->gap_open, scoring_options->gap_extend,
scoring_options->reward, scoring_options->penalty,
- sbp->kbp_std[index], error_return);
+ sbp->kbp_std[index], &(sbp->round_down), error_return);
} else {
retval =
Blast_KarlinBlkGappedCalc(sbp->kbp_gap_std[index],
@@ -385,57 +385,62 @@ BlastSetup_ScoreBlkInit(BLAST_SequenceBlk* query_blk,
Int2 BLAST_MainSetUp(EBlastProgramType program_number,
const QuerySetUpOptions *qsup_options,
const BlastScoringOptions *scoring_options,
- const BlastHitSavingOptions *hit_options,
BLAST_SequenceBlk *query_blk,
const BlastQueryInfo *query_info,
double scale_factor,
BlastSeqLoc **lookup_segments,
- BlastMaskInformation* maskInfo,
+ BlastMaskLoc **mask,
BlastScoreBlk **sbpp,
Blast_Message **blast_message)
{
Boolean mask_at_hash = FALSE; /* mask only for making lookup table? */
Int2 status = 0; /* return value */
BlastMaskLoc *filter_maskloc = NULL; /* Local variable for mask locs. */
- SBlastFilterOptions* filter_options = NULL;
+ SBlastFilterOptions* filter_options = qsup_options->filtering_options;
+ Boolean filter_options_allocated = FALSE;
- if (maskInfo)
- {
- maskInfo->filter_slp = NULL;
- maskInfo->mask_at_hash = FALSE;
- }
- if (qsup_options->filtering_options == NULL && qsup_options->filter_string)
+ if (mask)
+ *mask = NULL;
+
+ if (filter_options == NULL && qsup_options->filter_string)
{
- status = BlastFilteringOptionsFromString(program_number, qsup_options->filter_string, &filter_options, blast_message);
- if (status)
+ status = BlastFilteringOptionsFromString(program_number,
+ qsup_options->filter_string,
+ &filter_options,
+ blast_message);
+ if (status) {
+ filter_options = SBlastFilterOptionsFree(filter_options);
return status;
+ }
+ filter_options_allocated = TRUE;
}
+ ASSERT(filter_options);
status = BlastSetUp_GetFilteringLocations(query_blk,
query_info,
program_number,
- filter_options ? filter_options : qsup_options->filtering_options,
+ filter_options,
& filter_maskloc,
blast_message);
-
if (status) {
+ if (filter_options_allocated)
+ filter_options = SBlastFilterOptionsFree(filter_options);
return status;
}
- mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options ? filter_options : qsup_options->filtering_options);
+ mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options);
- filter_options = SBlastFilterOptionsFree(filter_options);
+ if (filter_options_allocated) {
+ filter_options = SBlastFilterOptionsFree(filter_options);
+ }
- if (!mask_at_hash)
- {
- status = BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc,
- program_number);
- if (status != 0) {
- return status;
- }
+
+ if (!mask_at_hash) {
+ BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc,
+ program_number);
}
if (program_number == eBlastTypeBlastx && scoring_options->is_ooframe) {
@@ -451,17 +456,14 @@ Int2 BLAST_MainSetUp(EBlastProgramType program_number,
filter_maskloc, lookup_segments);
}
- if (maskInfo)
+ if (mask)
{
- if (program_number == eBlastTypeBlastx ||
- program_number == eBlastTypeTblastx ||
- program_number == eBlastTypeRpsTblastn) {
+ if (Blast_QueryIsTranslated(program_number)) {
/* Filter locations so far are in protein coordinates;
convert them back to nucleotide here. */
BlastMaskLocProteinToDNA(filter_maskloc, query_info);
}
- maskInfo->filter_slp = filter_maskloc;
- maskInfo->mask_at_hash = mask_at_hash;
+ *mask = filter_maskloc;
filter_maskloc = NULL;
}
else
@@ -470,11 +472,8 @@ Int2 BLAST_MainSetUp(EBlastProgramType program_number,
status = BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_options,
program_number, sbpp, scale_factor,
blast_message);
- if (status > 0) {
- return status;
- }
- return 0;
+ return status;
}
@@ -688,8 +687,7 @@ BlastSeqLoc_RestrictToInterval(BlastSeqLoc* *mask, Int4 from, Int4 to)
/* Shift the pointer to the next link in chain and free this link. */
if (last_loc)
last_loc->next = seqloc->next;
- sfree(seqloc->ssr);
- sfree(seqloc);
+ seqloc = BlastSeqLocNodeFree(seqloc);
} else if (!head_loc) {
/* First time a mask was found within the range. */
head_loc = last_loc = seqloc;
diff --git a/algo/blast/core/blast_setup.h b/algo/blast/core/blast_setup.h
index 622e3dc3..3614d9b4 100644
--- a/algo/blast/core/blast_setup.h
+++ b/algo/blast/core/blast_setup.h
@@ -1,4 +1,4 @@
-/* $Id: blast_setup.h,v 1.54 2005/08/15 16:10:21 dondosha Exp $
+/* $Id: blast_setup.h,v 1.55 2005/08/29 14:32:36 dondosha Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -51,13 +51,12 @@ extern "C" {
* @param program_number Type of BLAST program (0=blastn, ...). [in]
* @param qsup_options options for query setup. [in]
* @param scoring_options options for scoring. [in]
- * @param hit_options options for saving hits. [in]
* @param query_blk BLAST_SequenceBlk* for the query. [in]
* @param query_info The query information block [in]
* @param scale_factor Multiplier for cutoff and dropoff scores [in]
* @param lookup_segments Start/stop locations for non-masked query
* segments [out]
- * @param maskInfo masking location information, mask_at_hash value. [out]
+ * @param mask masking locations. [out]
* @param sbpp Contains scoring information. [out]
* @param blast_message error or warning [out]
*/
@@ -65,12 +64,11 @@ NCBI_XBLAST_EXPORT
Int2 BLAST_MainSetUp(EBlastProgramType program_number,
const QuerySetUpOptions* qsup_options,
const BlastScoringOptions* scoring_options,
- const BlastHitSavingOptions* hit_options,
BLAST_SequenceBlk* query_blk,
const BlastQueryInfo* query_info,
double scale_factor,
BlastSeqLoc* *lookup_segments,
- BlastMaskInformation* maskInfo,
+ BlastMaskLoc* *mask,
BlastScoreBlk* *sbpp,
Blast_Message* *blast_message);
@@ -242,6 +240,11 @@ PHIPatternSpaceCalc(BlastQueryInfo* query_info,
/*
*
* $Log: blast_setup.h,v $
+* Revision 1.55 2005/08/29 14:32:36 dondosha
+* From Ilya Dondoshansky:
+* Removed BlastMaskInformation wrapper structure, because mask_at_hash can
+* be retrieved from options
+*
* Revision 1.54 2005/08/15 16:10:21 dondosha
* Added error return argument to Blast_ScoreBlkKbpGappedCalc
*
diff --git a/algo/blast/core/blast_stat.c b/algo/blast/core/blast_stat.c
index e5c57887..a82f2201 100644
--- a/algo/blast/core/blast_stat.c
+++ b/algo/blast/core/blast_stat.c
@@ -1,4 +1,4 @@
-/* $Id: blast_stat.c,v 1.123 2005/08/19 17:56:18 dondosha Exp $
+/* $Id: blast_stat.c,v 1.136 2005/11/14 15:55:42 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -50,7 +50,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_stat.c,v 1.123 2005/08/19 17:56:18 dondosha Exp $";
+ "$Id: blast_stat.c,v 1.136 2005/11/14 15:55:42 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_stat.h>
@@ -295,6 +295,7 @@ static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = {
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
+ BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_BEST,
BLAST_MATRIX_NOMINAL
}; /**< Quality values for BLOSUM80 matrix, each element corresponds to same element number in array blosum80_values */
@@ -351,8 +352,8 @@ BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
-BLAST_MATRIX_BEST,
BLAST_MATRIX_NOMINAL,
+BLAST_MATRIX_BEST,
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
BLAST_MATRIX_NOMINAL,
@@ -562,6 +563,12 @@ BLAST_MATRIX_NOMINAL
* 8. Theta
*/
+/** Karlin-Altschul parameter values for substitution scores 1 and -5. */
+static const array_of_8 blastn_values_1_5[] = {
+ { 0, 0, 1.39, 0.747, 1.38, 1.00, 0, 100 },
+ { 3, 3, 1.39, 0.747, 1.38, 1.00, 0, 100 }
+};
+
/** Karlin-Altschul parameter values for substitution scores 1 and -4. */
static const array_of_8 blastn_values_1_4[] = {
{ 0, 0, 1.383, 0.738, 1.36, 1.02, 0, 100 },
@@ -632,6 +639,16 @@ static const array_of_8 blastn_values_2_3[] = {
{ 2, 2, 0.515, 0.14, 0.33, 1.55, -9, 81 }
};
+/** Karlin-Altschul parameter values for substitution scores 3 and -4. */
+static const array_of_8 blastn_values_3_4[] = {
+ { 6, 3, 0.389, 0.25, 0.56, 0.7, -5, 95},
+ { 5, 3, 0.375, 0.21, 0.47, 0.8, -6, 92},
+ { 4, 3, 0.351, 0.14, 0.35, 1.0, -9, 86},
+ { 6, 2, 0.362, 0.16, 0.45, 0.8, -4, 88},
+ { 5, 2, 0.330, 0.092, 0.28, 1.2, -13, 81},
+ { 4, 2, 0.281, 0.046, 0.16, 1.8, -23, 69}
+};
+
/** Karlin-Altschul parameter values for substitution scores 4 and -5. */
static const array_of_8 blastn_values_4_5[] = {
{ 0, 0, 0.22, 0.061, 0.22, 1.0, -15, 74 },
@@ -652,6 +669,11 @@ static const array_of_8 blastn_values_1_1[] = {
{ 2, 1, 0.99, 0.17, 0.30, 3.3, -10, 90 }
};
+/** Karlin-Altschul parameter values for substitution scores 3 and -2. */
+static const array_of_8 blastn_values_3_2[] = {
+ { 5, 5, 0.208, 0.030, 0.072, 2.9, -47, 77}
+};
+
/** Karlin-Altschul parameter values for substitution scores 5 and -4. */
static const array_of_8 blastn_values_5_4[] = {
{ 10, 6, 0.163, 0.068, 0.16, 1.0, -19, 85 },
@@ -2697,6 +2719,306 @@ void BLAST_GetAlphaBeta(const char* matrixName, double *alpha,
sfree(beta_arr);
}
+static Int2
+s_SplitArrayOf8(const array_of_8* input, const array_of_8** normal, const array_of_8** non_affine, Boolean *split)
+{
+
+ if (input == NULL || normal == NULL || non_affine == NULL)
+ return -1;
+
+ *normal = NULL;
+ *non_affine = NULL;
+
+ if (input[0][0] == 0 && input[0][1] == 0)
+ {
+ *normal = input+1;
+ *non_affine = input;
+ *split = TRUE;
+ }
+ else
+ {
+ *normal = input;
+ *split = FALSE;
+ }
+ return 0;
+
+}
+
+/** Adjust Lambda and alpha (and scale the gap costs) if reward and penalty
+ * have a non-1 gcd. The two arrays (normal and linear) must already be filled in.
+ * @param normal the values for normal (e.g, "affine") gap costs [in|out]
+ * @param linear specialized values used for megablast [in|out]
+ * @param size Number of supported combinations for this match/mismatch pair [in]
+ * @param gap_existence_max start of infinite regime for gap existence [in|out]
+ * @param gap_extend_max start of infinite regime for gap extension [in|out]
+ * @param divisor divisor for lambda and alpha, multiplier for the gap costs [in]
+*/
+static Int2
+s_AdjustGapParametersByGcd(array_of_8* normal, array_of_8* linear, int size, Int4* gap_existence_max, Int4* gap_extend_max, int divisor)
+{
+ if (divisor == 1)
+ return 0;
+
+ if (size <=0)
+ return 1;
+
+ (*gap_existence_max) *= divisor;
+ (*gap_extend_max) *= divisor;
+
+ if (normal)
+ {
+ int i;
+
+ for (i=0; i<size; i++)
+ { /* divide lambda and alpha by divisor. */
+ /* multiply gap existence and extension by divisor. */
+ normal[i][0] *= divisor;
+ normal[i][1] *= divisor;
+ normal[i][2] /= divisor;
+ normal[i][5] /= divisor;
+ }
+ }
+ if (linear)
+ { /* divide lambda and alpha by divisor. */
+ linear[0][0] *= divisor;
+ linear[0][1] *= divisor;
+ linear[0][2] /= divisor;
+ linear[0][5] /= divisor;
+ }
+
+ return 0;
+}
+
+/** Returns the array of values corresponding to the given match/mismatch
+ * scores, the number of supported gap cost combinations and thresholds for
+ * the gap costs, beyond which the ungapped statistics can be applied.
+ * @param reward Match reward score [in]
+ * @param penalty Mismatch penalty score [in]
+ * @param array_size Number of supported combinations for this match/mismatch
+ * pair [out]
+ * @param normal copy of the values for normal (i.e., "affine") gap costs, to be freed by the caller [out]
+ * @param non_affine copy of the specialized values used for megablast, to be freed by the caller; NULL if there are none [out]
+ * @param gap_open_max Gap opening cost threshold for infinite gap costs [out]
+ * @param gap_extend_max Gap extension cost threshold for infinite gap costs [out]
+ * @param round_down if set to TRUE only even scores should be used for calculation
+ * of expect value or bit scores [out]
+ * @param error_return Pointer to error message [out]
+ * @return zero on success, other values if error
+ */
+static Int2
+s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size,
+ array_of_8** normal, array_of_8** non_affine,
+ Int4* gap_open_max, Int4* gap_extend_max, Boolean* round_down,
+ Blast_Message** error_return)
+{
+ Int2 status = 0;
+ const array_of_8 * kValues = NULL;
+ const array_of_8 * kValues_non_affine = NULL;
+ Boolean split = FALSE;
+ int divisor = BLAST_Gcd(reward, penalty);
+
+ *round_down = FALSE;
+
+ *array_size = 0;
+ *normal = NULL;
+ *non_affine = NULL;
+
+ if (divisor != 1)
+ {
+ reward /= divisor;
+ penalty /= divisor;
+ }
+
+ if (reward == 1 && penalty == -5) {
+ if ((status=s_SplitArrayOf8(blastn_values_1_5, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_1_5)/sizeof(array_of_8);
+ *gap_open_max = 3;
+ *gap_extend_max = 3;
+ } else if (reward == 1 && penalty == -4) {
+ if ((status=s_SplitArrayOf8(blastn_values_1_4, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8);
+ *gap_open_max = 2;
+ *gap_extend_max = 2;
+ } else if (reward == 2 && penalty == -7) {
+ if ((status=s_SplitArrayOf8(blastn_values_2_7, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *round_down = TRUE;
+ *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8);
+ *gap_open_max = 4;
+ *gap_extend_max = 4;
+ } else if (reward == 1 && penalty == -3) {
+ if ((status=s_SplitArrayOf8(blastn_values_1_3, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8);
+ *gap_open_max = 2;
+ *gap_extend_max = 2;
+ } else if (reward == 2 && penalty == -5) {
+ if ((status=s_SplitArrayOf8(blastn_values_2_5, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *round_down = TRUE;
+ *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8);
+ *gap_open_max = 4;
+ *gap_extend_max = 4;
+ } else if (reward == 1 && penalty == -2) {
+ if ((status=s_SplitArrayOf8(blastn_values_1_2, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8);
+ *gap_open_max = 2;
+ *gap_extend_max = 2;
+ } else if (reward == 2 && penalty == -3) {
+ if ((status=s_SplitArrayOf8(blastn_values_2_3, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *round_down = TRUE;
+ *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8);
+ *gap_open_max = 6;
+ *gap_extend_max = 4;
+ } else if (reward == 3 && penalty == -4) {
+ if ((status=s_SplitArrayOf8(blastn_values_3_4, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *round_down = TRUE;
+ *array_size = sizeof(blastn_values_3_4)/sizeof(array_of_8);
+ *gap_open_max = 6;
+ *gap_extend_max = 3;
+ } else if (reward == 1 && penalty == -1) {
+ if ((status=s_SplitArrayOf8(blastn_values_1_1, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8);
+ *gap_open_max = 4;
+ *gap_extend_max = 2;
+ } else if (reward == 3 && penalty == -2) {
+ if ((status=s_SplitArrayOf8(blastn_values_3_2, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_3_2)/sizeof(array_of_8);
+ *gap_open_max = 5;
+ *gap_extend_max = 5;
+ } else if (reward == 4 && penalty == -5) {
+ if ((status=s_SplitArrayOf8(blastn_values_4_5, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8);
+ *gap_open_max = 12;
+ *gap_extend_max = 8;
+ } else if (reward == 5 && penalty == -4) {
+ if ((status=s_SplitArrayOf8(blastn_values_5_4, &kValues, &kValues_non_affine, &split)))
+ return status;
+
+ *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8);
+ *gap_open_max = 25;
+ *gap_extend_max = 10;
+ } else { /* Unsupported reward-penalty */
+ status = -1;
+ if (error_return) {
+ char buffer[256];
+ sprintf(buffer, "Substitution scores %d and %d are not supported",
+ reward, penalty);
+ Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer);
+ }
+ }
+ if (split)
+ (*array_size)--;
+
+ if (status == 0)
+ {
+ if (*array_size > 0)
+ *normal = BlastMemDup(kValues, (*array_size)*sizeof(array_of_8));
+ if (kValues_non_affine)
+ *non_affine = BlastMemDup(kValues_non_affine, sizeof(array_of_8));
+
+ status = s_AdjustGapParametersByGcd(*normal, *non_affine, *array_size, gap_open_max, gap_extend_max, divisor);
+ }
+
+ return status;
+}
+
+Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName,
+ Int4* gap_existence,
+ Int4* gap_extension)
+{
+ Int4* gapOpen_arr,* gapExtend_arr,* pref_flags;
+ Int4 i; /*loop index*/
+ Int2 num_values = Blast_GetMatrixValues(matrixName, &gapOpen_arr,
+ &gapExtend_arr, NULL, NULL, NULL, NULL, NULL, NULL, &pref_flags);
+
+ if (num_values <= 0)
+ return -1;
+
+ for(i = 1; i < num_values; i++) {
+ if(pref_flags[i]==BLAST_MATRIX_BEST) {
+ (*gap_existence) = gapOpen_arr[i];
+ (*gap_extension) = gapExtend_arr[i];
+ break;
+ }
+ }
+
+ sfree(gapOpen_arr);
+ sfree(gapExtend_arr);
+ sfree(pref_flags);
+
+ return 0;
+}
+
+
+Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward,
+ Int4 penalty,
+ Int4* gap_existence,
+ Int4* gap_extension)
+{
+ int array_size = 0; /* dummy parameter. */
+ array_of_8* normal=NULL; /* dummy parameter */
+ array_of_8* non_affine=NULL; /* dummy parameter */
+ Boolean round_down = FALSE;
+ int gap_existence_max=0;
+ int gap_extension_max=0;
+ Int2 status = s_GetNuclValuesArray(reward, penalty, &array_size, &normal, &non_affine,
+ &gap_existence_max, &gap_extension_max, &round_down, NULL);
+
+ if (status)
+ {
+ sfree(normal);
+ sfree(non_affine);
+ return status;
+ }
+
+ if (*gap_existence == 0 && *gap_extension == 0 && non_affine)
+ status = 0; /* these values are supported. */
+ else
+ {
+ int index=0;
+ Boolean found=FALSE;
+ while (index < array_size)
+ {
+ if (*gap_existence == normal[index][0] && *gap_extension == normal[index][1])
+ {
+ found = TRUE;
+ break; /* these values are supported. */
+ }
+ index++;
+ }
+
+ if (!found)
+ {
+ *gap_existence = gap_existence_max;
+ *gap_extension = gap_extension_max;
+ }
+ status = 0;
+ }
+ sfree(normal);
+ sfree(non_affine);
+ return status;
+}
+
/** Fills in error_return with strings describing the allowed values.
* @param matrix_name name of the matrix [in]
* @param error_return object to be filled in [in|out]
@@ -2956,85 +3278,11 @@ BLAST_PrintAllowedValues(const char *matrix_name, Int4 gap_open, Int4 gap_extend
return buffer;
}
-/** Returns the array of values corresponding to the given match/mismatch
- * scores, the number of supported gap cost combinations and thresholds for
- * the gap costs, beyond which the ungapped statistics can be applied.
- * @param reward Match reward score [in]
- * @param penalty Mismatch penalty score [in]
- * @param array_size Number of supported combinations for this match/mismatch
- * pair [out]
- * @param gap_open_max Gap opening cost threshold for infinite gap costs [in]
- * @param gap_extend_max Gap extension cost threshold for infinite gap costs [in]
- * @param error_return Pointer to error message [in] [out]
- * @return Corresponding array of values.
- */
-static const array_of_8 *
-s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size,
- Int4* gap_open_max, Int4* gap_extend_max,
- Blast_Message** error_return)
-{
- const array_of_8 * kValues = NULL;
-
- if (reward == 1 && penalty == -4) {
- kValues = blastn_values_1_4;
- *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8);
- *gap_open_max = 2;
- *gap_extend_max = 2;
- } else if (reward == 2 && penalty == -7) {
- kValues = blastn_values_2_7;
- *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8);
- *gap_open_max = 4;
- *gap_extend_max = 4;
- } else if (reward == 1 && penalty == -3) {
- kValues = blastn_values_1_3;
- *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8);
- *gap_open_max = 2;
- *gap_extend_max = 2;
- } else if (reward == 2 && penalty == -5) {
- kValues = blastn_values_2_5;
- *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8);
- *gap_open_max = 4;
- *gap_extend_max = 4;
- } else if (reward == 1 && penalty == -2) {
- kValues = blastn_values_1_2;
- *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8);
- *gap_open_max = 2;
- *gap_extend_max = 2;
- } else if (reward == 2 && penalty == -3) {
- kValues = blastn_values_2_3;
- *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8);
- *gap_open_max = 6;
- *gap_extend_max = 4;
- } else if (reward == 1 && penalty == -1) {
- kValues = blastn_values_1_1;
- *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8);
- *gap_open_max = 4;
- *gap_extend_max = 2;
- } else if (reward == 4 && penalty == -5) {
- kValues = blastn_values_4_5;
- *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8);
- *gap_open_max = 12;
- *gap_extend_max = 8;
- } else if (reward == 5 && penalty == -4) {
- kValues = blastn_values_5_4;
- *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8);
- *gap_open_max = 25;
- *gap_extend_max = 10;
- } else if (error_return) {
- char buffer[256];
- /* Unsupported reward-penalty */
- sprintf(buffer, "Substitution scores %d and %d are not supported",
- reward, penalty);
- Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer);
- }
-
- return kValues;
-}
-
Int2
Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open,
Int4 gap_extend, Int4 reward, Int4 penalty,
Blast_KarlinBlk* kbp_ungap,
+ Boolean* round_down,
Blast_Message** error_return)
{
const int kGapOpenIndex = 0;
@@ -3043,53 +3291,88 @@ Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open,
const int kKIndex = 3;
const int kHIndex = 4;
int num_combinations = 0;
- int index;
int gap_open_max, gap_extend_max;
-
- const array_of_8 *kValues =
- s_GetNuclValuesArray(reward, penalty, &num_combinations,
- &gap_open_max, &gap_extend_max, error_return);
+ array_of_8* normal=NULL;
+ array_of_8* linear=NULL;
+ Int2 status = s_GetNuclValuesArray(reward,
+ penalty,
+ &num_combinations,
+ &normal,
+ &linear,
+ &gap_open_max,
+ &gap_extend_max,
+ round_down,
+ error_return);
+
+ if (status)
+ {
+ sfree(normal);
+ sfree(linear);
+ return status;
+ }
ASSERT(kbp && kbp_ungap);
-#ifndef NEW_BLASTN_STAT
- Blast_KarlinBlkCopy(kbp, kbp_ungap);
- return 0;
-#endif
-
- if (!kValues)
- return 1;
/* Try to find the table entry corresponding to input gap costs. */
- for (index = 0; index < num_combinations; ++index) {
- if (kValues[index][kGapOpenIndex] == gap_open &&
- kValues[index][kGapExtIndex] == gap_extend) {
- kbp->Lambda = kValues[index][kLambdaIndex];
- kbp->K = kValues[index][kKIndex];
- kbp->logK = log(kbp->K);
- kbp->H = kValues[index][kHIndex];
- break;
- }
+ if (gap_open == 0 && gap_extend == 0 && linear)
+ {
+ kbp->Lambda = linear[0][kLambdaIndex];
+ kbp->K = linear[0][kKIndex];
+ kbp->logK = log(kbp->K);
+ kbp->H = linear[0][kHIndex];
}
+ else
+ {
+ int index=0;
+ for (index = 0; index < num_combinations; ++index) {
+ if (normal[index][kGapOpenIndex] == gap_open &&
+ normal[index][kGapExtIndex] == gap_extend) {
+ kbp->Lambda = normal[index][kLambdaIndex];
+ kbp->K = normal[index][kKIndex];
+ kbp->logK = log(kbp->K);
+ kbp->H = normal[index][kHIndex];
+ break;
+ }
+ }
- /* If gap costs are not found in the table, check if they belong to the
- infinite domain, where ungapped values of the parameters can be used. */
- if (index == num_combinations) {
+ /* If gap costs are not found in the table, check if they belong to the
+ infinite domain, where ungapped values of the parameters can be used. */
+ if (index == num_combinations) {
/* If gap costs are larger than maximal provided in tables, copy
the values from the ungapped Karlin block. */
- if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) {
- Blast_KarlinBlkCopy(kbp, kbp_ungap);
- } else if (error_return) {
- char buffer[256];
- /* Unsupported gap costs combination. */
- sprintf(buffer, "Gap existence and extension values %d and %d "
- "are not supported for substitution scores %d and %d",
- gap_open, gap_extend, reward, penalty);
- Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer);
- return 1;
+ if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) {
+ Blast_KarlinBlkCopy(kbp, kbp_ungap);
+ } else if (error_return) {
+ char buffer[8192];
+ int i=0;
+ int len=0;
+ /* Unsupported gap costs combination. */
+ sprintf(buffer, "Gap existence and extension values %ld and %ld "
+ "are not supported for substitution scores %ld and %ld\n",
+ (long) gap_open, (long) gap_extend, (long) reward, (long) penalty);
+ for (i = 0; i < num_combinations; ++i)
+ {
+ len = strlen(buffer);
+ sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
+ (long) normal[i][kGapOpenIndex], (long) normal[i][kGapExtIndex]);
+ }
+ len = strlen(buffer);
+ sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
+ (long) gap_open_max, (long) gap_extend_max);
+ len = strlen(buffer);
+ sprintf(buffer+len, "Any values more stringent than %ld and %ld are supported\n",
+ (long) gap_open_max, (long) gap_extend_max);
+ Blast_MessageWrite(error_return, eBlastSevError, 0, 0, buffer);
+ sfree(normal);
+ sfree(linear);
+ return 1;
+ }
}
}
+ sfree(normal);
+ sfree(linear);
return 0;
}
@@ -3121,35 +3404,59 @@ Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open,
Int4 num_combinations = 0;
Int4 gap_open_max = 0, gap_extend_max = 0;
Int4 index = 0;
- const array_of_8 *kValues =
- s_GetNuclValuesArray(reward, penalty, &num_combinations,
- &gap_open_max, &gap_extend_max, NULL);;
+ array_of_8* normal=NULL;
+ array_of_8* linear=NULL;
+ Boolean round_down = FALSE;
+ Boolean found = FALSE;
+ Int2 status = s_GetNuclValuesArray(reward,
+ penalty,
+ &num_combinations,
+ &normal,
+ &linear,
+ &gap_open_max,
+ &gap_extend_max,
+ &round_down,
+ NULL);
- ASSERT(alpha && beta && kbp);
+ if (status)
+ return status;
-#ifndef NEW_BLASTN_STAT
- *alpha = kbp->Lambda/kbp->H;
- *beta = 0;
- return 0;
-#endif
+ ASSERT(alpha && beta && kbp);
/* For ungapped search return ungapped values of alpha and beta. */
- if (gapped_calculation && kValues) {
- for (index = 0; index < num_combinations; ++index) {
- if (kValues[index][kGapOpenIndex] == gap_open &&
- kValues[index][kGapExtIndex] == gap_extend) {
- *alpha = kValues[index][kAlphaIndex];
- *beta = kValues[index][kBetaIndex];
- return 0;
+ if (gapped_calculation && normal) {
+ if (gap_open == 0 && gap_extend == 0 && linear)
+ {
+ *alpha = linear[0][kAlphaIndex];
+ *beta = linear[0][kBetaIndex];
+ found = TRUE;
+ }
+ else
+ {
+
+ for (index = 0; index < num_combinations; ++index) {
+ if (normal[index][kGapOpenIndex] == gap_open &&
+ normal[index][kGapExtIndex] == gap_extend) {
+ *alpha = normal[index][kAlphaIndex];
+ *beta = normal[index][kBetaIndex];
+ found = TRUE;
+ break;
+ }
}
}
+
}
/* If input values not found in tables, or if this is an ungapped search,
return the ungapped values of alpha and beta. */
- *alpha = kbp->Lambda/kbp->H;
- *beta = s_GetUngappedBeta(reward, penalty);
+ if (!found)
+ {
+ *alpha = kbp->Lambda/kbp->H;
+ *beta = s_GetUngappedBeta(reward, penalty);
+ }
+ sfree(linear);
+ sfree(normal);
return 0;
}
@@ -3997,6 +4304,51 @@ BLAST_ComputeLengthAdjustment(double K,
* ===========================================================================
*
* $Log: blast_stat.c,v $
+ * Revision 1.136 2005/11/14 15:55:42 madden
+ * Correct comment
+ *
+ * Revision 1.135 2005/11/04 13:48:09 madden
+ * Doxygen fixes
+ *
+ * Revision 1.134 2005/11/01 18:49:01 madden
+ * Changes to s_GetNuclValuesArray and calling functions to support (for blastn) reward and penalty values that are multiples of already supported values
+ *
+ * Revision 1.133 2005/10/31 14:05:24 madden
+ * 1.) add support for blastn reward/penalty values of 1/-5, 3/-4, and 3/-2.
+ * 2.) BLAST_GetNucleotideGapExistenceExtendParams now validates value as well as suggesting a
+ * reasonable value.
+ *
+ * Revision 1.132 2005/10/14 17:29:22 madden
+ * Add preliminary support for vecscreen parameters
+ *
+ * Revision 1.131 2005/10/12 19:15:47 madden
+ * Fix bug in s_GetNuclValuesArray
+ *
+ * Revision 1.130 2005/09/27 14:43:56 madden
+ * Centralize round_down decision in s_GetNuclValuesArray
+ *
+ * Revision 1.129 2005/09/16 14:01:45 madden
+ * 1.) BLAST_GetGapExistenceExtendParams renamed to BLAST_GetProteinGapExistenceExtendParams
+ * 2.) Added BLAST_GetNucleotideGapExistenceExtendParams
+ * 3.) Added informative error message to s_GetNuclValuesArray
+ *
+ * Revision 1.128 2005/09/12 19:16:38 coulouri
+ * Enable precomputed statistical parameters for blastn
+ *
+ * Revision 1.127 2005/09/08 14:48:11 ucko
+ * Tweak Blast_KarlinBlkNuclGappedCalc and Blast_GetNuclAlphaBeta to
+ * declare kValues unconditionally, to fix compilation errors when
+ * NEW_BLASTN_STAT is undefined.
+ *
+ * Revision 1.126 2005/09/08 13:40:34 coulouri
+ * Call s_GetNuclValuesArray iff NEW_BLASTN_STAT
+ *
+ * Revision 1.125 2005/08/30 15:42:58 madden
+ * BLAST_GetGapExistenceExtendParams now takes program_number as an argument so it can properly identify blastn queries
+ *
+ * Revision 1.124 2005/08/29 13:52:05 madden
+ * Add BLAST_GetGapExistenceExtendParams
+ *
* Revision 1.123 2005/08/19 17:56:18 dondosha
* Removed unnecessary redefinition of HUGE_VAL
*
diff --git a/algo/blast/core/blast_stat.h b/algo/blast/core/blast_stat.h
index 2ab38c76..e8295c92 100644
--- a/algo/blast/core/blast_stat.h
+++ b/algo/blast/core/blast_stat.h
@@ -1,4 +1,4 @@
-/* $Id: blast_stat.h,v 1.70 2005/08/15 16:10:41 dondosha Exp $
+/* $Id: blast_stat.h,v 1.74 2005/09/27 14:43:16 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -160,6 +160,7 @@ protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */
Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/
Int2 ambig_size, /**< size of array above. FIXME: not needed here? */
ambig_occupy; /**< How many occupied? */
+ Boolean round_down; /**< Score must be rounded down to nearest even score if odd. */
} BlastScoreBlk;
/**
@@ -264,12 +265,15 @@ Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open,
* @param reward Match reward score [in]
* @param penalty Mismatch penalty score [in]
* @param kbp_ungap Karlin block with ungapped Karlin-Altschul parameters [in]
+ * @param round_down specifies that the score should be rounded down to nearest even
+ * score in some cases [in|out]
* @param error_return Pointer to error message. [in] [out]
*/
Int2
Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open,
Int4 gap_extend, Int4 reward, Int4 penalty,
Blast_KarlinBlk* kbp_ungap,
+ Boolean* round_down,
Blast_Message** error_return);
@@ -418,6 +422,30 @@ double BLAST_LargeGapSumE (Int2 num, double xsum,
Int4 query_length, Int4 subject_length,
Int8 searchsp_eff, double weight_divisor );
+/** Extract the recommended gap existence and extension values.
+ * Only to be used with protein matrices.
+ * @param matrixName name of the matrix [in]
+ * @param gap_existence returns recommended existence cost [in|out]
+ * @param gap_extension returns recommended extension cost [in|out]
+ * @return zero on success
+ */
+Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName,
+ Int4* gap_existence,
+ Int4* gap_extension);
+
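+/** Validate the gap existence and extension values for a blastn search,
+ * replacing them with recommended values if they are not supported.
+ * Only to be used with blastn searches.
+ * @param reward match score [in]
+ * @param penalty mismatch score [in]
+ * @param gap_existence existence cost to check; returns the recommended
+ * existence cost if the input values are not supported [in|out]
+ * @param gap_extension extension cost to check; returns the recommended
+ * extension cost if the input values are not supported [in|out]
+ * @return zero on success
+ */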
+Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward,
+ Int4 penalty,
+ Int4* gap_existence,
+ Int4* gap_extension);
+
/** Extract the alpha and beta settings for this matrixName, and these
* gap open and gap extension costs
* @param matrixName name of the matrix used [in]
diff --git a/algo/blast/core/blast_traceback.c b/algo/blast/core/blast_traceback.c
index 6cdfa6f2..04aa836a 100644
--- a/algo/blast/core/blast_traceback.c
+++ b/algo/blast/core/blast_traceback.c
@@ -1,4 +1,4 @@
-/* $Id: blast_traceback.c,v 1.175 2005/08/15 16:11:20 dondosha Exp $
+/* $Id: blast_traceback.c,v 1.179 2005/12/01 14:47:48 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -39,7 +38,7 @@
* for ( all HSP lists )
* Blast_TracebackFromHSPList
* else if ( composition based statistics )
- * Kappa_RedoAlignmentCore
+ * Blast_RedoAlignmentCore
* else
* for ( all HSP lists )
* if ( PHI BLAST)
@@ -51,7 +50,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_traceback.c,v 1.175 2005/08/15 16:11:20 dondosha Exp $";
+ "$Id: blast_traceback.c,v 1.179 2005/12/01 14:47:48 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_traceback.h>
@@ -269,10 +268,7 @@ s_HSPListPostTracebackUpdate(EBlastProgramType program_number,
/* For nucleotide search, if match score is = 2, the odd scores
are rounded down to the nearest even number. */
- if (program_number == eBlastTypeBlastn &&
- score_params->options->reward == 2) {
- Blast_HSPListAdjustOddBlastnScores(hsp_list);
- }
+ Blast_HSPListAdjustOddBlastnScores(hsp_list, kGapped, sbp);
Blast_HSPListGetEvalues(query_info, hsp_list, kGapped, sbp, 0,
scale_factor);
@@ -387,7 +383,7 @@ Blast_TracebackFromHSPList(EBlastProgramType program_number,
hsp = hsp_array[index];
if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
Int4 context = hsp->context - hsp->context % 3;
- Int4 context_offset = query_info->contexts[hsp->context].query_offset;
+ Int4 context_offset = query_info->contexts[context].query_offset;
query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
query_length = query_info->contexts[context+2].query_offset +
@@ -1015,7 +1011,7 @@ BLAST_ComputeTraceback(EBlastProgramType program_number,
(ext_params->options->compositionBasedStats == TRUE ||
ext_params->options->eTbackExt == eSmithWatermanTbck)) {
status =
- Kappa_RedoAlignmentCore(program_number, query, query_info, sbp,
+ Blast_RedoAlignmentCore(program_number, query, query_info, sbp,
hsp_stream, seq_src, gen_code_string,
score_params, ext_params, hit_params,
psi_options, results);
diff --git a/algo/blast/core/blast_traceback.h b/algo/blast/core/blast_traceback.h
index fe353777..5b31affa 100644
--- a/algo/blast/core/blast_traceback.h
+++ b/algo/blast/core/blast_traceback.h
@@ -1,4 +1,4 @@
-/* $Id: blast_traceback.h,v 1.46 2005/05/10 16:07:35 camacho Exp $
+/* $Id: blast_traceback.h,v 1.47 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
diff --git a/algo/blast/core/blast_util.c b/algo/blast/core/blast_util.c
index 655d1a66..f149f7db 100644
--- a/algo/blast/core/blast_util.c
+++ b/algo/blast/core/blast_util.c
@@ -1,4 +1,4 @@
-/* $Id: blast_util.c,v 1.100 2005/08/17 16:21:31 dondosha Exp $
+/* $Id: blast_util.c,v 1.105 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: blast_util.c,v 1.100 2005/08/17 16:21:31 dondosha Exp $";
+ "$Id: blast_util.c,v 1.105 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_util.h>
@@ -185,6 +185,8 @@ Int2 BlastProgram2Number(const char *program, EBlastProgramType *number)
*number = eBlastTypeRpsTblastn;
else if (strcasecmp("psiblast", program) == 0)
*number = eBlastTypePsiBlast;
+ else if (strcasecmp("psitblastn", program) == 0)
+ *number = eBlastTypePsiTblastn;
else if (strcasecmp("phiblastn", program) == 0)
*number = eBlastTypePhiBlastn;
else if (strcasecmp("phiblastp", program) == 0)
@@ -200,10 +202,10 @@ Int2 BlastNumber2Program(EBlastProgramType number, char* *program)
return 1;
switch (number) {
- case eBlastTypeBlastn: case eBlastTypePhiBlastn:
+ case eBlastTypeBlastn:
*program = strdup("blastn");
break;
- case eBlastTypeBlastp: case eBlastTypePhiBlastp:
+ case eBlastTypeBlastp:
*program = strdup("blastp");
break;
case eBlastTypeBlastx:
@@ -224,6 +226,15 @@ Int2 BlastNumber2Program(EBlastProgramType number, char* *program)
case eBlastTypePsiBlast:
*program = strdup("psiblast");
break;
+ case eBlastTypePsiTblastn:
+ *program = strdup("psitblastn");
+ break;
+ case eBlastTypePhiBlastp:
+ *program = strdup("phiblastp");
+ break;
+ case eBlastTypePhiBlastn:
+ *program = strdup("phiblastn");
+ break;
default:
*program = strdup("unknown");
break;
@@ -644,7 +655,7 @@ Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length,
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
{
- Int1 frame = 127; /* 127 is used to indicate error */
+ Int1 frame = INT1_MAX; /* INT1_MAX is used to indicate error */
if (prog_number == eBlastTypeBlastn) {
if (context_number % NUM_STRANDS == 0)
@@ -653,8 +664,8 @@ Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
frame = -1;
} else if (prog_number == eBlastTypeBlastp ||
prog_number == eBlastTypeRpsBlast ||
- prog_number == eBlastTypePsiBlast ||
prog_number == eBlastTypeTblastn ||
+ Blast_ProgramIsPsiBlast(prog_number) ||
Blast_ProgramIsPhiBlast(prog_number)) {
/* Query and subject are protein, no frame. */
frame = 0;
@@ -730,6 +741,57 @@ BlastQueryInfo* BlastQueryInfoDup(BlastQueryInfo* query_info)
return retval;
}
+/** Calculates the length of the DNA query from the BlastQueryInfo structure
+ * that contains context information for translated frames for a set of
+ * queries.
+ * @param query_info Query information containing data for all contexts [in]
+ * @param query_index Index of the query whose DNA length is wanted; must be
+ * less than query_info->num_queries [in]
+ * @return DNA length of the query, calculated as the sum of 3 protein frame
+ * lengths, plus 2, because the 2 last nucleotide residues do not have
+ * a corresponding codon.
+ */
+static Int4
+s_GetTranslatedQueryDNALength(const BlastQueryInfo* query_info, Int4 query_index)
+{
+ Int4 start_context = NUM_FRAMES*query_index; /* first context belonging to this query */
+ Int4 dna_length = 2; /* starts at 2 for the trailing residues without a codon */
+ Int4 index;
+
+ /* Make sure that query index is within appropriate range, and that this is
+ really a translated search */
+ ASSERT(query_index < query_info->num_queries);
+ ASSERT(start_context < query_info->last_context);
+
+ /* If only the reverse strand is searched, then the forward strand contexts
+ don't have length information */
+ if (query_info->contexts[start_context].query_length == 0)
+ start_context += 3; /* skip ahead to the next three contexts of this query */
+
+ for (index = start_context; index < start_context + 3; ++index)
+ dna_length += query_info->contexts[index].query_length;
+
+ return dna_length;
+}
+/** Returns the length of one query as recorded in the context table of a
+ * BlastQueryInfo. For translated queries the DNA length computed by
+ * s_GetTranslatedQueryDNALength is returned.
+ * @param qinfo query information [in]
+ * @param program program type; determines how many contexts each query
+ * occupies and which branch below is taken [in]
+ * @param query_index index of the query; must be less than
+ * qinfo->num_queries [in]
+ * @return the length of the requested query
+ */
+Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo* qinfo,
+ EBlastProgramType program,
+ Int4 query_index)
+{
+ const Uint4 kNumContexts = BLAST_GetNumberOfContexts(program); /* contexts per query */
+ ASSERT(query_index < qinfo->num_queries);
+
+ if (Blast_QueryIsTranslated(program)) {
+ return s_GetTranslatedQueryDNALength(qinfo, query_index);
+ } else if (program == eBlastTypeBlastn) {
+ Int4 retval = qinfo->contexts[query_index*kNumContexts].query_length; /* first context of this query */
+ if (retval <= 0) { /* no length recorded there: use the query's second context */
+ retval = qinfo->contexts[query_index*kNumContexts+1].query_length;
+ }
+ return retval;
+ } else { /* all other programs: read the query's first context */
+ return qinfo->contexts[query_index*kNumContexts].query_length;
+ }
+}
+
Int2 BLAST_PackDNA(const Uint1* buffer, Int4 length, EBlastEncoding encoding,
Uint1** packed_seq)
{
@@ -1320,3 +1382,32 @@ char* BLAST_StrToUpper(const char* string)
return retval;
}
+unsigned int
+BLAST_GetNumberOfContexts(EBlastProgramType p)
+{
+ unsigned int retval = 0; /* stays 0 for any program type not listed below */
+ /* Map the program type to its number of contexts: NUM_STRANDS, 1 or
+ * NUM_FRAMES depending on the group the program falls into. */
+ switch (p) {
+ case eBlastTypeBlastn:
+ case eBlastTypePhiBlastn:
+ retval = NUM_STRANDS;
+ break;
+ case eBlastTypeBlastp:
+ case eBlastTypeRpsBlast:
+ case eBlastTypeTblastn:
+ case eBlastTypePsiBlast:
+ case eBlastTypePsiTblastn:
+ case eBlastTypePhiBlastp:
+ retval = 1; /* a single context per query */
+ break;
+ case eBlastTypeBlastx:
+ case eBlastTypeTblastx:
+ case eBlastTypeRpsTblastn:
+ retval = NUM_FRAMES;
+ break;
+ default: /* unsupported program: report 0 contexts */
+ break;
+ }
+
+ return retval;
+}
diff --git a/algo/blast/core/blast_util.h b/algo/blast/core/blast_util.h
index 212b58e0..d35b34f2 100644
--- a/algo/blast/core/blast_util.h
+++ b/algo/blast/core/blast_util.h
@@ -1,4 +1,4 @@
-/* $Id: blast_util.h,v 1.68 2005/08/09 19:25:30 dondosha Exp $
+/* $Id: blast_util.h,v 1.72 2005/11/16 14:31:36 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -194,7 +193,7 @@ Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length,
* @param prog_number Integer corresponding to the BLAST program
* @param context_number Context number
* @return Sequence frame: -1,1 for nucleotides, -3,-2,-1,1,2,3 for
- * translations, 0 for proteins and 127 in case of unsupported program
+ * translations, 0 for proteins and INT1_MAX in case of unsupported program
*/
NCBI_XBLAST_EXPORT
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number);
@@ -215,6 +214,18 @@ BlastQueryInfo* BlastQueryInfoFree(BlastQueryInfo* query_info);
NCBI_XBLAST_EXPORT
BlastQueryInfo* BlastQueryInfoDup(BlastQueryInfo* query_info);
+/** Obtains the sequence length for a given query in the query set, without
+ * taking into consideration any applicable translations
+ * @param qinfo BlastQueryInfo structure [in]
+ * @param program CORE program type [in]
+ * @param query_index number of the query
+ * (query_index < BlastQueryInfo::num_queries) [in]
+ * @return the length of the query sequence requested
+ */
+Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo* qinfo,
+ EBlastProgramType program,
+ Int4 query_index);
+
/** Create auxiliary query structures with all data corresponding
* to a single query sequence within a concatenated set. Allocates the
* structures if the pointers are NULL on input; otherwise only changes the
@@ -400,6 +411,13 @@ Blast_SetUpSubjectTranslation(BLAST_SequenceBlk* subject_blk,
Int4** frame_offsets,
Boolean* partial_translation);
+/** Get the number of contexts for a given program. This corresponds to the
+ * number of translation frames or strands whenever applicable.
+ * @return 0 on unsupported program, non-zero otherwise
+ */
+NCBI_XBLAST_EXPORT
+unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program);
+
#ifdef __cplusplus
}
#endif
diff --git a/algo/blast/core/gapinfo.c b/algo/blast/core/gapinfo.c
index fa5aba13..d7e2b7e6 100644
--- a/algo/blast/core/gapinfo.c
+++ b/algo/blast/core/gapinfo.c
@@ -1,4 +1,4 @@
-/* $Id: gapinfo.c,v 1.16 2005/04/27 19:55:13 dondosha Exp $
+/* $Id: gapinfo.c,v 1.17 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: gapinfo.c,v 1.16 2005/04/27 19:55:13 dondosha Exp $";
+ "$Id: gapinfo.c,v 1.17 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/gapinfo.h>
diff --git a/algo/blast/core/gapinfo.h b/algo/blast/core/gapinfo.h
index 3360c26d..48f3374f 100644
--- a/algo/blast/core/gapinfo.h
+++ b/algo/blast/core/gapinfo.h
@@ -1,4 +1,4 @@
-/* $Id: gapinfo.h,v 1.20 2005/04/27 19:49:49 dondosha Exp $
+/* $Id: gapinfo.h,v 1.21 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/greedy_align.c b/algo/blast/core/greedy_align.c
index cd53764b..1a2bfec1 100644
--- a/algo/blast/core/greedy_align.c
+++ b/algo/blast/core/greedy_align.c
@@ -1,4 +1,4 @@
-/* $Id: greedy_align.c,v 1.35 2005/04/07 20:09:28 madden Exp $
+/* $Id: greedy_align.c,v 1.38 2005/11/18 14:43:58 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -36,7 +36,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: greedy_align.c,v 1.35 2005/04/07 20:09:28 madden Exp $";
+ "$Id: greedy_align.c,v 1.38 2005/11/18 14:43:58 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/greedy_align.h>
@@ -308,7 +308,8 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1,
very similar, the average running time will be sig-
nificantly better than this */
- max_dist = len2 / GREEDY_MAX_COST_FRACTION + 1;
+ max_dist = MIN(GREEDY_MAX_COST,
+ len2 / GREEDY_MAX_COST_FRACTION + 1);
/* the main loop assumes that the index of all diagonals is
biased to lie in the middle of allocated bookkeeping
@@ -382,7 +383,7 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1,
if (index == len1 || index == len2) {
if (edit_block != NULL)
GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
- return best_dist;
+ return 0; /* This function returns number of differences, here it is zero. */
}
/* set up the memory pool */
@@ -549,7 +550,9 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1,
}
/* clamp the bounds on diagonals to avoid walking off
- either sequence */
+ either sequence. Because the bounds increase by at
+ most one for each distance, diag_lower and diag_upper
+ can each be of size at most max_diags+2 */
if (seq2_index == len2) {
diag_lower = k + 1;
@@ -586,7 +589,10 @@ Int4 BLAST_GreedyAlign(const Uint1* seq1, Int4 len1,
if (diag_lower > diag_upper)
break;
- /* set up for the next distance to examine */
+ /* set up for the next distance to examine. Because the
+ bounds increase by at most one for each distance,
+ diag_lower and diag_upper can each be of size at
+ most max_diags+2 */
if (!end2_reached)
diag_lower--;
@@ -776,7 +782,8 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1,
/* set the number of distinct distances the algorithm will
examine in the search for an optimal alignment */
- max_dist = len2 / GREEDY_MAX_COST_FRACTION + 1;
+ max_dist = MIN(GREEDY_MAX_COST,
+ len2 / GREEDY_MAX_COST_FRACTION + 1);
scaled_max_dist = max_dist * gap_extend;
/* the main loop assumes that the index of all diagonals is
@@ -853,7 +860,7 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1,
if (index == len1 || index == len2) {
if (edit_block != NULL)
GapPrelimEditBlockAdd(edit_block, eGapAlignSub, index);
- return best_dist;
+ return (index*match_score);
}
/* set up the memory pool */
@@ -1147,7 +1154,9 @@ Int4 BLAST_AffineGreedyAlign (const Uint1* seq1, Int4 len1,
/* compute the range of diagonals to test for the next
value of d. These must be conservative, in that any
- diagonal that could possibly contribute must be allowed */
+ diagonal that could possibly contribute must be allowed.
+ curr_diag_lower and curr_diag_upper can each be of size at
+ most scaled_max_diags+2 */
d++;
curr_diag_lower = MIN(diag_lower[d - gap_open_extend],
diff --git a/algo/blast/core/greedy_align.h b/algo/blast/core/greedy_align.h
index bafb5e7a..7ed78bbc 100644
--- a/algo/blast/core/greedy_align.h
+++ b/algo/blast/core/greedy_align.h
@@ -1,4 +1,4 @@
-/* $Id: greedy_align.h,v 1.21 2005/04/07 20:09:54 madden Exp $
+/* $Id: greedy_align.h,v 1.23 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -46,6 +46,9 @@ extern "C" {
this gives a worst case bound on the number of loop iterations */
#define GREEDY_MAX_COST_FRACTION 2
+/** The largest distance to be examined for an optimal alignment */
+#define GREEDY_MAX_COST 1000
+
/* ----- pool allocator ----- */
/** Bookkeeping structure for greedy alignment. When aligning
diff --git a/algo/blast/core/hspstream_collector.c b/algo/blast/core/hspstream_collector.c
index ec1d1c23..2565f086 100644
--- a/algo/blast/core/hspstream_collector.c
+++ b/algo/blast/core/hspstream_collector.c
@@ -1,4 +1,4 @@
-/* $Id: hspstream_collector.c,v 1.14 2005/05/16 12:21:40 madden Exp $
+/* $Id: hspstream_collector.c,v 1.15 2005/09/30 12:17:10 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: hspstream_collector.c,v 1.14 2005/05/16 12:21:40 madden Exp $";
+ "$Id: hspstream_collector.c,v 1.15 2005/09/30 12:17:10 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
@@ -154,6 +154,7 @@ static int
s_BlastHSPListCollectorWrite(BlastHSPStream* hsp_stream,
BlastHSPList** hsp_list)
{
+ Int2 status = 0;
BlastHSPListCollectorData* stream_data =
(BlastHSPListCollectorData*) GetData(hsp_stream);
@@ -166,18 +167,24 @@ s_BlastHSPListCollectorWrite(BlastHSPStream* hsp_stream,
* every read after a write.
*/
if (stream_data->results_sorted) {
+ MT_LOCK_Do(stream_data->x_lock, eMT_Unlock);
return kBlastHSPStream_Error;
}
/* For RPS BLAST saving procedure is different, because HSPs from different
subjects are bundled in one HSP list */
if (Blast_ProgramIsRpsBlast(stream_data->program)) {
- Blast_HSPResultsSaveRPSHSPList(stream_data->program,
+ status = Blast_HSPResultsSaveRPSHSPList(stream_data->program,
stream_data->results, *hsp_list, stream_data->blasthit_params);
} else {
- Blast_HSPResultsSaveHSPList(stream_data->program, stream_data->results,
+ status = Blast_HSPResultsSaveHSPList(stream_data->program, stream_data->results,
*hsp_list, stream_data->blasthit_params);
}
+ if (status != 0)
+ {
+ MT_LOCK_Do(stream_data->x_lock, eMT_Unlock);
+ return kBlastHSPStream_Error;
+ }
/* Results structure is no longer sorted, even if it was before.
The following assignment is only necessary if the logic to prohibit
writing after the first read is removed. */
diff --git a/algo/blast/core/link_hsps.c b/algo/blast/core/link_hsps.c
index b90ee1cf..c2c4f56a 100644
--- a/algo/blast/core/link_hsps.c
+++ b/algo/blast/core/link_hsps.c
@@ -1,5 +1,5 @@
-/* $Id: link_hsps.c,v 1.56 2005/06/08 17:27:53 madden Exp $
+/* $Id: link_hsps.c,v 1.58 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -7,7 +7,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: link_hsps.c,v 1.56 2005/06/08 17:27:53 madden Exp $";
+ "$Id: link_hsps.c,v 1.58 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/link_hsps.h>
@@ -1805,6 +1805,7 @@ BLAST_LinkHsps(EBlastProgramType program_number, BlastHSPList* hsp_list,
/* The HSP's may be in a different order than they were before,
but hsp contains the first one. */
} else {
+ Blast_HSPListAdjustOddBlastnScores(hsp_list, gapped_calculation, sbp);
/* Calculate individual HSP e-values first - they'll be needed to
compare with sum e-values. Use decay rate to compensate for
multiple tests. */
diff --git a/algo/blast/core/link_hsps.h b/algo/blast/core/link_hsps.h
index dba93fda..55a60f86 100644
--- a/algo/blast/core/link_hsps.h
+++ b/algo/blast/core/link_hsps.h
@@ -1,4 +1,4 @@
-/* $Id: link_hsps.h,v 1.14 2004/09/23 14:57:46 dondosha Exp $
+/* $Id: link_hsps.h,v 1.15 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/lookup_util.c b/algo/blast/core/lookup_util.c
index 9d135cd9..1c8b15b1 100644
--- a/algo/blast/core/lookup_util.c
+++ b/algo/blast/core/lookup_util.c
@@ -1,4 +1,4 @@
-/* $Id: lookup_util.c,v 1.11 2005/03/01 14:00:56 coulouri Exp $
+/* $Id: lookup_util.c,v 1.12 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -30,7 +30,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: lookup_util.c,v 1.11 2005/03/01 14:00:56 coulouri Exp $";
+ "$Id: lookup_util.c,v 1.12 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/lookup_util.h>
diff --git a/algo/blast/core/lookup_util.h b/algo/blast/core/lookup_util.h
index 060d8af7..988dc42b 100644
--- a/algo/blast/core/lookup_util.h
+++ b/algo/blast/core/lookup_util.h
@@ -1,4 +1,4 @@
-/* $Id: lookup_util.h,v 1.10 2005/03/01 13:59:51 coulouri Exp $
+/* $Id: lookup_util.h,v 1.11 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/lookup_wrap.c b/algo/blast/core/lookup_wrap.c
index 27ce7e2e..7c7dfe51 100644
--- a/algo/blast/core/lookup_wrap.c
+++ b/algo/blast/core/lookup_wrap.c
@@ -1,4 +1,4 @@
-/* $Id: lookup_wrap.c,v 1.16 2005/07/29 15:21:15 camacho Exp $
+/* $Id: lookup_wrap.c,v 1.17 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -37,7 +37,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: lookup_wrap.c,v 1.16 2005/07/29 15:21:15 camacho Exp $";
+ "$Id: lookup_wrap.c,v 1.17 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/lookup_wrap.h>
diff --git a/algo/blast/core/lookup_wrap.h b/algo/blast/core/lookup_wrap.h
index 0c9f9e67..d7dc376f 100644
--- a/algo/blast/core/lookup_wrap.h
+++ b/algo/blast/core/lookup_wrap.h
@@ -1,4 +1,4 @@
-/* $Id: lookup_wrap.h,v 1.11 2005/07/29 15:21:08 camacho Exp $
+/* $Id: lookup_wrap.h,v 1.12 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/mb_lookup.c b/algo/blast/core/mb_lookup.c
index 739d6184..6d372e93 100644
--- a/algo/blast/core/mb_lookup.c
+++ b/algo/blast/core/mb_lookup.c
@@ -1,4 +1,4 @@
-/* $Id: mb_lookup.c,v 1.56 2005/08/17 16:21:31 dondosha Exp $
+/* $Id: mb_lookup.c,v 1.57 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: mb_lookup.c,v 1.56 2005/08/17 16:21:31 dondosha Exp $";
+ "$Id: mb_lookup.c,v 1.57 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_options.h>
diff --git a/algo/blast/core/mb_lookup.h b/algo/blast/core/mb_lookup.h
index 3478182e..c87d2801 100644
--- a/algo/blast/core/mb_lookup.h
+++ b/algo/blast/core/mb_lookup.h
@@ -1,4 +1,4 @@
-/* $Id: mb_lookup.h,v 1.25 2005/06/06 15:37:02 papadopo Exp $
+/* $Id: mb_lookup.h,v 1.26 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/ncbi_std.c b/algo/blast/core/ncbi_std.c
index eddf7c97..581f1970 100644
--- a/algo/blast/core/ncbi_std.c
+++ b/algo/blast/core/ncbi_std.c
@@ -1,4 +1,4 @@
-/* $Id: ncbi_std.c,v 1.16 2005/02/24 15:39:34 madden Exp $
+/* $Id: ncbi_std.c,v 1.17 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -31,7 +31,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: ncbi_std.c,v 1.16 2005/02/24 15:39:34 madden Exp $";
+ "$Id: ncbi_std.c,v 1.17 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_def.h> /* for sfree() macro */
diff --git a/algo/blast/core/ncbi_std.h b/algo/blast/core/ncbi_std.h
index c2aff8ff..e7e022a6 100644
--- a/algo/blast/core/ncbi_std.h
+++ b/algo/blast/core/ncbi_std.h
@@ -1,4 +1,4 @@
-/* $Id: ncbi_std.h,v 1.34 2004/12/14 17:11:24 ucko Exp $
+/* $Id: ncbi_std.h,v 1.37 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -23,7 +23,6 @@
*
* ===========================================================================
*
- * Author: Ilya Dondoshansky
*
*/
@@ -127,26 +126,55 @@ typedef Uint1 Boolean;
#ifndef _NCBISTD_ /* if we're not in the C toolkit ... */
+#ifndef UINT4_MAX
/** largest number represented by unsigned int. */
#define UINT4_MAX 4294967295U
+#endif
+
+#ifndef INT4_MAX
/** largest number represented by signed int */
#define INT4_MAX 2147483647
+#endif
+
+#ifndef INT4_MIN
/** Smallest (most negative) number represented by signed int */
#define INT4_MIN (-2147483647-1)
+#endif
+
+#ifndef NCBIMATH_LN2
/** natural log of 2. */
#define NCBIMATH_LN2 0.69314718055994530941723212145818
+#endif
+
+#ifndef INT2_MAX
/** largest number represented by signed (two byte) short */
#define INT2_MAX 32767
+#endif
+
+#ifndef INT2_MIN
/** smallest (most negative) number represented by signed (two byte) short */
#define INT2_MIN (-32768)
+#endif
+
+#ifndef INT1_MAX
+/** largest number represented by signed (one byte) char */
+#define INT1_MAX 127
+#endif
+
+#ifndef INT1_MIN
+/** smallest (most negative) number represented by signed (one byte) char */
+#define INT1_MIN (-128)
+#endif
#ifndef DIM
/** dimension of an array. */
#define DIM(A) (sizeof(A)/sizeof((A)[0]))
#endif
+#ifndef NULLB
/** terminating byte of a char* string. */
#define NULLB '\0'
+#endif
#endif /* _NCBISTD_ */
diff --git a/algo/blast/core/pattern.c b/algo/blast/core/pattern.c
index b970d61e..44665e85 100644
--- a/algo/blast/core/pattern.c
+++ b/algo/blast/core/pattern.c
@@ -1,4 +1,4 @@
-/* $Id: pattern.c,v 1.17 2005/07/18 19:38:33 bealer Exp $
+/* $Id: pattern.c,v 1.18 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -56,7 +56,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: pattern.c,v 1.17 2005/07/18 19:38:33 bealer Exp $";
+ "$Id: pattern.c,v 1.18 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/pattern.h>
diff --git a/algo/blast/core/pattern.h b/algo/blast/core/pattern.h
index 864cadf7..a7eb8cae 100644
--- a/algo/blast/core/pattern.h
+++ b/algo/blast/core/pattern.h
@@ -1,4 +1,4 @@
-/* $Id: pattern.h,v 1.7 2005/07/18 19:38:33 bealer Exp $
+/* $Id: pattern.h,v 1.8 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/pattern_priv.h b/algo/blast/core/pattern_priv.h
index 541bf734..1a79bc49 100644
--- a/algo/blast/core/pattern_priv.h
+++ b/algo/blast/core/pattern_priv.h
@@ -1,4 +1,4 @@
-/* $Id: pattern_priv.h,v 1.3 2005/07/18 19:38:33 bealer Exp $
+/* $Id: pattern_priv.h,v 1.4 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/phi_extend.c b/algo/blast/core/phi_extend.c
index 149c77b9..de73206d 100644
--- a/algo/blast/core/phi_extend.c
+++ b/algo/blast/core/phi_extend.c
@@ -1,4 +1,4 @@
-/* $Id: phi_extend.c,v 1.12 2005/04/27 19:56:13 dondosha Exp $
+/* $Id: phi_extend.c,v 1.13 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -33,7 +33,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: phi_extend.c,v 1.12 2005/04/27 19:56:13 dondosha Exp $";
+ "$Id: phi_extend.c,v 1.13 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_def.h>
diff --git a/algo/blast/core/phi_extend.h b/algo/blast/core/phi_extend.h
index 74d39ca6..33bbcfd0 100644
--- a/algo/blast/core/phi_extend.h
+++ b/algo/blast/core/phi_extend.h
@@ -1,4 +1,4 @@
-/* $Id: phi_extend.h,v 1.10 2005/05/18 15:27:19 papadopo Exp $
+/* $Id: phi_extend.h,v 1.11 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
diff --git a/algo/blast/core/phi_gapalign.c b/algo/blast/core/phi_gapalign.c
index 02d5668e..7750f73e 100644
--- a/algo/blast/core/phi_gapalign.c
+++ b/algo/blast/core/phi_gapalign.c
@@ -1,4 +1,4 @@
-/* $Id: phi_gapalign.c,v 1.7 2005/08/17 16:21:31 dondosha Exp $
+/* $Id: phi_gapalign.c,v 1.9 2005/11/30 18:25:32 papadopo Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -58,13 +58,14 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: phi_gapalign.c,v 1.7 2005/08/17 16:21:31 dondosha Exp $";
+ "$Id: phi_gapalign.c,v 1.9 2005/11/30 18:25:32 papadopo Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_options.h>
#include <algo/blast/core/blast_def.h>
#include <algo/blast/core/phi_gapalign.h>
#include <algo/blast/core/blast_encoding.h>
+#include <algo/blast/core/blast_gapalign.h>
#include "blast_gapalign_priv.h"
#include "pattern_priv.h"
@@ -101,7 +102,7 @@ s_Align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2, Int4 lowDiag,
inclusive*/
Int4 diagIndex; /*loop index over diagonals*/
Int4 leftd, rightd; /* diagonal indices for CC, DD, CP and DP */
- BlastGapSmallDP* score_array; /*array for dynamic program information*/
+ BlastGapDP* score_array; /*array for dynamic program information*/
Int4 curd; /* current index for CC, DD CP and DP */
Int4 i; /*loop index*/
Int4 index1; /*index on seq1*/
@@ -109,7 +110,7 @@ s_Align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2, Int4 lowDiag,
Int4 temp_indel_score = 0; /*placeholder for an indel score */
Int4 tempHorScore; /*dual of temp_indel_score for the case where a
horizontal edge (insertion) is the last step*/
- BlastGapSmallDP* score_row = NULL; /*points to a row of CD*/
+ BlastGapDP* score_row = NULL; /*points to a row of CD*/
Int4 stateDecoder; /*used to decode the edge information in a state*/
Int4 initialScore; /*score to initialize dynamic program entries*/
Int4 *matrixRow; /*row of score matrix*/
@@ -129,7 +130,7 @@ s_Align(Uint1 * seq1, Uint1 * seq2, Int4 end1, Int4 end2, Int4 lowDiag,
band = highDiag-lowDiag+1;
/* Allocate array of scores. */
- score_array = (BlastGapSmallDP*) calloc(band+2, sizeof(BlastGapSmallDP));
+ score_array = (BlastGapDP*) calloc(band+2, sizeof(BlastGapDP));
state = (Int1 **) malloc(sizeof(Int1 *)*(end1+1));
state[0] = (Int1 *) malloc((end1+1)*(band+2));
diff --git a/algo/blast/core/phi_lookup.c b/algo/blast/core/phi_lookup.c
index 818807ce..af40b90f 100644
--- a/algo/blast/core/phi_lookup.c
+++ b/algo/blast/core/phi_lookup.c
@@ -1,4 +1,4 @@
-/* $Id: phi_lookup.c,v 1.29 2005/08/23 20:26:58 camacho Exp $
+/* $Id: phi_lookup.c,v 1.30 2005/11/16 14:27:04 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
@@ -34,7 +34,7 @@
#ifndef SKIP_DOXYGEN_PROCESSING
static char const rcsid[] =
- "$Id: phi_lookup.c,v 1.29 2005/08/23 20:26:58 camacho Exp $";
+ "$Id: phi_lookup.c,v 1.30 2005/11/16 14:27:04 madden Exp $";
#endif /* SKIP_DOXYGEN_PROCESSING */
#include <algo/blast/core/blast_def.h>
diff --git a/algo/blast/core/phi_lookup.h b/algo/blast/core/phi_lookup.h
index 112f2306..58ccf747 100644
--- a/algo/blast/core/phi_lookup.h
+++ b/algo/blast/core/phi_lookup.h
@@ -1,4 +1,4 @@
-/* $Id: phi_lookup.h,v 1.9 2005/04/27 19:50:47 dondosha Exp $
+/* $Id: phi_lookup.h,v 1.10 2005/11/16 14:31:37 madden Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
@@ -6,7 +6,7 @@
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
- * the author's offical duties as a United States Government employee and
+ * the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.