diff options
author | Aaron M. Ucko <ucko@debian.org> | 2007-09-05 15:33:43 +0000 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2007-09-05 15:33:43 +0000 |
commit | 7647e504b18f91edcedba85e7a6ef772b2a0f48b (patch) | |
tree | 6152f91efe8f30174ce9d51525a458b85335f12b /algo/blast/api | |
parent | b0629a94e882461a9d6cab18807c5f96501cf38f (diff) |
[svn-upgrade] Integrating new upstream version, ncbi-tools6 (6.1.20070822)
Diffstat (limited to 'algo/blast/api')
-rw-r--r-- | algo/blast/api/blast_api.c | 151 | ||||
-rw-r--r-- | algo/blast/api/blast_api.h | 26 | ||||
-rw-r--r-- | algo/blast/api/blast_format.c | 16 | ||||
-rw-r--r-- | algo/blast/api/blast_input.c | 517 | ||||
-rw-r--r-- | algo/blast/api/blast_input.h | 62 | ||||
-rw-r--r-- | algo/blast/api/blast_options_api.c | 28 | ||||
-rw-r--r-- | algo/blast/api/blast_options_api.h | 4 | ||||
-rw-r--r-- | algo/blast/api/blast_returns.c | 26 | ||||
-rw-r--r-- | algo/blast/api/blast_returns.h | 4 | ||||
-rw-r--r-- | algo/blast/api/blast_seq.c | 23 | ||||
-rw-r--r-- | algo/blast/api/blast_seq.h | 3 | ||||
-rw-r--r-- | algo/blast/api/blast_seqalign.c | 14 | ||||
-rw-r--r-- | algo/blast/api/blast_tabular.c | 19 | ||||
-rw-r--r-- | algo/blast/api/dust_filter.c | 87 | ||||
-rw-r--r-- | algo/blast/api/hspstream_queue.c | 8 | ||||
-rw-r--r-- | algo/blast/api/repeats_filter.c | 10 | ||||
-rw-r--r-- | algo/blast/api/seqsrc_multiseq.c | 36 | ||||
-rw-r--r-- | algo/blast/api/seqsrc_multiseq.h | 4 | ||||
-rw-r--r-- | algo/blast/api/seqsrc_readdb.c | 52 | ||||
-rw-r--r-- | algo/blast/api/twoseq_api.c | 18 |
20 files changed, 975 insertions, 133 deletions
diff --git a/algo/blast/api/blast_api.c b/algo/blast/api/blast_api.c index 67d64e4d..a088090e 100644 --- a/algo/blast/api/blast_api.c +++ b/algo/blast/api/blast_api.c @@ -1,4 +1,4 @@ -/* $Id: blast_api.c,v 1.41 2006/09/15 13:12:43 madden Exp $ +/* $Id: blast_api.c,v 1.50 2007/07/27 18:02:26 papadopo Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -44,6 +44,7 @@ #include <algo/blast/core/blast_traceback.h> #include <algo/blast/core/hspstream_collector.h> #include <algo/blast/core/phi_lookup.h> +#include <algo/blast/core/blast_psi.h> #include <algo/blast/api/hspstream_queue.h> #include <algo/blast/api/blast_mtlock.h> #include <algo/blast/api/blast_prelim.h> @@ -53,6 +54,7 @@ #include <algo/blast/api/blast_seqalign.h> #include <algo/blast/api/dust_filter.h> #include <algo/blast/api/blast_message_api.h> +#include <algo/blast/core/gencode_singleton.h> /** @addtogroup CToolkitAlgoBlast * @@ -273,10 +275,6 @@ s_BlastHSPStreamSetUp(BLAST_SequenceBlk* query, BlastQueryInfo* query_info, if (!tf_data) { const Int4 kNumResults = query_info->num_queries; - /* Results in the collector stream should be sorted only for a - database search. The latter is true if and only if the sequence - source has non-zero database length. */ - const Boolean kSortOnRead = (BlastSeqSrcGetTotLen(seq_src) != 0); SBlastHitsParameters* blasthit_params=NULL; MT_LOCK lock = NULL; if (options->num_cpus > 1) @@ -286,7 +284,7 @@ s_BlastHSPStreamSetUp(BLAST_SequenceBlk* query, BlastQueryInfo* query_info, options->score_options, &blasthit_params); *hsp_stream = Blast_HSPListCollectorInitMT(options->program, blasthit_params, - kNumResults, kSortOnRead, lock); + kNumResults, lock); } else { /* Initialize the queue HSP stream for tabular formatting. */ *hsp_stream = Blast_HSPListQueueInit(); @@ -335,6 +333,8 @@ s_BlastThreadManager(BLAST_SequenceBlk* query, BlastQueryInfo* query_info, ASSERT(query && query_info && seq_src && lookup_wrap && sbp && hsp_stream && extra_returns); + BlastSeqSrcResetChunkIterator((BlastSeqSrc*) seq_src); + /* Start the formatting thread */ if(tf_data && NlmThreadsAvailable() && (format_thread = @@ -374,7 +374,8 @@ s_BlastThreadManager(BLAST_SequenceBlk* query, BlastQueryInfo* query_info, SPHIPatternSearchBlk* pattern_blk = NULL; if (Blast_ProgramIsPhiBlast(kProgram)) { pattern_blk = (SPHIPatternSearchBlk*) lookup_wrap->lut; - pattern_blk->num_patterns_db = diagnostics->ungapped_stat->lookup_hits; + pattern_blk->num_patterns_db = + (Int4)diagnostics->ungapped_stat->lookup_hits; } if ((status = Blast_RunTracebackSearch(kProgram, query, @@ -500,15 +501,79 @@ s_BlastFindMatrixPath(const char* matrix_name, Boolean is_prot) return NULL; } -Int2 -Blast_RunSearch(SeqLoc* query_seqloc, - const BlastSeqSrc* seq_src, - SeqLoc* masking_locs, - const SBlastOptions* options, - BlastTabularFormatData* tf_data, - BlastHSPResults **results, - SeqLoc** filter_out, - Blast_SummaryReturn* extra_returns) + +/** + * Read a checkpoint file and set the necessary structures in a + * BlastScoreBlk: the psi_matrix, kbp_psi[0], and kbp_gap_psi[0]. + * + * @param sbp a BlastScoreBlk to receive a PSSM [in/out] + * @param query query sequence data + * @param psi_matrix_file checkpoint file to read + * @pcore_msg a pointer to receive error and warning messages + */ +static int +s_SetupScoreBlkPssmFromChkpt(BlastScoreBlk * sbp, + BLAST_SequenceBlk * query, + Blast_PsiCheckpointLoc * psi_checkpoint, + Blast_Message* *pcore_msg) +{ + int status = 0; + /* An intermediate representation of the PSSM data that is used + in PSIBlast routines */ + PSIMatrix * pssm = NULL; + /* The actual PSSM that is saved in the BlastScoreBlk */ + SPsiBlastScoreMatrix * psi_matrix = NULL; + size_t i, j; + + psi_matrix = SPsiBlastScoreMatrixNew(query->length); + if (!psi_matrix) { + ErrPostEx(SEV_FATAL, 1, 0, + "Out-of-memory: cannot allocate a PSSM of length %d.\n", + query->length); + status = -1; + goto error_return; + } + status = Blast_PosReadCheckpoint(psi_matrix->freq_ratios, + query->length, query->sequence, + psi_checkpoint, + pcore_msg); + if (status != 0) { + goto error_return; + } + Blast_KarlinBlkCopy(psi_matrix->kbp, sbp->kbp_gap_std[0]); + status = PSICreatePssmFromFrequencyRatios(query->sequence, + query->length, sbp, + psi_matrix->freq_ratios, + kPSSM_NoImpalaScaling, + &pssm); + if (0 != status) { + goto error_return; + } + for (i = 0; i < psi_matrix->pssm->ncols; i++) { + for (j = 0; j < psi_matrix->pssm->nrows; j++) { + psi_matrix->pssm->data[i][j] = pssm->pssm[i][j]; + } + } + PSIMatrixFree(pssm); + sbp->psi_matrix = psi_matrix; + return 0; +error_return: + if (psi_matrix) + SPsiBlastScoreMatrixFree(psi_matrix); + return status; +} + + +Int2 +Blast_RunSearch(SeqLoc* query_seqloc, + Blast_PsiCheckpointLoc * psi_checkpoint, + const BlastSeqSrc* seq_src, + SeqLoc* masking_locs, + const SBlastOptions* options, + BlastTabularFormatData* tf_data, + BlastHSPResults **results, + SeqLoc** filter_out, + Blast_SummaryReturn* extra_returns) { Int2 status = 0; BLAST_SequenceBlk *query = NULL; @@ -600,6 +665,20 @@ Blast_RunSearch(SeqLoc* query_seqloc, if (status) return status; + if (psi_checkpoint) { + core_msg = NULL; + status = s_SetupScoreBlkPssmFromChkpt(sbp, query, psi_checkpoint, + &core_msg); + if (core_msg) { + extra_returns->error = + Blast_MessageToSBlastMessage(core_msg, query_seqloc, + query_info, + options->believe_query); + core_msg = Blast_MessageFree(core_msg); + } + if (status) + return status; + } if (filter_out) { *filter_out = BlastMaskLocToSeqLoc(kProgram, mask_loc, query_seqloc); @@ -687,8 +766,10 @@ Blast_RunSearch(SeqLoc* query_seqloc, return status; } -Int2 -Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, +Int2 +Blast_DatabaseSearch(SeqLoc* query_seqloc, + Blast_PsiCheckpointLoc * psi_checkpoint, + char* db_name, SeqLoc* masking_locs, const SBlastOptions* options, BlastTabularFormatData* tf_data, @@ -733,9 +814,10 @@ Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, if (extra_returns->error) return -1; - status = - Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data, - &results, filter_out, extra_returns); + status = + Blast_RunSearch(query_seqloc, psi_checkpoint, seq_src, + masking_locs, options, tf_data, &results, + filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ @@ -852,8 +934,10 @@ PHIBlastRunSearch(SeqLoc* query_seqloc, char* db_name, SeqLoc* masking_locs, /* Masking at hash and on-the-fly tabular output are not applicable for PHI BLAST, so pass NULL in corresponding arguments. */ status = - Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, NULL, - &results, filter_out, extra_returns); + Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL, + seq_src, masking_locs, options, + (BlastTabularFormatData*) NULL, &results, + filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ @@ -909,9 +993,10 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, if (extra_returns->error) return -1; - status = - Blast_RunSearch(query_seqloc, seq_src, masking_locs, options, tf_data, - &results, filter_out, extra_returns); + status = + Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL, + seq_src, masking_locs, options, tf_data, + &results, filter_out, extra_returns); /* The ReadDBFILE structure will not be destroyed here, because the initialising function used readdb_attach */ @@ -932,6 +1017,20 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, return status; } +void GeneticCodeSingletonInit() +{ + Uint1* gc = NULL; + GenCodeSingletonInit(); + BLAST_GeneticCodeFind(BLAST_GENETIC_CODE, &gc); + GenCodeSingletonAdd(BLAST_GENETIC_CODE, gc); + free(gc); +} + +void GeneticCodeSingletonFini() +{ + GenCodeSingletonFini(); +} + /* @} */ diff --git a/algo/blast/api/blast_api.h b/algo/blast/api/blast_api.h index 25c76fb1..2957ec9c 100644 --- a/algo/blast/api/blast_api.h +++ b/algo/blast/api/blast_api.h @@ -1,4 +1,4 @@ -/* $Id: blast_api.h,v 1.7 2006/01/13 15:58:31 madden Exp $ +/* $Id: blast_api.h,v 1.10 2007/03/20 14:56:21 camacho Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -47,15 +47,27 @@ extern "C" { #include <algo/blast/api/blast_tabular.h> #include <algo/blast/api/blast_options_api.h> #include <algo/blast/api/blast_seqalign.h> +#include <algo/blast/api/blast_input.h> /** @addtogroup CToolkitAlgoBlast * * @{ */ +/** Initialize the genetic code singleton. + * This function must be called before running any BLAST search with the new + * engine */ +void GeneticCodeSingletonInit(); + +/** Uinitialize the genetic code singleton. + * This function must be called after running any BLAST search with the new + * engine */ +void GeneticCodeSingletonFini(); + /** Compares a list of SeqLoc's against a BLAST database using the * BLAST algorithm. * @param query_seqloc List of query Seq-loc's [in] + * @param psi_matrix_file a checkpoint file for PSSM searches [in] * @param db_name Name of a BLAST database to search [in] * @param masking_locs Locations in the queries that should be masked [in] * @param options Search options [in] @@ -64,8 +76,10 @@ extern "C" { * @param filter_out Filtering locations [out] * @param extra_returns Additional information about the search [out] */ -Int2 -Blast_DatabaseSearch(SeqLoc* query_seqloc, char* db_name, +Int2 +Blast_DatabaseSearch(SeqLoc* query_seqloc, + Blast_PsiCheckpointLoc * psi_checkpoint, + char* db_name, SeqLoc* masking_locs, const SBlastOptions* options, BlastTabularFormatData* tf_data, @@ -96,6 +110,7 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, /** Compare a list of query SeqLoc's against a source of subject sequences. * @param query_seqloc List of query sequences locations [in] + * @param psi_matrix_file a checkpoint file for PSSM searches [in] * @param seq_src Source of subject sequences [in] * @param masking_locs Locations where query sequences should be masked. [in] * @param options Search options [in] @@ -106,8 +121,9 @@ Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc, * @param extra_returns Additional search statistits [out] * @return 0 on success, -1 on failure. */ -Int2 -Blast_RunSearch(SeqLoc* query_seqloc, +Int2 +Blast_RunSearch(SeqLoc* query_seqloc, + Blast_PsiCheckpointLoc * psi_checkpoint, const BlastSeqSrc* seq_src, SeqLoc* masking_locs, const SBlastOptions* options, diff --git a/algo/blast/api/blast_format.c b/algo/blast/api/blast_format.c index 3d4ff10c..20b1f643 100644 --- a/algo/blast/api/blast_format.c +++ b/algo/blast/api/blast_format.c @@ -1,4 +1,4 @@ -/* $Id: blast_format.c,v 1.112 2006/10/06 12:22:28 madden Exp $ +/* $Id: blast_format.c,v 1.114 2007/01/19 14:32:31 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,7 +32,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_format.c,v 1.112 2006/10/06 12:22:28 madden Exp $"; +static char const rcsid[] = "$Id: blast_format.c,v 1.114 2007/01/19 14:32:31 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/blast_format.h> @@ -483,7 +483,10 @@ Int2 BlastFormattingInfoNew(EAlignView align_view, if (align_view != eAlignViewXml && align_view < eAlignViewAsnText) { FILE* outfp; if ((outfp = FileOpen(outfile_name, "w")) == NULL) - return -1; + { + ErrPostEx(SEV_WARNING, 0, 0, "Unable to open output file %s:", outfile_name); + return -1; + } info->outfp = outfp; } else { char write_mode[3]; @@ -497,7 +500,10 @@ Int2 BlastFormattingInfoNew(EAlignView align_view, strcpy(write_mode, "wb"); if ((info->aip = AsnIoOpen(filename_copy, write_mode)) == NULL) - return -1; + { + ErrPostEx(SEV_WARNING, 0, 0, "Unable to open output file %s:", filename_copy); + return -1; + } sfree(filename_copy); } info->is_seqalign_null = TRUE; /* will be updated in BLAST_FormatResults */ @@ -597,7 +603,7 @@ GetOldAlignType(EBlastProgramType prog, Boolean* db_is_na) align_type = 3; *db_is_na = FALSE; break; - case eBlastTypeTblastn: + case eBlastTypePsiTblastn: case eBlastTypeTblastn: align_type = 4; *db_is_na = TRUE; break; diff --git a/algo/blast/api/blast_input.c b/algo/blast/api/blast_input.c index 6c9e50f1..364d1dfc 100644 --- a/algo/blast/api/blast_input.c +++ b/algo/blast/api/blast_input.c @@ -1,5 +1,5 @@ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_input.c,v 1.25 2006/05/24 21:17:50 camacho Exp $"; +static char const rcsid[] = "$Id: blast_input.c,v 1.30 2007/05/07 13:27:15 kans Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* =========================================================================== * @@ -37,6 +37,9 @@ static char const rcsid[] = "$Id: blast_input.c,v 1.25 2006/05/24 21:17:50 camac #include <algo/blast/api/blast_seq.h> #include <algo/blast/core/blast_filter.h> #include <algo/blast/core/blast_setup.h> +#include <algo/blast/core/blast_posit.h> +#include <objscoremat.h> +#include <asn.h> /** @addtogroup CToolkitAlgoBlast * @@ -178,5 +181,517 @@ BLAST_GetQuerySeqLoc(FILE *infp, Boolean query_is_na, Uint1 strand, return total_length; } + + +/** + * Read the query data stored in a PSI-BLAST checkpoint file. + * + * @param *pold_query_length length of the old query, as stored in the + * checkpoint file + * @param old_query a buffer of length at least query_length + * to hold the query data from the checkpoint + * file + * @param query_length the expected query length, and the length + * of the old_query buffer + * @param file the checkpoint file + * @return the number of query characters read, + * <= the value of the query_length parameter. + */ +static int +s_GetOldQueryFromCheckpoint(Int4 * pold_query_length, Uint1 * old_query, + int query_length, FILE * file) +{ + int i; + int num_read; /* number of query characters read from disk */ + + ASSERT(query_length > 0); + if (1 != fread(pold_query_length, sizeof(Int4), 1, file)) { + *pold_query_length = 0; + return 0; + } + /* Read at most query_length bytes, no matter what the value of + * *pold_query_length is */ + num_read = (int)fread(old_query, sizeof(Uint1), query_length, file); + for (i = 0; i < num_read; i++) { + old_query[i] = AMINOACID_TO_NCBISTDAA[old_query[i]]; + } + return num_read; +} + + +/** + * Compare the query to another sequence, typically a query stored in + * a PSI-BLAST checkpoint file. Warn if they don't match. + * @param query one query to be compared + * @param query_length the length of query and old_query + * @param old_query the other query + * @return 0 if the sequences match, -1 otherwise */ +static Int4 +s_ValidateOldQuery(const Uint1 query[], int query_length, + const Uint1 old_query[], int old_query_length) +{ + int query_index; + /* Value for the X ambiguity character */ + enum { eXchar = 21 }; + + if (query_length != old_query_length) { + ErrPostEx(SEV_WARNING, 0, 0, "Invalid usage of checkpoint recovery; " + "old query has length %ld, new query has length %ld", + (long) old_query_length, (long) query_length); + return -1; + } + for (query_index = 0; query_index < query_length; query_index++) { + if (old_query[query_index] != query[query_index]) { + char old_char = NCBISTDAA_TO_AMINOACID[old_query[query_index]]; + char new_char = NCBISTDAA_TO_AMINOACID[query[query_index]]; + + if (old_query[query_index] != eXchar) { + if (query[query_index] == eXchar) { + ErrPostEx(SEV_WARNING, 0, 0, + "\nStored query has a %c at position " + "%d, while new query has a %c there.\n%c " + "appears in query sequence: The query " + "could be filtered. Run with \"-F F\" " + "option to turn the filter off.", + old_char, query_index, new_char, new_char); + } else { + ErrPostEx(SEV_WARNING, 0, 0, + "Stored query has a %c at position %d, " + "while new query has a %c there.", + old_char, query_index, new_char); + } + return -1; + } else { /* old_query[c] == eXchar */ + ErrPostEx(SEV_WARNING, 0, 0, + "Stored query has a %c at position %d, " + "while new query has a %c there\n%c appears " + "in the stored query: The stored query may be " + "filtered. Run blastpgp with \"-F F\" option " + "to turn the filter off.", + old_char, query_index, new_char, old_char); + /* This is only a warning */ + } + } + } + return 0; +} + + +/** + * Read frequency ratios from a PSI-BLAST checkpoint file; read the + * query data for the file before calling this routine. + * + * @param freq_ratios the frequency ratios [out] + * @param query_length the length of the query + * @param file file to read + * + * @return 0 on sucess, nonzero on error */ +static int +s_PosReadFreqRatios(double ** freq_ratios, + int qlength, + FILE * file) +{ + int query_index; /* Query position (column) in the PSSM */ + int stdaa_index; /* Index in the ncbi_stdaa alphabet */ + int trueaa_index; /* Index in the ARND... alphabet of true + amino acids */ + /* conversion from 28 letter NCBIstdaa alphabet to 20 letter order + * for true amino acids: ARNDCQEGHILKMFPSTWYV. */ + static int alphaConvert[BLASTAA_SIZE] = + {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, + 16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)}; + /* Buffer to hold frequency data in ARND... order */ + double trueaa_buffer[PRO_TRUE_ALPHABET_SIZE]; + int num_read; /* Number of items retrieved by a call to read */ + + for (query_index = 0; query_index < qlength; query_index++) { + num_read = (int)fread(trueaa_buffer, sizeof(double), + PRO_TRUE_ALPHABET_SIZE, file); + if (num_read < PRO_TRUE_ALPHABET_SIZE) + return -1; + for (stdaa_index = 0; stdaa_index < BLASTAA_SIZE; stdaa_index++) { + trueaa_index = alphaConvert[stdaa_index]; + if (trueaa_index < 0) { + freq_ratios[query_index][stdaa_index] = 0.0; + } else { + freq_ratios[query_index][stdaa_index] = + trueaa_buffer[trueaa_index]; + } + } + } + return 0; +} + + +/** + * Read frequency ratios from a standard format PSI-BLAST checkpoint file. + * + * @param freq_ratios the frequency ratios + * @param query_length the length of the query, and second dimension of + * freq_ratios + * @param query query sequence data + * @param file an open file to be read + * @param blast_msg a pointer to hold BLAST warnings. + * + * @return 0 on success, nonzero otherwise + */ +static int +s_PosReadStdCheckpointFile(double ** freq_ratios, + int query_length, + const Uint1 * query, + FILE * file, + Blast_Message* *blast_msg) +{ + int chkpt_query_length = 0; /* Length of the query saved in the + checkpoint file */ + int num_read = 0; /* number if query characters actually read from + the checkpoint file */ + int status = 0; /* error status */ + /* Buffer to hold the query from the checkpoint file */ + Uint1 * chkpt_query = NULL; + + chkpt_query = calloc(query_length, sizeof(Uint1)); + if (NULL != chkpt_query) { + num_read = s_GetOldQueryFromCheckpoint(&chkpt_query_length, + chkpt_query, query_length, + file); + } + if (NULL == chkpt_query || num_read < chkpt_query_length) { + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpoint: " + "Failed to reconstruct previous query\n"); + goto error_return; + } + status = s_ValidateOldQuery(query, query_length, + chkpt_query, chkpt_query_length); + free(chkpt_query); + + if (0 != status) + goto error_return; + + status = s_PosReadFreqRatios(freq_ratios, query_length, file); + if (0 != status) + goto error_return; + + return 0; +error_return: + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpoint: " + "Failed to recover data\n"); + return -1; +} + + +/** + * Read frequency ratios from a standard format PSI-BLAST checkpoint file + * of the given name. + * + * @param freq_ratios the frequency ratios + * @param query_length the length of the query, and second dimension of + * freq_ratios + * @param query query sequence data + * @param file the name of the file to be read + * @param blast_msg a pointer to hold BLAST warnings. + * + * @return 0 on success, nonzero otherwise + */ +static int +s_PosReadStdCheckpoint(Nlm_FloatHi ** freq_ratios, + int qlength, + const Uint1 * query, + const char * filename, + Blast_Message* *blast_msg) +{ + FILE * file = fopen(filename, "rb"); + int status, close_status; + + if (!file) { + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpointFile: " + "Could not open checkpoint file\n"); + return -1; + } + status = s_PosReadStdCheckpointFile(freq_ratios, qlength, query, + file, blast_msg); + close_status = fclose(file); + + return (0 == status) ? close_status : status; +} + + +static int +s_PosReadAsnCheckpoint(double ** freq_ratios, + int query_length, + const Uint1 query[], + char fileName[], + int is_ascii_scoremat) +{ + AsnIoPtr infile = NULL; + PssmWithParametersPtr scoremat = NULL; + PssmPtr pssm = NULL; + PssmIntermediateDataPtr freqs = NULL; + ValNodePtr freq_list; + Bioseq *bsp; + int i, j, c; + enum { eXchar = 21 }; + + if (is_ascii_scoremat) { + infile = AsnIoOpen(fileName, "r"); + } else { + infile = AsnIoOpen(fileName, "rb"); + } + if (infile == NULL) { + ErrPostEx(SEV_WARNING, 0, 0,"Could not open scoremat file\n"); + return -1; + } + + scoremat = PssmWithParametersAsnRead(infile, NULL); + AsnIoClose(infile); + if (scoremat == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Could not read scoremat from input file\n"); + return 1; + } + pssm = scoremat->pssm; + if (pssm == NULL) { + ErrPostEx(SEV_WARNING, 0, 0,"Scoremat is empty\n"); + PssmWithParametersFree(scoremat); + return -1; + } + freqs = pssm->intermediateData; + if (freqs == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Scoremat doesn't contain intermediate data\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (freqs->freqRatios == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Scoremat does not contain frequency ratios\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (pssm->numRows != BLASTAA_SIZE) { + ErrPostEx(SEV_WARNING, 0, 0, "Wrong alphabet size of %d in " + "input scoremat\n", pssm->numRows); + PssmWithParametersFree(scoremat); + return -1; + } + if (!pssm->query || !pssm->query->data.ptrvalue) { + ErrPostEx(SEV_WARNING, 0, 0, + "Missing sequence data in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + bsp = (Bioseq *)(pssm->query->data.ptrvalue); + if (pssm->numColumns != bsp->length) { + ErrPostEx(SEV_WARNING, 0, 0, "Different sequence lengths " + "(%d and %d) in input scoremat\n", pssm->numColumns, + bsp->length); + PssmWithParametersFree(scoremat); + return -1; + } + if (pssm->numColumns != query_length) { + ErrPostEx(SEV_WARNING, 0, 0, "Scoremat sequence length " + "(%d) does not match query length (%d)\n", + pssm->numColumns, query_length); + PssmWithParametersFree(scoremat); + return -1; + } + if (!bsp->seq_data || !ISA_aa(bsp->mol)) { + ErrPostEx(SEV_WARNING, 0, 0, + "Sequence within checkpoint file has no data or is " + "not protein\n"); + PssmWithParametersFree(scoremat); + return -1; + } + + if (bsp->seq_data_type == Seq_code_gap) { + ErrPostEx(SEV_WARNING, 0, 0,"Seq_code_gap passed to s_PosReadAsnCheckpoint\n"); + return -1; + } + + BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET); + + /* Convert sequence data into Seq_code_ncbistdaa */ + if (bsp->seq_data_type != Seq_code_ncbistdaa) { + + ByteStore* new_byte_store = BSConvertSeq((ByteStorePtr) bsp->seq_data, + Seq_code_ncbistdaa, + bsp->seq_data_type, + bsp->length); + if ( !new_byte_store ) { + ErrPostEx(SEV_FATAL, 1, 0, "Failed to convert Bioseq in ASN.1" + " PSSM to Seq_code_ncbistdaa"); + } + bsp->seq_data = (SeqDataPtr) new_byte_store; + bsp->seq_data_type = Seq_code_ncbistdaa; + BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET); + } + + /* verify the input query is the same as the sequence + within the checkpoint file */ + + for (i = 0; i < query_length; i++) { + c = BSGetByte((ByteStorePtr) bsp->seq_data); + if (c == EOF) { + ErrPostEx(SEV_WARNING, 0, 0, "Premature end of sequence data\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (c != query[i]) { + char old_char = NCBISTDAA_TO_AMINOACID[query[i]]; + char new_char = NCBISTDAA_TO_AMINOACID[c]; + if (query[i] == eXchar) { + ErrPostEx(SEV_WARNING, 0, 0, + "Query sequence contains '%c' at position %d; " + "if filtering was used, rerun the search with " + "filtering turned off ('-F F')\n", old_char, i); + } + else { + ErrPostEx(SEV_WARNING, 0, 0, + "Query sequence contains '%c' at position %d, " + "while sequence withing checkpoint file contains " + "'%c' at this position\n", + old_char, i, new_char); + } + PssmWithParametersFree(scoremat); + return -1; + } + } + + /* Read in the frequency ratios, verify they fall + in the correct range, and verify that the linked list + of residue frequencies is exactly as long as it should be */ + + freq_list = freqs->freqRatios; + /* This is bad */ + if (pssm->byRow == FALSE) { + j = 0; + for (i = 0; i < pssm->numColumns; i++) { + for (j = 0; j < pssm->numRows; j++) { + if (freq_list == NULL) + break; + freq_ratios[i][j] = freq_list->data.realvalue; + + if (freq_ratios[i][j] < 0.0) { + ErrPostEx(SEV_WARNING, 0, 0, "position frequency (%d,%d) " + "out of bounds\n", i, j); + PssmWithParametersFree(scoremat); + return -1; + } + freq_list = freq_list->next; + } + if (j < pssm->numRows) + break; + } + } else { + i = 0; + for (j = 0; j < pssm->numRows; j++) { + for (i = 0; i < pssm->numColumns; i++) { + if (freq_list == NULL) + break; + freq_ratios[i][j] = freq_list->data.realvalue; + + if (freq_ratios[i][j] < 0.0) { + ErrPostEx(SEV_WARNING, 0, 0, "position frequency (%d,%d) " + "out of bounds\n", i, j); + PssmWithParametersFree(scoremat); + return -1; + } + freq_list = freq_list->next; + } + if (i < pssm->numColumns) + break; + } + } + if (i < pssm->numColumns || j < pssm->numRows) { + ErrPostEx(SEV_WARNING, 0, 0, "Not enough frequency " + "ratios in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (freq_list != NULL) { + ErrPostEx(SEV_WARNING, 0, 0, "Too many frequency " + "ratios in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + PssmWithParametersFree(scoremat); + return 0; +} + + +Blast_PsiCheckpointLoc * +Blast_PsiCheckpointLocNew(EPsiCheckpointType checkpoint_type, + char * filename) +{ + Blast_PsiCheckpointLoc * psi_checkpoint = + malloc(sizeof(Blast_PsiCheckpointLoc)); + if (psi_checkpoint) { + size_t length_filename = strlen(filename); + psi_checkpoint->filename = calloc(length_filename + 1, sizeof(char)); + if (!psi_checkpoint->filename) { + free(psi_checkpoint); + psi_checkpoint = NULL; + } + memcpy(psi_checkpoint->filename, filename, length_filename + 1); + psi_checkpoint->checkpoint_type = checkpoint_type; + } + return psi_checkpoint; +} + + +void +Blast_PsiCheckpointLocFree(Blast_PsiCheckpointLoc ** ppsi_checkpoint) +{ + Blast_PsiCheckpointLoc * psi_checkpoint = *ppsi_checkpoint; + if (psi_checkpoint) { + if (psi_checkpoint->filename) { + free(psi_checkpoint->filename); + } + free(psi_checkpoint); + } + *ppsi_checkpoint = NULL; +} + + +int +Blast_PosReadCheckpoint(double ** freq_ratios, + int query_length, + const Uint1 * query, + Blast_PsiCheckpointLoc * psi_checkpoint, + Blast_Message* *blast_msg) +{ + int status; + + switch(psi_checkpoint->checkpoint_type) { + case eStandardCheckpoint: + status = s_PosReadStdCheckpoint(freq_ratios, query_length, + query, + psi_checkpoint->filename, + blast_msg); + break; + case eAsnTextCheckpoint: + case eAsnBinaryCheckpoint: + {{ + int is_ascii_checkpoint = + psi_checkpoint->checkpoint_type == eAsnTextCheckpoint; + status = s_PosReadAsnCheckpoint(freq_ratios, + query_length, query, + psi_checkpoint->filename, + is_ascii_checkpoint); + }} + break; + default: + ASSERT(0 && "Impossible type of checkpoint file"); + status = -1; + break; + } + return status; +} /* @} */ diff --git a/algo/blast/api/blast_input.h b/algo/blast/api/blast_input.h index 26d042e2..60586ef4 100644 --- a/algo/blast/api/blast_input.h +++ b/algo/blast/api/blast_input.h @@ -1,4 +1,4 @@ -/* $Id: blast_input.h,v 1.18 2006/04/21 14:33:44 madden Exp $ +/* $Id: blast_input.h,v 1.20 2007/03/12 16:12:46 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -44,6 +44,7 @@ extern "C" { #include <ncbi.h> #include <algo/blast/core/blast_def.h> +#include <algo/blast/core/blast_message.h> /** @addtogroup CToolkitAlgoBlast * @@ -76,6 +77,52 @@ BLAST_GetQuerySeqLoc(FILE *infp, Boolean query_is_na, Uint1 strand, Int4* num_queries, Boolean believe_query, Int4 genetic_code); + +/** The possible file formats of a PSI-BLAST checkpoint file. */ +typedef enum EPsiCheckpointType { + eStandardCheckpoint = 0, /**< The useual PSI-BLAST binary format */ + eAsnTextCheckpoint = 1, /**< ASN.1 text format */ + eAsnBinaryCheckpoint = 2 /**< ASN.1 binary format */ +} EPsiCheckpointType; + +/** The location and type of a PSI-BLAST checkpoint file */ +typedef struct Blast_PsiCheckpointLoc { + EPsiCheckpointType checkpoint_type; /**< file format */ + char * filename; /**< name of the file */ +} Blast_PsiCheckpointLoc; + +/** Create a new locator for a PSI-BLAST checkpoint file. + * @param checkpoint_type file format + * @param filename name of the file */ +Blast_PsiCheckpointLoc * +Blast_PsiCheckpointLocNew(EPsiCheckpointType checkpoint_type, + char * filename); + +/** Free a PSI-BLAST checkpoint file locator */ +void +Blast_PsiCheckpointLocFree(Blast_PsiCheckpointLoc ** psi_checkpoint); + + +/** + * Read frequency ratios from a PSI-BLAST checkpoint file. + * + * @param freq_ratios the frequency ratios + * @param query_length the length of the query, and second dimension of + * freq_ratios + * @param query query sequence data + * @param psi_checkpoint location of the checkpoint data + * @param blast_msg a pointer to hold BLAST warnings. + * + * @return 0 on success, nonzero otherwise + */ +int +Blast_PosReadCheckpoint(double ** freq_ratios, + int query_length, + const Uint1 * query, + Blast_PsiCheckpointLoc * psi_checkpoint, + Blast_Message* *blast_msg); + + /* @} */ #ifdef __cplusplus @@ -86,6 +133,19 @@ BLAST_GetQuerySeqLoc(FILE *infp, Boolean query_is_na, Uint1 strand, * =========================================================================== * * $Log: blast_input.h,v $ +* Revision 1.20 2007/03/12 16:12:46 madden +* - Create an enum EPsiCheckpointType that specifies the file format +* of a PSI-BLAST checkpoint file. +* - Define a new datatype Blast_PsiCheckpointLoc to +* specify the location and type of a PSI-BLAST checkpoint file. +* - Declare Blast_PsiCheckpointLocNew and Blast_PsiCheckpointLocFree. +* [from Mike Gertz] +* +* Revision 1.19 2007/03/05 14:50:08 camacho +* - Added a prototype for Blast_PosReadCheckpoint. +* - Added core/blast_message.h to the includes because +* Blast_PosReadCheckpoint has a Blast_Message ** parameter. +* * Revision 1.18 2006/04/21 14:33:44 madden * BLAST_GetQuerySeqLoc parameter ctr is now a pointer to Int4, fixes case of more than 32k queries * diff --git a/algo/blast/api/blast_options_api.c b/algo/blast/api/blast_options_api.c index f855c187..12c0f855 100644 --- a/algo/blast/api/blast_options_api.c +++ b/algo/blast/api/blast_options_api.c @@ -1,4 +1,4 @@ -/* $Id: blast_options_api.c,v 1.18 2006/04/26 12:45:28 madden Exp $ +/* $Id: blast_options_api.c,v 1.24 2007/03/20 15:17:16 kans Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -35,6 +35,7 @@ #include <algo/blast/core/blast_util.h> #include <algo/blast/core/blast_filter.h> #include <algo/blast/api/blast_seq.h> +#include <algo/blast/core/gencode_singleton.h> /** @addtogroup CToolkitAlgoBlast * @@ -83,11 +84,11 @@ Int2 SBlastOptionsNew(const char* program_name, SBlastOptions** options_out, return status; } - if (program == eBlastTypeTblastn || program == eBlastTypeRpsTblastn || - program == eBlastTypeTblastx) { - if ((status = BLAST_GeneticCodeFind(db_options->genetic_code, - &db_options->gen_code_string))) - return status; + if (Blast_SubjectIsTranslated(program) || program == eBlastTypeRpsTblastn) { + Uint1* gc = NULL; + BLAST_GeneticCodeFind(db_options->genetic_code, &gc); + GenCodeSingletonAdd(db_options->genetic_code, gc); + free(gc); } *options_out = options = (SBlastOptions*) calloc(1, sizeof(SBlastOptions)); @@ -145,7 +146,7 @@ Int2 SBlastOptionsSetWordSize(SBlastOptions* options, Int4 word_size) return -1; } -Int2 SBlastOptionsSetThreshold(SBlastOptions* options, Int4 threshold) +Int2 SBlastOptionsSetThreshold(SBlastOptions* options, double threshold) { if (!options || !options->lookup_options || !options->score_options) @@ -189,6 +190,8 @@ Int2 SBlastOptionsSetWindowSize(SBlastOptions* options, Int4 window_size) } options->word_options->window_size = window_size; + + return 0; } Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length, @@ -197,7 +200,7 @@ Int2 SBlastOptionsSetDiscMbParams(SBlastOptions* options, Int4 template_length, if (!options || !options->lookup_options) return -1; - options->lookup_options->lut_type = MB_LOOKUP_TABLE; + options->lookup_options->lut_type = eMBLookupTable; options->lookup_options->mb_template_length = template_length; options->lookup_options->mb_template_type = template_type; @@ -318,14 +321,7 @@ Int2 SBlastOptionsSetDbGeneticCode(SBlastOptions* options, Int4 gc) if (!options || !options->db_options) return -1; - /* If previously set genetic code is the same as the new one, there is no - need to do anything. */ - if (options->db_options->genetic_code != gc) { - options->db_options->genetic_code = gc; - /* Free old genetic code string. */ - sfree(options->db_options->gen_code_string); - status = BLAST_GeneticCodeFind(gc, &options->db_options->gen_code_string); - } + options->db_options->genetic_code = gc; return status; diff --git a/algo/blast/api/blast_options_api.h b/algo/blast/api/blast_options_api.h index b2e6b1a1..f9b10199 100644 --- a/algo/blast/api/blast_options_api.h +++ b/algo/blast/api/blast_options_api.h @@ -1,4 +1,4 @@ -/* $Id: blast_options_api.h,v 1.13 2006/06/08 21:46:24 papadopo Exp $ +/* $Id: blast_options_api.h,v 1.14 2007/03/08 14:57:31 kans Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -139,7 +139,7 @@ Int2 SBlastOptionsSetRewardPenaltyAndGapCosts(SBlastOptions* options, * @return zero unless error (e.g., threshold is < zero) */ Int2 SBlastOptionsSetThreshold(SBlastOptions* options, - Int4 threshold); + double threshold); /** Set window size for two hit extension. * @param options options Options structure to update. [in] [out] diff --git a/algo/blast/api/blast_returns.c b/algo/blast/api/blast_returns.c index 664b858d..186f054e 100644 --- a/algo/blast/api/blast_returns.c +++ b/algo/blast/api/blast_returns.c @@ -1,4 +1,4 @@ -/* $Id: blast_returns.c,v 1.34 2006/10/13 18:44:29 papadopo Exp $ +/* $Id: blast_returns.c,v 1.38 2007/07/10 15:28:07 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -29,7 +29,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_returns.c,v 1.34 2006/10/13 18:44:29 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_returns.c,v 1.38 2007/07/10 15:28:07 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/blast_returns.h> @@ -194,8 +194,12 @@ Blast_GetParametersBuffer(EBlastProgramType program_number, add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); } if (search_params->threshold) { - sprintf(buffer, "Neighboring words threshold: %ld", - (long) search_params->threshold); + if (search_params->threshold == floor(search_params->threshold)) + sprintf(buffer, "Neighboring words threshold: %ld", + (long)search_params->threshold); + else + sprintf(buffer, "Neighboring words threshold: %.2f", + search_params->threshold); add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); } if (search_params->window_size) { @@ -366,10 +370,9 @@ s_SummaryDBStatsFill(EBlastProgramType program_number, itr = BlastSeqSrcIteratorFree(itr); } } - - if (program_number == eBlastTypeTblastn || - program_number == eBlastTypeRpsTblastn || - program_number == eBlastTypeTblastx) + + if (Blast_SubjectIsTranslated(program_number) || + program_number == eBlastTypeRpsTblastn) total_length /= 3; (*db_stats)->dblength = total_length; @@ -400,8 +403,11 @@ s_SummaryDBStatsFill(EBlastProgramType program_number, /** FIXME: Should this be different for RPS BLAST? */ (*db_stats)->qlen = qlen; (*db_stats)->eff_qlen = qlen - ((*db_stats)->hsp_length); - (*db_stats)->eff_dblength = - total_length - num_entries*((*db_stats)->hsp_length); + (*db_stats)->eff_dblength = total_length; + if (eff_len_options->db_length == 0) { + (*db_stats)->eff_dblength -= (Int8)num_entries * + ((*db_stats)->hsp_length); + } (*db_stats)->eff_searchsp = query_info->contexts[query_info->first_context].eff_searchsp; if (eff_len_options && diff --git a/algo/blast/api/blast_returns.h b/algo/blast/api/blast_returns.h index af3fb5cf..1e61283b 100644 --- a/algo/blast/api/blast_returns.h +++ b/algo/blast/api/blast_returns.h @@ -1,4 +1,4 @@ -/* $Id: blast_returns.h,v 1.14 2006/04/26 12:46:00 madden Exp $ +/* $Id: blast_returns.h,v 1.15 2007/07/10 15:28:07 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -88,7 +88,7 @@ typedef struct Blast_SearchParams { char* pattern; /**< phi-blast pattern used. */ char* entrez_query; /**< query to entrez */ double ethresh; /**< PSI-BLAST threshold to keep HSPs in profile. */ - Int4 threshold; /**< threshold to start extension of hits. */ + double threshold; /**< threshold to start extension of hits. */ Int4 window_size; /**< max allowed distance between hits to init extension in 2-hit mode. */ } Blast_SearchParams; diff --git a/algo/blast/api/blast_seq.c b/algo/blast/api/blast_seq.c index 6854c54d..30484f0d 100644 --- a/algo/blast/api/blast_seq.c +++ b/algo/blast/api/blast_seq.c @@ -1,5 +1,5 @@ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_seq.c,v 1.85 2006/07/05 15:52:35 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_seq.c,v 1.88 2007/03/14 19:50:30 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* * =========================================================================== @@ -40,7 +40,6 @@ static char const rcsid[] = "$Id: blast_seq.c,v 1.85 2006/07/05 15:52:35 papadop #include <algo/blast/core/blast_util.h> #include <algo/blast/core/blast_encoding.h> #include <algo/blast/core/blast_setup.h> /* For BlastSeqLoc_RestrictToInterval */ -#include <algo/blast/core/blast_inline.h> /** @addtogroup CToolkitAlgoBlast * @@ -70,7 +69,7 @@ BlastSeqlocsHaveDuplicateIDs(SeqLoc* query_seqlocs) return FALSE; /* allocate hashtable */ - hashtable = (Uint4 *)calloc(1 << kLog2HashSize, sizeof(Uint4)); + hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4)); id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash)); for (slp = query_seqlocs, curr_id_num = 1; slp; slp = slp->next) { @@ -81,7 +80,7 @@ BlastSeqlocsHaveDuplicateIDs(SeqLoc* query_seqlocs) /* hash the ID of the next query sequence */ SeqIdLabel(id, buffer, sizeof(buffer), OM_LABEL_CONTENT); - hashval = readdb_sequence_hash(buffer, strlen(buffer)); + hashval = readdb_sequence_hash(buffer, (int)strlen(buffer)); hashval = hashval >> (32 - kLog2HashSize); if (hashtable[hashval] != 0) { Int4 offset = hashtable[hashval]; @@ -156,7 +155,7 @@ BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs, retval = BlastMaskLocNew(kNumSeqs*kNumContexts); /* create hashtable for query IDs */ - hashtable = (Uint4 *)calloc(1 << kLog2HashSize, sizeof(Uint4)); + hashtable = (Uint4 *)calloc((size_t)1 << kLog2HashSize, sizeof(Uint4)); id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash)); /* add the ID of each query sequence to the hashtable */ @@ -166,7 +165,7 @@ BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs, Char buffer[64]; SeqIdLabel(seq_id, buffer, sizeof(buffer), OM_LABEL_CONTENT); - hashval = readdb_sequence_hash(buffer, strlen(buffer)); + hashval = readdb_sequence_hash(buffer, (int)strlen(buffer)); hashval = hashval >> (32 - kLog2HashSize); id_entries[curr_id_num].id = seq_id; @@ -191,7 +190,7 @@ BlastMaskLocFromSeqLoc(SeqLoc* mask_seqlocs, SeqLoc* query_seqlocs, mask_id = SeqLocId(current_mask); SeqIdLabel(mask_id, buffer, sizeof(buffer), OM_LABEL_CONTENT); - hashval = readdb_sequence_hash(buffer, strlen(buffer)); + hashval = readdb_sequence_hash(buffer, (int)strlen(buffer)); hashval = hashval >> (32 - kLog2HashSize); /* examine only the query IDs that hash to the same value */ @@ -588,7 +587,7 @@ Int2 BLAST_GeneticCodeFind(Int4 gc, Uint1** genetic_code) if (!gen_code_eaa) return -1; smtp = SeqMapTableFind(Seq_code_ncbistdaa, Seq_code_ncbieaa); - gen_code_length = StrLen(gen_code_eaa); + gen_code_length = (Int4)StrLen(gen_code_eaa); *genetic_code = gen_code_stdaa = (Uint1*) calloc(gen_code_length+1, 1); if (!gen_code_stdaa) @@ -740,10 +739,7 @@ Int2 BLAST_SetUpQuery(EBlastProgramType program_number, program_number == eBlastTypePhiBlastn) { encoding = eBlastEncodingNucleotide; num_frames = 2; - } else if (program_number == eBlastTypeBlastp || - program_number == eBlastTypeRpsBlast || - program_number == eBlastTypeTblastn || - program_number == eBlastTypePhiBlastp) { + } else if (Blast_QueryIsProtein(program_number)) { encoding = eBlastEncodingProtein; num_frames = 1; } else { /* blastx or rpstblastn, which is also essentially blastx */ @@ -785,8 +781,7 @@ Int2 BLAST_SetUpSubject(EBlastProgramType program_number, EBlastEncoding encoding; const Boolean kNucleotide = (program_number == eBlastTypeBlastn || program_number == eBlastTypePhiBlastn); - const Boolean kTranslated = (program_number == eBlastTypeTblastn || - program_number == eBlastTypeTblastx); + const Boolean kTranslated = Blast_SubjectIsTranslated(program_number); if (kNucleotide) encoding = eBlastEncodingNucleotide; diff --git a/algo/blast/api/blast_seq.h b/algo/blast/api/blast_seq.h index e0c4ade9..5e652ad8 100644 --- a/algo/blast/api/blast_seq.h +++ b/algo/blast/api/blast_seq.h @@ -1,4 +1,4 @@ -/* $Id: blast_seq.h,v 1.28 2006/04/20 15:33:33 papadopo Exp $ +/* $Id: blast_seq.h,v 1.29 2006/11/21 17:21:20 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -41,6 +41,7 @@ extern "C" { #include <objseq.h> #include <algo/blast/core/blast_def.h> +#include <algo/blast/core/blast_query_info.h> #include <algo/blast/core/blast_options.h> /** @addtogroup CToolkitAlgoBlast diff --git a/algo/blast/api/blast_seqalign.c b/algo/blast/api/blast_seqalign.c index b3056a03..bbf2c094 100644 --- a/algo/blast/api/blast_seqalign.c +++ b/algo/blast/api/blast_seqalign.c @@ -1,4 +1,4 @@ -/* $Id: blast_seqalign.c,v 1.60 2006/06/13 14:42:44 papadopo Exp $ +/* $Id: blast_seqalign.c,v 1.62 2007/02/26 14:52:50 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -29,7 +29,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_seqalign.c,v 1.60 2006/06/13 14:42:44 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_seqalign.c,v 1.62 2007/02/26 14:52:50 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/blast_seqalign.h> @@ -82,10 +82,6 @@ SBlastSeqalignArrayFree(SBlastSeqalignArray* seqalign_vec) return NULL; } -/** Creates a score set corresponding to one HSP. - * @param hsp HSP structure [in] - * @return Score set for this HSP. - */ ScorePtr GetScoreSetFromBlastHsp(BlastHSP* hsp) { @@ -743,10 +739,8 @@ BlastHSPToSeqAlign(EBlastProgramType program, BlastHSP* hsp, start1 = hsp->query.offset; start2 = hsp->subject.offset; - translate1 = (program == eBlastTypeBlastx || program == eBlastTypeTblastx || - program == eBlastTypeRpsTblastn); - translate2 = (program == eBlastTypeTblastn || program == eBlastTypeTblastx); - + translate1 = Blast_QueryIsTranslated(program); + translate2 = Blast_SubjectIsTranslated(program); /* If no eGapAlignDecline regions exists output seqalign will be regular Den-Seg or Std-seg */ diff --git a/algo/blast/api/blast_tabular.c b/algo/blast/api/blast_tabular.c index 3f29fcd5..6058f59c 100644 --- a/algo/blast/api/blast_tabular.c +++ b/algo/blast/api/blast_tabular.c @@ -1,4 +1,4 @@ -/* $Id: blast_tabular.c,v 1.34 2006/06/09 17:44:50 papadopo Exp $ +/* $Id: blast_tabular.c,v 1.39 2007/03/20 15:17:16 kans Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -29,7 +29,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_tabular.c,v 1.34 2006/06/09 17:44:50 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_tabular.c,v 1.39 2007/03/20 15:17:16 kans Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/blast_tabular.h> @@ -40,6 +40,7 @@ static char const rcsid[] = "$Id: blast_tabular.c,v 1.34 2006/06/09 17:44:50 pap #include <algo/blast/api/blast_format.h> #include <algo/blast/api/blast_seqalign.h> #include <algo/blast/core/blast_seqsrc_impl.h> +#include <algo/blast/core/gencode_singleton.h> #include <txalign.h> @@ -82,14 +83,11 @@ Blast_TabularFormatDataSetUp(BlastTabularFormatData* tf_data, ASSERT(score_options && db_options); - tf_data->perform_traceback = - (score_options->gapped_calculation && - ext_options->ePrelimGapExt != eGreedyWithTracebackExt); - + tf_data->perform_traceback = score_options->gapped_calculation; tf_data->program = program; tf_data->hsp_stream = hsp_stream; tf_data->query = query; - tf_data->gen_code_string = db_options->gen_code_string; + tf_data->gen_code_string = GenCodeSingletonFind(db_options->genetic_code); /* Sequence source must be copied, to guarantee multi-thread safety. */ tf_data->seq_src = BlastSeqSrcCopy(seq_src); /* Effective lengths must be duplicated in query info structure, because @@ -171,9 +169,8 @@ FillNuclSequenceBuffers(EBlastProgramType program, BlastHSP* hsp, Boolean reverse; Boolean translate1, translate2; - translate1 = (program == eBlastTypeBlastx || program == eBlastTypeTblastx || - program == eBlastTypeRpsTblastn); - translate2 = (program == eBlastTypeTblastn || program == eBlastTypeTblastx); + translate1 = Blast_QueryIsTranslated(program); + translate2 = Blast_SubjectIsTranslated(program); reverse = (hsp->query.frame != hsp->subject.frame); @@ -364,7 +361,7 @@ void* Blast_TabularFormatThread(void* data) Blast_TracebackFromHSPList(program, hsp_list, query, seq_arg.seq, query_info, gap_align, gap_align->sbp, score_params, - ext_params->options, hit_params, gen_code_string); + ext_params->options, hit_params, gen_code_string, NULL); /* Return subject sequence unless it is needed for the sequence printout */ if (tf_data->format_options != eBlastTabularAddSequences) { diff --git a/algo/blast/api/dust_filter.c b/algo/blast/api/dust_filter.c index db8c9bd7..29097583 100644 --- a/algo/blast/api/dust_filter.c +++ b/algo/blast/api/dust_filter.c @@ -1,5 +1,5 @@ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: dust_filter.c,v 1.7 2006/05/24 21:17:50 camacho Exp $"; +static char const rcsid[] = "$Id: dust_filter.c,v 1.10 2007/01/24 14:31:16 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* @@ -44,14 +44,91 @@ static char const rcsid[] = "$Id: dust_filter.c,v 1.7 2006/05/24 21:17:50 camach #include <algo/blast/api/seqsrc_readdb.h> #include <algo/blast/core/blast_filter.h> #include <algo/blast/core/blast_util.h> -#include <algo/blast/core/blast_inline.h> -#include <algo/blast/core/blast_dust.h> +#include <blast_dust.h> /** @addtogroup CToolkitAlgoBlast * * @{ */ + +/** Look for dustable locations + * @param loc returns locations [in|out] + * @param reg dust specific locations [in] + * @param nreg number of DREGION [in] + * @return 0 if no error + */ +static Int2 +s_GetDustLocations (BlastSeqLoc** loc, DREGION* reg, Int4 nreg) +{ + BlastSeqLoc* tail = NULL; /* pointer to tail of loc linked list */ + + if (!loc) + return -1; + + *loc = NULL; + + /* point to dusted locations */ + if (nreg > 0) { + Int4 i; + for (i = 0; reg && i < nreg; i++) { + /* Cache the tail of the list to avoid the overhead of traversing the + * list when appending to it */ + tail = BlastSeqLocNew(tail ? &tail : loc, reg->from, reg->to); + reg = reg->next; + } + } + return 0; +} + +/** Dust provided sequence. + * @param sequence input sequence [in] + * @param length number of bases [in] + * @param offset where to start on input sequence [in] + * @param level dust parameter [in] + * @param window dust parameter [in] + * @param linker dust parameter [in] + * @param dust_loc returns locations [in|out] + * @return 0 if no error + */ +Int2 s_SeqBufferDust (Uint1* sequence, Int4 length, Int4 offset, + Int2 level, Int2 window, Int2 linker, + BlastSeqLoc** dust_loc) +{ + DREGION* reg,* regold; + Int4 nreg; + Int2 status = 0; + + /* place for dusted regions */ + regold = reg = (DREGION*) calloc(1, sizeof(DREGION)); + if (!reg) + return -1; + + nreg = DustSegs (sequence, length, offset, reg, (Int4)level, + (Int4)window, (Int4)linker); + + status = s_GetDustLocations(dust_loc, reg, nreg); + + /* clean up memory */ + reg = regold; + while (reg) + { + regold = reg; + reg = reg->next; + sfree (regold); + } + + return status; +} + +/** Returns locations that should be masked according to dust. + * @param query_blk input sequence [in] + * @param query_info number of bases etc. [in] + * @param query_seqloc locations to be dusted [in] + * @param filter_options dust parameters provided here [in] + * @param filter_maskloc return value [in|out] + * @return 0 if no error + */ static Int2 s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info, SeqLoc* query_seqloc, const SBlastFilterOptions* filter_options, BlastMaskLoc** filter_maskloc) { @@ -92,7 +169,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info if (BlastIsReverseStrand(kIsNucl, context) == TRUE) { /* Reverse this as it's on minus strand. */ BlastSeqLoc* filter_slp_rev = NULL; - SeqBufferDust(buffer, query_length, 0, dust_options->level, + s_SeqBufferDust(buffer, query_length, 0, dust_options->level, dust_options->window, dust_options->linker, &filter_slp); /* Reverse this relative to the part of the query being searched, leave it up to @@ -104,7 +181,7 @@ s_GetFilteringLocations(BLAST_SequenceBlk* query_blk, BlastQueryInfo* query_info } else { - SeqBufferDust(buffer, query_length, 0, dust_options->level, + s_SeqBufferDust(buffer, query_length, 0, dust_options->level, dust_options->window, dust_options->linker, &filter_slp); } diff --git a/algo/blast/api/hspstream_queue.c b/algo/blast/api/hspstream_queue.c index b94b1c89..9324f40d 100644 --- a/algo/blast/api/hspstream_queue.c +++ b/algo/blast/api/hspstream_queue.c @@ -1,4 +1,4 @@ -/* $Id: hspstream_queue.c,v 1.8 2005/04/12 17:59:08 dondosha Exp $ +/* $Id: hspstream_queue.c,v 1.9 2007/07/27 18:03:11 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -34,7 +34,7 @@ #ifndef SKIP_DOXYGEN_PROCESSING static char const rcsid[] = - "$Id: hspstream_queue.c,v 1.8 2005/04/12 17:59:08 dondosha Exp $"; + "$Id: hspstream_queue.c,v 1.9 2007/07/27 18:03:11 papadopo Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ @@ -203,6 +203,10 @@ BlastHSPListQueueNew(BlastHSPStream* hsp_stream, void* args) SetMethod(hsp_stream, eWrite, fnptr); fnptr.closeFn = &BlastHSPListQueueClose; SetMethod(hsp_stream, eClose, fnptr); + fnptr.batch_read = NULL; + SetMethod(hsp_stream, eBatchRead, fnptr); + fnptr.mergeFn = NULL; + SetMethod(hsp_stream, eMerge, fnptr); SetData(hsp_stream, args); return hsp_stream; diff --git a/algo/blast/api/repeats_filter.c b/algo/blast/api/repeats_filter.c index 7a7aa05c..61259273 100644 --- a/algo/blast/api/repeats_filter.c +++ b/algo/blast/api/repeats_filter.c @@ -1,5 +1,5 @@ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: repeats_filter.c,v 1.15 2006/05/24 21:17:50 camacho Exp $"; +static char const rcsid[] = "$Id: repeats_filter.c,v 1.18 2007/03/12 16:13:36 madden Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* @@ -70,7 +70,7 @@ s_SetRepeatsSearchOptions(SBlastOptions* options) SBlastOptionsSetFilterString(options, REPEATS_SEARCH_FILTER_STRING))) return status; - options->lookup_options->lut_type = NA_LOOKUP_TABLE; + options->lookup_options->lut_type = eNaLookupTable; options->score_options->penalty = REPEATS_SEARCH_PENALTY; options->score_options->gap_open = REPEATS_SEARCH_GAP_OPEN; options->score_options->gap_extend = REPEATS_SEARCH_GAP_EXTEND; @@ -227,8 +227,10 @@ Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc, s_SetRepeatsSearchOptions(options); - status = - Blast_RunSearch(query_seqloc, seq_src, NULL, options, NULL, + status = + Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL, + seq_src, (SeqLoc*) NULL, options, + (BlastTabularFormatData*) NULL, &results, &filter_loc, sum_returns); /* The ReadDBFILE structure will not be destroyed here, because the diff --git a/algo/blast/api/seqsrc_multiseq.c b/algo/blast/api/seqsrc_multiseq.c index a8ccdad3..1ffdfbef 100644 --- a/algo/blast/api/seqsrc_multiseq.c +++ b/algo/blast/api/seqsrc_multiseq.c @@ -1,4 +1,4 @@ -/* $Id: seqsrc_multiseq.c,v 1.24 2006/05/24 21:17:50 camacho Exp $ +/* $Id: seqsrc_multiseq.c,v 1.27 2007/05/16 18:11:11 camacho Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -30,7 +30,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: seqsrc_multiseq.c,v 1.24 2006/05/24 21:17:50 camacho Exp $"; +static char const rcsid[] = "$Id: seqsrc_multiseq.c,v 1.27 2007/05/16 18:11:11 camacho Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/seqsrc_multiseq.h> @@ -171,6 +171,16 @@ s_MultiSeqGetNumSeqs(void* multiseq_handle, void* ignoreme) return seq_info->num_seqs; } +/** Returns zero in this implementation as not supported without an alias file. + * @param multiseq_handle Pointer to the structure containing sequences [in] + * @param ignoreme Unused by this implementation [in] + */ +static Int4 +s_MultiSeqGetNumSeqsStats(void* multiseq_handle, void* ignoreme) +{ + return 0; +} + /** Returns 0 as total length, indicating that this is NOT a database! * @param multiseq_handle Pointer to the structure containing sequences [in] * @param ignoreme Unused by this implementation [in] @@ -181,6 +191,16 @@ s_MultiSeqGetTotLen(void* multiseq_handle, void* ignoreme) return 0; } +/** Returns 0 for statistical total length as this is not supported without an alias file. + * @param multiseq_handle Pointer to the structure containing sequences [in] + * @param ignoreme Unused by this implementation [in] + */ +static Int8 +s_MultiSeqGetTotLenStats(void* multiseq_handle, void* ignoreme) +{ + return 0; +} + /** Needed for completeness only. * @param multiseq_handle Pointer to the structure containing sequences [in] * @param ignoreme Unused by this implementation [in] @@ -311,6 +331,13 @@ s_MultiSeqIteratorNext(void* multiseq_handle, BlastSeqSrcIterator* itr) return retval; } +/** Resets the internal bookmark iterator (N/A in this case) */ +static void +s_MultiSeqResetChunkIter(void* multiseq_handle) +{ + return; +} + /** Multi sequence source destructor: frees its internal data structure and the * BlastSeqSrc structure itself. * @param seq_src BlastSeqSrc structure to free [in] @@ -327,8 +354,6 @@ s_MultiSeqSrcFree(BlastSeqSrc* seq_src) seq_info = (MultiSeqInfo*)_BlastSeqSrcImpl_GetDataStructure(seq_src); seq_info = s_MultiSeqInfoFree(seq_info); - sfree(seq_src); - return NULL; } @@ -376,14 +401,17 @@ s_MultiSeqSrcNew(BlastSeqSrc* retval, void* args) _BlastSeqSrcImpl_SetCopyFnPtr(retval, &s_MultiSeqSrcCopy); _BlastSeqSrcImpl_SetDataStructure(retval, (void*) seq_info); _BlastSeqSrcImpl_SetGetNumSeqs(retval, &s_MultiSeqGetNumSeqs); + _BlastSeqSrcImpl_SetGetNumSeqsStats(retval, &s_MultiSeqGetNumSeqsStats); _BlastSeqSrcImpl_SetGetMaxSeqLen(retval, &s_MultiSeqGetMaxLength); _BlastSeqSrcImpl_SetGetAvgSeqLen(retval, &s_MultiSeqGetAvgLength); _BlastSeqSrcImpl_SetGetTotLen(retval, &s_MultiSeqGetTotLen); + _BlastSeqSrcImpl_SetGetTotLenStats(retval, &s_MultiSeqGetTotLenStats); _BlastSeqSrcImpl_SetGetName(retval, &s_MultiSeqGetName); _BlastSeqSrcImpl_SetGetIsProt(retval, &s_MultiSeqGetIsProt); _BlastSeqSrcImpl_SetGetSequence(retval, &s_MultiSeqGetSequence); _BlastSeqSrcImpl_SetGetSeqLen(retval, &s_MultiSeqGetSeqLen); _BlastSeqSrcImpl_SetIterNext(retval, &s_MultiSeqIteratorNext); + _BlastSeqSrcImpl_SetResetChunkIterator(retval, &s_MultiSeqResetChunkIter); _BlastSeqSrcImpl_SetReleaseSequence(retval, &s_MultiSeqReleaseSequence); return retval; diff --git a/algo/blast/api/seqsrc_multiseq.h b/algo/blast/api/seqsrc_multiseq.h index e5b92344..3655cfd3 100644 --- a/algo/blast/api/seqsrc_multiseq.h +++ b/algo/blast/api/seqsrc_multiseq.h @@ -1,4 +1,4 @@ -/* $Id: seqsrc_multiseq.h,v 1.9 2005/04/06 23:27:53 dondosha Exp $ +/* $Id: seqsrc_multiseq.h,v 1.10 2006/11/21 17:22:15 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -33,7 +33,7 @@ #define MULTISEQ_SRC_H #include <algo/blast/core/blast_seqsrc.h> -#include <algo/blast/core/blast_def.h> +#include <algo/blast/core/blast_program.h> #include <algo/blast/core/blast_message.h> #include <objloc.h> diff --git a/algo/blast/api/seqsrc_readdb.c b/algo/blast/api/seqsrc_readdb.c index dc6e99cd..5b1d35bd 100644 --- a/algo/blast/api/seqsrc_readdb.c +++ b/algo/blast/api/seqsrc_readdb.c @@ -1,4 +1,4 @@ -/* $Id: seqsrc_readdb.c,v 1.54 2006/05/31 19:00:52 camacho Exp $ +/* $Id: seqsrc_readdb.c,v 1.57 2007/05/15 15:29:56 camacho Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -29,7 +29,7 @@ */ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: seqsrc_readdb.c,v 1.54 2006/05/31 19:00:52 camacho Exp $"; +static char const rcsid[] = "$Id: seqsrc_readdb.c,v 1.57 2007/05/15 15:29:56 camacho Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ #include <algo/blast/api/seqsrc_readdb.h> @@ -73,6 +73,22 @@ s_ReaddbGetNumSeqs(void* readdb_handle, void* ignoreme) return dbnseqs; } +/** Retrieves a number of sequences in the BlastSeqSrc + * for use in calculating search space (and expect value). + * @param readdb_handle Pointer to initialized ReadDBFILEPtr structure [in] + * @param ignoreme Unused by this implementation [in] + */ +static Int4 +s_ReaddbGetNumSeqsStats(void* readdb_handle, void* ignoreme) +{ + ReadDBFILEPtr rdfp = (ReadDBFILEPtr) readdb_handle; + Int4 dbnseqs = 0; + Int8 dblength = 0; + + readdb_get_stats_numbers(rdfp, &dbnseqs, &dblength); + return dbnseqs; +} + /** Retrieves the total length of all sequences in the BlastSeqSrc. * @param readdb_handle Pointer to initialized ReadDBFILEPtr structure [in] * @param ignoreme Unused by this implementation [in] @@ -88,6 +104,22 @@ s_ReaddbGetTotLen(void* readdb_handle, void* ignoreme) return dblength; } +/** Retrieves the total length of all sequences in the BlastSeqSrc + * for use in calculating search space (and expect value). + * @param readdb_handle Pointer to initialized ReadDBFILEPtr structure [in] + * @param ignoreme Unused by this implementation [in] + */ +static Int8 +s_ReaddbGetTotLenStats(void* readdb_handle, void* ignoreme) +{ + ReadDBFILEPtr rdfp = (ReadDBFILEPtr) readdb_handle; + Int4 dbnseqs = 0; + Int8 dblength = 0; + + readdb_get_stats_numbers(rdfp, &dbnseqs, &dblength); + return dblength; +} + /** Retrieves the average length of sequences in the BlastSeqSrc. * @param readdb_handle Pointer to initialized ReadDBFILEPtr structure [in] * @param ignoreme Unused by this implementation [in] @@ -433,6 +465,18 @@ s_ReaddbIteratorNext(void* readdb_handle, BlastSeqSrcIterator* itr) return retval; } +/** Reset the ReadDBFilePtr's internal chunk "bookmark" + * @param readdb_handle Pointer to the ReadDBFILE structure [in] + */ +static void +s_ReaddbResetChunkIterator(void* readdb_handle) +{ + ReadDBFILEPtr rdfp = (ReadDBFILEPtr) readdb_handle; + ASSERT(rdfp); + rdfp->shared_info->last_oid_assigned = 0; + return; +} + /** Readdb sequence source destructor: frees its internal data structure and the * BlastSeqSrc structure itself. * @param bssp BlastSeqSrc structure to free [in] @@ -444,7 +488,6 @@ s_ReaddbSeqSrcFree(BlastSeqSrc* bssp) if (!bssp) return NULL; readdb_destruct((ReadDBFILEPtr)_BlastSeqSrcImpl_GetDataStructure(bssp)); - sfree(bssp); return NULL; } @@ -484,14 +527,17 @@ s_InitNewReaddbSeqSrc(BlastSeqSrc* retval, ReadDBFILE* rdfp) _BlastSeqSrcImpl_SetCopyFnPtr(retval, &s_ReaddbSeqSrcCopy); _BlastSeqSrcImpl_SetDataStructure(retval, (void*) rdfp); _BlastSeqSrcImpl_SetGetNumSeqs(retval, &s_ReaddbGetNumSeqs); + _BlastSeqSrcImpl_SetGetNumSeqsStats(retval, &s_ReaddbGetNumSeqsStats); _BlastSeqSrcImpl_SetGetMaxSeqLen(retval, &s_ReaddbGetMaxLength); _BlastSeqSrcImpl_SetGetAvgSeqLen(retval, &s_ReaddbGetAvgLength); _BlastSeqSrcImpl_SetGetTotLen(retval, &s_ReaddbGetTotLen); + _BlastSeqSrcImpl_SetGetTotLenStats(retval, &s_ReaddbGetTotLenStats); _BlastSeqSrcImpl_SetGetName(retval, &s_ReaddbGetName); _BlastSeqSrcImpl_SetGetIsProt(retval, &s_ReaddbGetIsProt); _BlastSeqSrcImpl_SetGetSequence(retval, &s_ReaddbGetSequence); _BlastSeqSrcImpl_SetGetSeqLen(retval, &s_ReaddbGetSeqLen); _BlastSeqSrcImpl_SetIterNext(retval, &s_ReaddbIteratorNext); + _BlastSeqSrcImpl_SetResetChunkIterator(retval, &s_ReaddbResetChunkIterator); _BlastSeqSrcImpl_SetReleaseSequence(retval, &s_ReaddbReleaseSequence); #ifdef KAPPA_PRINT_DIAGNOSTICS _BlastSeqSrcImpl_SetGetGis(retval, &s_ReaddbGetGis); diff --git a/algo/blast/api/twoseq_api.c b/algo/blast/api/twoseq_api.c index 16b2e30c..7e66ce17 100644 --- a/algo/blast/api/twoseq_api.c +++ b/algo/blast/api/twoseq_api.c @@ -1,4 +1,4 @@ -/* $Id: twoseq_api.c,v 1.54 2006/01/23 16:39:50 papadopo Exp $ +/* $Id: twoseq_api.c,v 1.59 2007/03/20 15:17:16 kans Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -36,9 +36,10 @@ #include <algo/blast/core/blast_message.h> #include <algo/blast/core/blast_util.h> #include <algo/blast/core/blast_engine.h> -#include <algo/blast/core/mb_lookup.h> #include <algo/blast/core/blast_filter.h> +#include <algo/blast/core/blast_nalookup.h> #include <algo/blast/core/hspstream_collector.h> +#include <algo/blast/core/gencode_singleton.h> #include <algo/blast/api/seqsrc_multiseq.h> #include <algo/blast/api/blast_seqalign.h> #include <algo/blast/api/blast_seq.h> @@ -114,11 +115,11 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, Int2 word_size = basic_options->word_size; char *matrix; - if (program_number == eBlastTypeTblastn || - program_number == eBlastTypeTblastx) { - if ((status = BLAST_GeneticCodeFind(db_options->genetic_code, - &db_options->gen_code_string))) - return status; + if (Blast_SubjectIsTranslated(program_number)) { + Uint1* gc = NULL; + BLAST_GeneticCodeFind(db_options->genetic_code, &gc); + GenCodeSingletonAdd(db_options->genetic_code, gc); + free(gc); } if (program_number == eBlastTypeBlastn) { @@ -137,7 +138,7 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, query_length > MEGABLAST_CUTOFF) { do_megablast = TRUE; if (basic_options->gapped_calculation) - greedy_align = 1; /* one-pass, no ungapped */ + greedy_align = 1; } @@ -184,7 +185,6 @@ s_TwoSeqBasicFillOptions(const BLAST_SummaryOptions* basic_options, BLAST_FillInitialWordOptions(word_options, program_number, - (Boolean)greedy_align, 0, /* default window size. */ 0); /* default ungapped X dropoff */ |