diff options
Diffstat (limited to 'algo/blast/api')
-rw-r--r-- | algo/blast/api/blast_format.c | 433 | ||||
-rw-r--r-- | algo/blast/api/blast_format.h | 44 | ||||
-rw-r--r-- | algo/blast/api/blast_returns.c | 405 | ||||
-rw-r--r-- | algo/blast/api/blast_returns.h | 105 | ||||
-rw-r--r-- | algo/blast/api/blast_seq.c | 16 | ||||
-rw-r--r-- | algo/blast/api/blast_seqalign.c | 164 | ||||
-rw-r--r-- | algo/blast/api/blast_seqalign.h | 15 | ||||
-rw-r--r-- | algo/blast/api/blast_tabular.c | 238 | ||||
-rw-r--r-- | algo/blast/api/blast_tabular.h | 101 | ||||
-rw-r--r-- | algo/blast/api/hspstream_queue.c | 182 | ||||
-rw-r--r-- | algo/blast/api/hspstream_queue.h | 64 | ||||
-rw-r--r-- | algo/blast/api/multiseq_src.c | 6 | ||||
-rw-r--r-- | algo/blast/api/seqsrc_readdb.c | 46 | ||||
-rw-r--r-- | algo/blast/api/seqsrc_readdb.h | 9 | ||||
-rw-r--r-- | algo/blast/api/twoseq_api.c | 262 | ||||
-rw-r--r-- | algo/blast/api/twoseq_api.h | 77 |
16 files changed, 1585 insertions, 582 deletions
diff --git a/algo/blast/api/blast_format.c b/algo/blast/api/blast_format.c index eb63b53a..4591df8d 100644 --- a/algo/blast/api/blast_format.c +++ b/algo/blast/api/blast_format.c @@ -1,4 +1,4 @@ -/* $Id: blast_format.c,v 1.48 2004/05/03 15:24:04 dondosha Exp $ +/* $Id: blast_format.c,v 1.51 2004/06/07 18:40:34 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -34,18 +34,19 @@ Contents: Formatting of BLAST results (SeqAlign) Detailed Contents: ****************************************************************************** - * $Revision: 1.48 $ + * $Revision: 1.51 $ * */ -static char const rcsid[] = "$Id: blast_format.c,v 1.48 2004/05/03 15:24:04 dondosha Exp $"; +static char const rcsid[] = "$Id: blast_format.c,v 1.51 2004/06/07 18:40:34 dondosha Exp $"; #include <algo/blast/api/blast_format.h> #include <algo/blast/api/blast_seq.h> #include <algo/blast/core/blast_filter.h> #include <algo/blast/core/blast_util.h> +#include <algo/blast/core/blast_seqsrc.h> +#include <algo/blast/api/blast_returns.h> #include <sequtil.h> #include <txalign.h> -#include <readdb.h> extern Uint1 LIBCALL BlastGetTypes PROTO((char* blast_program, Boolean* query_is_na, @@ -91,19 +92,23 @@ extern CharPtr LIBCALL BlastGetReference (Boolean html); extern CharPtr LIBCALL BlastGetVersionNumber PROTO((void)); extern CharPtr LIBCALL BlastGetReleaseDate PROTO((void)); -typedef struct TxDfDbInfo { - struct TxDfDbInfo* next; - Boolean is_protein; - char* name; - char* definition; - char* date; - Int8 total_length; - Int4 number_seqs; - Boolean subset; /* Print the subset message. */ -} TxDfDbInfo, *TxDfDbInfoPtr; +/** This function is defined in distrib/tools/blastool.c; 1st argument is + * a pointer to TxDfDbInfo, defined in blastpri.h, which is identical to + * BLAST_DbSummary, defined in algo/blast/api/blast_returns.h + */ +extern Boolean LIBCALL +PrintDbReport PROTO((BLAST_DbSummary* dbinfo, Int4 line_length, FILE *outfp)); + +/** The following 3 functions are from distrib/tools/readdb.h */ +extern Boolean LIBCALL +ReadDBBioseqFetchEnable PROTO((CharPtr program, CharPtr dbname, Boolean is_na, + Boolean now)); + +extern void LIBCALL ReadDBBioseqFetchDisable PROTO((void)); extern Boolean LIBCALL -PrintDbReport PROTO((TxDfDbInfo* dbinfo, Int4 line_length, FILE *outfp)); +PrintDbInformation PROTO((CharPtr database, Boolean is_aa, Int4 line_length, + FILE *outfp, Boolean html)); Int2 BlastFormattingOptionsNew(Uint1 program_number, char* file_out, Int4 num_descriptions, Int4 num_alignments, Int4 align_view, @@ -395,7 +400,7 @@ static MBXml* MBXmlInit(AsnIoPtr aip, CharPtr program, CharPtr database, Int2 BLAST_FormatResults(SeqAlignPtr head, char* blast_database, char* blast_program, Int4 num_queries, SeqLocPtr query_slp, BlastMaskLoc* blast_mask, - BlastFormattingOptions* format_options, Boolean is_ooframe) + const BlastFormattingOptions* format_options, Boolean is_ooframe) { SeqAlignPtr seqalign = head, sap, next_seqalign; SeqLocPtr mask_loc, next_mask_loc = NULL, tmp_loc = NULL, mask_loc_head; @@ -598,277 +603,20 @@ Int2 BLAST_FormatResults(SeqAlignPtr head, char* blast_database, return 0; } -static TxDfDbInfo* LIBCALL -TxDfDbInfoDestruct (TxDfDbInfo* dbinfo) -{ - TxDfDbInfo* next; - - if (dbinfo == NULL) - return NULL; - - while (dbinfo) - { - sfree(dbinfo->name); - sfree(dbinfo->definition); - sfree(dbinfo->date); - next = dbinfo->next; - sfree(dbinfo); - dbinfo = next; - } - - return dbinfo; -} - -static TxDfDbInfo* BLAST_GetDbInfo(ReadDBFILEPtr rdfp) -{ - TxDfDbInfo* dbinfo,* head = NULL,* dbinfo_var = NULL; - char* chptr; - - while (rdfp) { - dbinfo = calloc(1, sizeof(TxDfDbInfo)); - dbinfo->name = strdup(readdb_get_filename(rdfp)); - - if((chptr = readdb_get_title(rdfp)) == NULL) - chptr = readdb_get_filename(rdfp); - dbinfo->definition = strdup(chptr); - - dbinfo->date = strdup(readdb_get_date(rdfp)); - - dbinfo->is_protein = readdb_is_prot(rdfp); - - if (rdfp->aliaslen) - dbinfo->total_length = rdfp->aliaslen; - else - dbinfo->total_length = readdb_get_dblen(rdfp); - if (rdfp->aliasnseq) - dbinfo->number_seqs = rdfp->aliasnseq; - else - dbinfo->number_seqs = readdb_get_num_entries(rdfp); - if (head == NULL) { - head = dbinfo; - dbinfo_var = dbinfo; - } else { - dbinfo_var->next = dbinfo; - dbinfo_var = dbinfo_var->next; - } - rdfp = rdfp->next; - } - return head; -} -/* - adds the new string to the buffer, separating by a tilde. - Checks the size of the buffer for FormatBlastParameters and - allocates longer replacement if needed. -*/ - -static Boolean -add_string_to_bufferEx(char* buffer, char* *old, Int2* old_length, Boolean add_tilde) - -{ - char* new,* ptr; - Int2 length, new_length; - - length = (StringLen(*old)); - - if((Int2)(StringLen(buffer)+length+3) > *old_length) - { - new_length = *old_length + 255; - new = calloc(new_length, sizeof(char)); - if (*old_length > 0 && *old != NULL) - { - memcpy(new, *old, *old_length); - sfree(*old); - } - *old = new; - *old_length = new_length; - } - - ptr = *old; - ptr += length; - if (add_tilde) - { - *ptr = '~'; - ptr++; - } - - while (*buffer != NULLB) - { - *ptr = *buffer; - buffer++; ptr++; - } - - return TRUE; -} - -static Boolean -add_string_to_buffer(char* buffer, char* *old, Int2* old_length) - -{ - return add_string_to_bufferEx(buffer, old, old_length, TRUE); -} - - - -/* - Formats the BLAST parameters for the BLAST report. - One char* is returned, newlines are indicated by tildes ('~'). -*/ - - -static char* -FormatBlastParameters(Uint1 program_number, - BlastScoringOptions* score_options, - BlastScoreBlk* sbp, LookupTableOptions* lookup_options, - BlastInitialWordOptions* word_options, - BlastExtensionOptions* ext_options, - BlastHitSavingOptions* hit_options, - BlastQueryInfo* query_info, ReadDBFILEPtr rdfp, - BlastReturnStat* return_stats) -{ - Int4 cutoff = 0; - char buffer[128]; - char* ret_buffer; - Int2 ret_buffer_length; - Int4 num_entries; - Int8 total_length; - Int4 qlen; - double evalue; - Boolean single_query = (query_info->last_context <= 1); - Blast_KarlinBlk* kbp; - - ret_buffer = NULL; - ret_buffer_length = 0; - - - if (score_options->matrix) { - sprintf(buffer, "Matrix: %s", score_options->matrix); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - - if (score_options->gapped_calculation) { - sprintf(buffer, "Gap Penalties: Existence: %ld, Extension: %ld", - (long) score_options->gap_open, - (long) score_options->gap_extend); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - - if (rdfp) { - readdb_get_totals_ex(rdfp, &total_length, &num_entries, TRUE); - } else { - num_entries = 1; - total_length = 1000; - } - - sprintf(buffer, "Number of Sequences: %ld", (long) num_entries); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - if (return_stats) { - sprintf(buffer, "Number of Hits to DB: %s", - Nlm_Int8tostr((Int8) return_stats->db_hits, 1)); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - sprintf(buffer, "Number of extensions: %ld", - (long) return_stats->init_extends); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - sprintf(buffer, "Number of successful extensions: %ld", - (long) return_stats->good_init_extends); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - if (hit_options->expect_value > 0.1) { - sprintf(buffer, "Number of sequences better than %4.1f: %ld", - hit_options->expect_value, - (long) return_stats->number_of_seqs_better_E); - } else { - sprintf(buffer, "Number of sequences better than %3.1e: %ld", - hit_options->expect_value, - (long) return_stats->number_of_seqs_better_E); - } - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - if (score_options->gapped_calculation) { - sprintf(buffer, - "Number of HSP's better than %4.1f without gapping: %ld", - hit_options->expect_value, - (long) return_stats->prelim_gap_no_contest); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - sprintf(buffer, - "Number of HSP's successfully gapped in prelim test: %ld", - (long) return_stats->prelim_gap_passed); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - } - /* Total query length does not include the first and last sentinel byte */ - qlen = query_info->context_offsets[query_info->last_context+1] - 1; - sprintf(buffer, "length of query: %ld", (long)qlen); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - sprintf(buffer, "length of database: %s", Nlm_Int8tostr (total_length, 1)); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - if (single_query) { - sprintf(buffer, "effective search space used: %s", - Nlm_Int8tostr( - query_info->eff_searchsp_array[query_info->first_context], 1)); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - sprintf(buffer, "T: %ld", (long) lookup_options->threshold); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - sprintf(buffer, "A: %ld", (long) word_options->window_size); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - if (!score_options->gapped_calculation) - kbp = sbp->kbp[query_info->first_context]; - else - kbp = sbp->kbp_gap[query_info->first_context]; - - sprintf(buffer, "X1: %ld (%4.1f bits)", - (long)return_stats->x_drop_ungapped, word_options->x_dropoff); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - if (score_options->gapped_calculation) { - sprintf(buffer, "X2: %ld (%4.1f bits)", - (long)return_stats->x_drop_gap, ext_options->gap_x_dropoff); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - sprintf(buffer, "X3: %ld (%4.1f bits)", - (long)return_stats->x_drop_gap_final, - ext_options->gap_x_dropoff_final); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - - sprintf(buffer, "S1: %ld (%4.1f bits)", - (long)return_stats->gap_trigger, ext_options->gap_trigger); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - - cutoff = 0; - if (single_query) { - Int4 context = query_info->first_context; - double searchsp = (double) query_info->eff_searchsp_array[context]; - - /* For translated RPS blast the search space must be scaled down */ - if (program_number == blast_type_rpstblastn) - searchsp = searchsp / NUM_FRAMES; - - evalue = hit_options->expect_value; - BLAST_Cutoffs(&cutoff, &evalue, kbp, searchsp, FALSE, 0); - sprintf(buffer, "S2: %ld (%4.1f bits)", (long) cutoff, - (((cutoff)*(kbp->Lambda))-(kbp->logK))/NCBIMATH_LN2); - add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); - } - return ret_buffer; -} - Int2 PrintOutputFooter(Uint1 program_number, - BlastFormattingOptions* format_options, - BlastScoringOptions* score_options, BlastScoreBlk* sbp, - LookupTableOptions* lookup_options, - BlastInitialWordOptions* word_options, - BlastExtensionOptions* ext_options, - BlastHitSavingOptions* hit_options, - BlastQueryInfo* query_info, char* dbname, - BlastReturnStat* return_stats, - Boolean db_is_na) + const BlastFormattingOptions* format_options, + const BlastScoringOptions* score_options, const BlastScoreBlk* sbp, + const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics) { FILE *outfp; - TxDfDbInfo* dbinfo_head,* dbinfo; + BLAST_DbSummary* dbinfo_head,* dbinfo; char* params_buffer; - ReadDBFILEPtr rdfp = NULL; if (!format_options || !format_options->outfp) return -1; @@ -882,14 +630,12 @@ Int2 PrintOutputFooter(Uint1 program_number, fprintf(outfp, "<PRE>\n"); init_buff_ex(85); - if (dbname && (rdfp = readdb_new(dbname, !db_is_na))) { - dbinfo_head = BLAST_GetDbInfo(rdfp); + dbinfo_head = Blast_GetDbSummary(seq_src); - for (dbinfo = dbinfo_head; dbinfo; dbinfo = dbinfo->next) { - PrintDbReport(dbinfo, 70, outfp); - } - dbinfo_head = TxDfDbInfoDestruct(dbinfo_head); + for (dbinfo = dbinfo_head; dbinfo; dbinfo = dbinfo->next) { + PrintDbReport(dbinfo, 70, outfp); } + dbinfo_head = Blast_DbSummaryFree(dbinfo_head); if (sbp && sbp->kbp) { Blast_KarlinBlk* ka_params; @@ -908,20 +654,19 @@ Int2 PrintOutputFooter(Uint1 program_number, } params_buffer = - FormatBlastParameters(program_number, score_options, sbp, - lookup_options, word_options, ext_options, hit_options, query_info, - rdfp, return_stats); + Blast_GetParametersBuffer(program_number, score_options, sbp, + lookup_options, word_options, ext_options, + hit_options, eff_len_options, query_info, + seq_src, diagnostics); PrintTildeSepLines(params_buffer, 70, outfp); sfree(params_buffer); free_buff(); - readdb_destruct(rdfp); - return 0; } -Int2 BLAST_PrintOutputHeader(BlastFormattingOptions* format_options, +Int2 BLAST_PrintOutputHeader(const BlastFormattingOptions* format_options, Boolean is_megablast, char* dbname, Boolean is_protein) { if (format_options->align_view < 7) { @@ -965,24 +710,15 @@ Int2 BLAST_PrintOutputHeader(BlastFormattingOptions* format_options, #define BUFFER_LENGTH 256 -static void -PrintSeqDefline(SeqIdPtr sip, char* descr, char** buffer_ptr, +void +Blast_SeqIdGetDefLine(SeqIdPtr sip, char* descr, char** buffer_ptr, Boolean ncbi_gi, Boolean accession_only, Boolean seqid_only, Boolean believe_local_id) { - char* seqid_buffer = NULL, *title = NULL, *defline_buffer; + char* seqid_buffer = NULL, *title = NULL, *defline_buffer = NULL; Int4 gi; Boolean numeric_sip_type = FALSE; - if (!descr) { - /* Get the title */ - BioseqPtr bsp = BioseqLockById(sip); - title = strdup(BioseqGetTitle(bsp)); - BioseqUnlock(bsp); - } else { - title = descr; - } - if ((believe_local_id || sip->choice != SEQID_LOCAL) && (sip->choice != SEQID_GENERAL || StringCmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID"))) { @@ -998,8 +734,20 @@ PrintSeqDefline(SeqIdPtr sip, char* descr, char** buffer_ptr, GetAccessionFromSeqId(SeqIdFindBestAccession(sip), &gi, &seqid_buffer); } - } else if (seqid_only && title) { - seqid_buffer = StringTokMT(title, " \t\n\r", &title); + } else { + if (!descr) { + /* Get the title */ + BioseqPtr bsp = BioseqLockById(sip); + if (bsp) { + title = strdup(BioseqGetTitle(bsp)); + BioseqUnlock(bsp); + } + } else { + title = descr; + } + + if (seqid_only) + seqid_buffer = StringTokMT(title, " \t\n\r", &title); } if (numeric_sip_type && !seqid_buffer) { @@ -1021,35 +769,13 @@ PrintSeqDefline(SeqIdPtr sip, char* descr, char** buffer_ptr, *buffer_ptr = defline_buffer; } -static void ScoreAndEvalueToBuffers(double bit_score, double evalue, - char* *bit_score_buf, char* *evalue_buf) -{ - if (evalue < 1.0e-180) - sprintf(*evalue_buf, "0.0"); - else if (evalue < 1.0e-99) - sprintf(*evalue_buf, "%2.0e", evalue); - else if (evalue < 0.0009) - sprintf(*evalue_buf, "%3.1e", evalue); - else if (evalue < 1.0) - sprintf(*evalue_buf, "%4.3f", evalue); - else - sprintf(*evalue_buf, "%5.1f", evalue); - - if (bit_score > 9999) - sprintf(*bit_score_buf, "%4.3e", bit_score); - else if (bit_score > 99.9) - sprintf(*bit_score_buf, "%4.1f", bit_score); - else - sprintf(*bit_score_buf, "%4.2f", bit_score); -} - /* This function might serve as a starting point for a callback function * that prints results before the traceback stage, e.g. the on-the-fly * tabular output, a la the -D3 option of the old megablast. */ void BLAST_PrintIntermediateResults(BlastHSPResults* results, BlastQueryInfo* query_info, SeqLocPtr query_slp, - ReadDBFILEPtr rdfp, SeqIdPtr seqid, BlastScoreBlk* sbp, + BlastSeqSrc* seq_src, BlastScoreBlk* sbp, char* filename) { Int4 query_index, hit_index, hsp_index; @@ -1061,33 +787,24 @@ void BLAST_PrintIntermediateResults(BlastHSPResults* results, Int4 q_start, q_end, s_start, s_end; FILE *outfp; SeqLocPtr slp; - char* subject_descr; Blast_KarlinBlk* kbp; - char* bit_score_buff,* eval_buff; - double bit_score; + char bit_score_buff[10], eval_buff[10]; + char* eval_buff_ptr = NULL; BlastHSP* hsp; + ListNode* seqid_wrap = NULL; - if (!results || !query_info || !query_slp || (!rdfp && !seqid) || + if (!results || !query_info || !query_slp || !seq_src || !sbp || !filename) return; outfp = FileOpen(filename, "w"); - eval_buff = (char *) malloc(10); - bit_score_buff = (char *) malloc(10); - - if (!rdfp) { - /* Two sequences case */ - subject_id = seqid; - subject_descr = NULL; - } - for (query_index = 0, slp = query_slp; query_index < results->num_queries && slp; ++query_index, slp = slp->next) { hit_list = results->hitlist_array[query_index]; query_id = SeqLocId(slp); - PrintSeqDefline(query_id, NULL, &query_buffer, TRUE, FALSE, FALSE, + Blast_SeqIdGetDefLine(query_id, NULL, &query_buffer, TRUE, FALSE, FALSE, FALSE); fprintf(outfp, "#Query = %s\n\n", query_buffer); sfree(query_buffer); @@ -1100,11 +817,16 @@ void BLAST_PrintIntermediateResults(BlastHSPResults* results, for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) { hsp_list = hit_list->hsplist_array[hit_index]; - if (rdfp) { - readdb_get_descriptor(rdfp, hsp_list->oid, &subject_id, - &subject_descr); + BLASTSeqSrcGetSeqId(seq_src, (void*) &hsp_list->oid); + if (seqid_wrap->choice == BLAST_SEQSRC_C_SEQID) { + subject_id = (SeqId*) seqid_wrap->ptr; + ListNodeFree(seqid_wrap); + } else { + /* Could not retrieve id for this subject sequence. This should + never happen, but if it does, skip this HSP. */ + continue; } - PrintSeqDefline(subject_id, subject_descr, &subject_buffer, + Blast_SeqIdGetDefLine(subject_id, NULL, &subject_buffer, TRUE, FALSE, FALSE, TRUE); fprintf(outfp, ">%s\n\n", subject_buffer); sfree(subject_buffer); @@ -1124,19 +846,18 @@ void BLAST_PrintIntermediateResults(BlastHSPResults* results, } kbp = sbp->kbp[hsp->context]; - bit_score = (hsp->score*kbp->Lambda - kbp->logK) / - NCBIMATH_LN2; - ScoreAndEvalueToBuffers(bit_score, hsp->evalue, - &bit_score_buff, &eval_buff); + eval_buff_ptr = eval_buff; + + ScoreAndEvalueToBuffers(hsp->bit_score, hsp->evalue, + bit_score_buff, &eval_buff_ptr, TRUE); fprintf(outfp, "[%ld %ld] [%ld %ld] %s %s\n", (long)q_start, (long)q_end, (long)s_start, (long)s_end, - bit_score_buff, eval_buff); + bit_score_buff, eval_buff_ptr); } } } } FileClose(outfp); } - diff --git a/algo/blast/api/blast_format.h b/algo/blast/api/blast_format.h index 28535ff6..24f3bfa9 100644 --- a/algo/blast/api/blast_format.h +++ b/algo/blast/api/blast_format.h @@ -1,4 +1,4 @@ -/* $Id: blast_format.h,v 1.20 2004/04/22 22:15:40 dondosha Exp $ +/* $Id: blast_format.h,v 1.22 2004/06/07 18:40:48 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,7 +32,7 @@ Author: Ilya Dondoshansky Contents: Functions needed for formatting of BLAST results ****************************************************************************** - * $Revision: 1.20 $ + * $Revision: 1.22 $ * */ #ifndef __BLAST_FORMAT__ #define __BLAST_FORMAT__ @@ -48,10 +48,11 @@ extern "C" { #include <ncbi.h> #include <asn.h> #include <bxmlobj.h> -#include <readdb.h> #include <algo/blast/core/blast_options.h> #include <algo/blast/core/blast_hits.h> -#include <algo/blast/api/seqsrc_readdb.h> +#include <algo/blast/core/blast_seqsrc.h> +#include <algo/blast/core/blast_diagnostics.h> +#include <algo/blast/api/twoseq_api.h> /** Options for formatting BLAST results */ @@ -117,7 +118,7 @@ typedef struct MBXml { Int2 BLAST_FormatResults(SeqAlignPtr head, char* blast_database, char* blast_program, Int4 num_queries, SeqLocPtr query_slp, BlastMaskLoc* mask_loc, - BlastFormattingOptions* format_options, Boolean is_ooframe); + const BlastFormattingOptions* format_options, Boolean is_ooframe); /** Print the summary at the end of the BLAST report. * @param program_number Type of BLAST program [in] @@ -128,20 +129,23 @@ Int2 BLAST_FormatResults(SeqAlignPtr head, char* blast_database, * @param word_options Word finding options and parameters [in] * @param ext_options Extension options and parameters [in] * @param hit_options Hit saving options [in] + * @param eff_len_options Effective lengths options, containing user-specified + * values for database length or eff. search space [in] * @param query_info Query information [in] - * @param dbname BLAST database name [in] - * @param return_stats Data about this run [in] - * @param db_is_na TRUE if a nucleotide database [in] + * @param seq_src Source of subject sequences [in] + * @param diagnostics Data about this run [in] */ Int2 PrintOutputFooter(Uint1 program_number, - BlastFormattingOptions* format_options, - BlastScoringOptions* score_options, BlastScoreBlk* sbp, - LookupTableOptions* lookup_options, - BlastInitialWordOptions* word_options, - BlastExtensionOptions* ext_options, - BlastHitSavingOptions* hit_options, - BlastQueryInfo* query_info, char* dbname, - BlastReturnStat* return_stats, Boolean db_is_na); + const BlastFormattingOptions* format_options, + const BlastScoringOptions* score_options, + const BlastScoreBlk* sbp, + const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics); /** Prints the top part of the traditional BLAST output, including version, * reference(s) and database information. @@ -151,13 +155,17 @@ Int2 PrintOutputFooter(Uint1 program_number, * @param dbname BLAST database name [in] * @param is_protein Is the database protein or nucleotide? [in] */ -Int2 BLAST_PrintOutputHeader(BlastFormattingOptions* format_options, +Int2 BLAST_PrintOutputHeader(const BlastFormattingOptions* format_options, Boolean is_megablast, char* dbname, Boolean is_protein); void BLAST_PrintIntermediateResults(BlastHSPResults* results, BlastQueryInfo* query_info, SeqLocPtr query_slp, - ReadDBFILEPtr rdfp, SeqIdPtr seqid, BlastScoreBlk* sbp, + BlastSeqSrc* seq_src, BlastScoreBlk* sbp, char* filename); +void +Blast_SeqIdGetDefLine(SeqIdPtr sip, char* descr, char** buffer_ptr, + Boolean ncbi_gi, Boolean accession_only, + Boolean seqid_only, Boolean believe_local_id); #ifdef __cplusplus diff --git a/algo/blast/api/blast_returns.c b/algo/blast/api/blast_returns.c new file mode 100644 index 00000000..6d385346 --- /dev/null +++ b/algo/blast/api/blast_returns.c @@ -0,0 +1,405 @@ +/* $Id: blast_returns.c,v 1.1 2004/05/14 17:19:03 dondosha Exp $ +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's offical duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/***************************************************************************** + +File name: blast_returns.c + +Author: Ilya Dondoshansky + +Contents: Manipulating data returned from BLAST other than Seq-aligns + +Detailed Contents: + +****************************************************************************** + * $Revision: 1.1 $ + * */ + +static char const rcsid[] = "$Id: blast_returns.c,v 1.1 2004/05/14 17:19:03 dondosha Exp $"; + +#include <algo/blast/api/blast_returns.h> +#include <algo/blast/api/blast_seq.h> +#include <algo/blast/core/blast_filter.h> +#include <algo/blast/core/blast_util.h> +#include <algo/blast/core/blast_seqsrc.h> + +BLAST_DbSummary* LIBCALL +Blast_DbSummaryFree (BLAST_DbSummary* dbinfo) +{ + BLAST_DbSummary* next; + + if (dbinfo == NULL) + return NULL; + + while (dbinfo) + { + sfree(dbinfo->name); + sfree(dbinfo->definition); + sfree(dbinfo->date); + next = dbinfo->next; + sfree(dbinfo); + dbinfo = next; + } + + return dbinfo; +} + +BLAST_DbSummary* Blast_GetDbSummary(const BlastSeqSrc* seq_src) +{ + BLAST_DbSummary* dbinfo = NULL; + char* chptr = NULL; + + dbinfo = calloc(1, sizeof(BLAST_DbSummary)); + dbinfo->name = BLASTSeqSrcGetName(seq_src); + + if((chptr = BLASTSeqSrcGetDefinition(seq_src)) == NULL) + chptr = dbinfo->name; + + if (chptr) + dbinfo->definition = strdup(chptr); + + dbinfo->date = BLASTSeqSrcGetDate(seq_src); + + dbinfo->is_protein = BLASTSeqSrcGetIsProt(seq_src); + + if ((dbinfo->total_length = BLASTSeqSrcGetTotLen(seq_src)) == 0) + dbinfo->total_length = BLASTSeqSrcGetMaxSeqLen(seq_src); + dbinfo->number_seqs = BLASTSeqSrcGetNumSeqs(seq_src); + + return dbinfo; +} + +/* + adds the new string to the buffer, separating by a tilde. + Checks the size of the buffer for Blast_GetParametersBuffer and + allocates longer replacement if needed. +*/ + +static Boolean +add_string_to_buffer(char* buffer, char* *old, Int2* old_length) + +{ + char* new,* ptr; + Int2 length = 0, new_length; + + if (!buffer) + return FALSE; + + if (*old) + length = (strlen(*old)); + + if((Int2)(strlen(buffer)+length+3) > *old_length) + { + new_length = *old_length + 255; + new = calloc(new_length, sizeof(char)); + if (*old_length > 0 && *old != NULL) + { + memcpy(new, *old, *old_length); + sfree(*old); + } + *old = new; + *old_length = new_length; + } + + ptr = *old; + ptr += length; + + /* Add a tilde */ + *ptr = '~'; + ptr++; + + while (*buffer != NULLB) + { + *ptr = *buffer; + buffer++; ptr++; + } + + return TRUE; +} + +char* +Blast_GetParametersBuffer(Uint1 program_number, + const BlastScoringOptions* score_options, + const BlastScoreBlk* sbp, const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics) +{ + Int4 cutoff = 0; + char buffer[128]; + char* ret_buffer; + Int2 ret_buffer_length; + Int4 num_entries = 0; + Int8 total_length = 0; + Int4 qlen = 0; + double evalue; + Int2 num_frames; + Boolean single_query; + Blast_KarlinBlk* kbp; + BlastUngappedStats* ungapped_stats = NULL; + BlastGappedStats* gapped_stats = NULL; + BlastRawCutoffs* raw_cutoffs = NULL; + + ret_buffer = NULL; + ret_buffer_length = 0; + + if (program_number == blast_type_blastx || + program_number == blast_type_tblastx) + num_frames = NUM_FRAMES; + else if (program_number == blast_type_blastn) + num_frames = 2; + else + num_frames = 1; + + single_query = (query_info->last_context < num_frames); + + if (diagnostics) { + ungapped_stats = diagnostics->ungapped_stat; + gapped_stats = diagnostics->gapped_stat; + raw_cutoffs = diagnostics->cutoffs; + } + + if (score_options->matrix) { + sprintf(buffer, "Matrix: %s", score_options->matrix); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + + if (score_options->gapped_calculation) { + sprintf(buffer, "Gap Penalties: Existence: %ld, Extension: %ld", + (long) score_options->gap_open, + (long) score_options->gap_extend); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + + if (eff_len_options->db_length) + total_length = eff_len_options->db_length; + else if (seq_src) { + if ((total_length = BLASTSeqSrcGetTotLen(seq_src)) == 0) + total_length = BLASTSeqSrcGetMaxSeqLen(seq_src); + } + + if (program_number == blast_type_tblastn || + program_number == blast_type_rpstblastn || + program_number == blast_type_tblastx) + total_length /= 3; + + if (eff_len_options->dbseq_num) + num_entries = eff_len_options->dbseq_num; + else if (seq_src) + num_entries = BLASTSeqSrcGetNumSeqs(seq_src); + + sprintf(buffer, "Number of Sequences: %ld", (long) num_entries); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + if (ungapped_stats) { + sprintf(buffer, "Number of Hits to DB: %s", + Nlm_Int8tostr(ungapped_stats->lookup_hits, 1)); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + sprintf(buffer, "Number of extensions: %ld", + (long) ungapped_stats->init_extends); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, "Number of successful extensions: %ld", + (long) ungapped_stats->good_init_extends); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + + if (gapped_stats) { + if (hit_options->expect_value > 0.1) { + sprintf(buffer, "Number of sequences better than %4.1f: %ld", + hit_options->expect_value, + (long) gapped_stats->num_seqs_passed); + } else { + sprintf(buffer, "Number of sequences better than %3.1e: %ld", + hit_options->expect_value, + (long) gapped_stats->num_seqs_passed); + } + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + if (score_options->gapped_calculation) { + sprintf(buffer, + "Number of HSP's better than %4.1f without gapping: %ld", + hit_options->expect_value, + (long) gapped_stats->seqs_ungapped_passed); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, + "Number of HSP's gapped: %ld", + (long) gapped_stats->extensions); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, + "Number of HSP's successfully gapped: %ld", + (long) gapped_stats->good_extensions); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, + "Number of extra gapped extensions for HSPs above %4.1f: %ld", + hit_options->expect_value, + (long) gapped_stats->extra_extensions); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + } + + /* Query length makes sense only for single query sequence. */ + if (single_query) { + qlen = BLAST_GetQueryLength(query_info, query_info->first_context); + sprintf(buffer, "Length of query: %ld", (long)qlen); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + + sprintf(buffer, "Length of database: %s", Nlm_Int8tostr (total_length, 1)); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + if (single_query) { + Int4 length_adjustment = + query_info->length_adjustments[query_info->first_context]; + Int4 eff_qlen; + Int8 eff_dblen; + sprintf(buffer, "Length adjustment: %ld", (long) length_adjustment); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + /** FIXME: Should this be different for RPS BLAST? */ + eff_qlen = qlen - length_adjustment; + sprintf(buffer, "Effective length of query: %ld", (long)eff_qlen); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + eff_dblen = total_length - num_entries*length_adjustment; + sprintf(buffer, "Effective length of database: %s", + Nlm_Int8tostr (eff_dblen , 1)); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, "Effective search space: %8.0f", + ((double) eff_dblen)*((double) eff_qlen)); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, "Effective search space used: %8.0f", + (double)query_info->eff_searchsp_array[query_info->first_context]); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + sprintf(buffer, "Neighboring words threshold: %ld", + (long) lookup_options->threshold); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, "Window for multiple hits: %ld", + (long) word_options->window_size); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + if (raw_cutoffs) { + kbp = sbp->kbp[query_info->first_context]; + sprintf(buffer, "X1: %ld (%4.1f bits)", + (long)raw_cutoffs->x_drop_ungapped, + raw_cutoffs->x_drop_ungapped*kbp->Lambda/NCBIMATH_LN2); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + if (score_options->gapped_calculation) { + sprintf(buffer, "X2: %ld (%4.1f bits)", + (long)raw_cutoffs->x_drop_gap, ext_options->gap_x_dropoff); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + sprintf(buffer, "X3: %ld (%4.1f bits)", + (long)raw_cutoffs->x_drop_gap_final, + ext_options->gap_x_dropoff_final); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + + sprintf(buffer, "S1: %ld (%4.1f bits)", + (long)raw_cutoffs->gap_trigger, ext_options->gap_trigger); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + } + cutoff = 0; + if (single_query) { + Int4 context = query_info->first_context; + double searchsp = (double) query_info->eff_searchsp_array[context]; + + /* For translated RPS blast the search space must be scaled down */ + if (program_number == blast_type_rpstblastn) + searchsp = searchsp / NUM_FRAMES; + + evalue = hit_options->expect_value; + if (!score_options->gapped_calculation) + kbp = sbp->kbp[query_info->first_context]; + else + kbp = sbp->kbp_gap[query_info->first_context]; + + BLAST_Cutoffs(&cutoff, &evalue, kbp, searchsp, FALSE, 0); + sprintf(buffer, "S2: %ld (%4.1f bits)", (long) cutoff, + (((cutoff)*(kbp->Lambda))-(kbp->logK))/NCBIMATH_LN2); + add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length); + } + return ret_buffer; +} + +/** Save the Karlin-Altschul parameters calculated in the BLAST search. + * @param sbp Internal scoring block structure [in] + * @param context Index into the array of structures containing + * Karlin-Altschul parameters [in] + * @param sum_returns Returns summary structure [out] +*/ +static void +Blast_SummaryFillKAParameters(const BlastScoreBlk* sbp, Int4 context, + BLAST_SummaryReturn* sum_returns) +{ + Blast_KarlinBlk* kbp; + + if (!sbp) + return; + + if (sbp->kbp) { + kbp = sbp->kbp[context]; + sum_returns->ka_params = + (BLAST_KAParameters*) malloc(sizeof(BLAST_KAParameters)); + sum_returns->ka_params->Lambda = kbp->Lambda; + sum_returns->ka_params->K = kbp->K; + sum_returns->ka_params->H = kbp->H; + } + + if (sbp->kbp_gap) { + kbp = sbp->kbp_gap[context]; + sum_returns->ka_params_gap = + (BLAST_KAParameters*) malloc(sizeof(BLAST_KAParameters)); + sum_returns->ka_params_gap->Lambda = kbp->Lambda; + sum_returns->ka_params_gap->K = kbp->K; + sum_returns->ka_params_gap->H = kbp->H; + } +} + +void Blast_SummaryReturnFill(Uint1 program_number, + const BlastScoringOptions* score_options, + const BlastScoreBlk* sbp, + const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, + const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics, + BLAST_SummaryReturn** sum_returns_out) +{ + BLAST_SummaryReturn* sum_returns = + (BLAST_SummaryReturn*) calloc(1, sizeof(BLAST_SummaryReturn)); + Blast_SummaryFillKAParameters(sbp, query_info->first_context, sum_returns); + sum_returns->params_buffer = + Blast_GetParametersBuffer(program_number, score_options, sbp, + lookup_options, word_options, ext_options, + hit_options, eff_len_options, query_info, + seq_src, diagnostics); + *sum_returns_out = sum_returns; +} diff --git a/algo/blast/api/blast_returns.h b/algo/blast/api/blast_returns.h new file mode 100644 index 00000000..74c78992 --- /dev/null +++ b/algo/blast/api/blast_returns.h @@ -0,0 +1,105 @@ +/* $Id: blast_returns.h,v 1.1 2004/05/14 17:19:03 dondosha Exp $ +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's offical duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/***************************************************************************** + +File name: blast_returns.h + +Author: Ilya Dondoshansky + +Contents: Manipulation of data returned from BLAST other than Seq-aligns + +****************************************************************************** + * $Revision: 1.1 $ + * */ +#ifndef __BLAST_RETURNS__ +#define __BLAST_RETURNS__ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NCBI_C_TOOLKIT +#define NCBI_C_TOOLKIT +#endif + +#include <algo/blast/core/blast_options.h> +#include <algo/blast/core/blast_hits.h> +#include <algo/blast/core/blast_seqsrc.h> +#include <algo/blast/core/blast_diagnostics.h> +#include <algo/blast/api/twoseq_api.h> + +typedef struct BLAST_DbSummary { + struct BLAST_DbSummary* next; + Boolean is_protein; + char* name; + char* definition; + char* date; + Int8 total_length; + Int4 number_seqs; + Boolean subset; /* Print the subset message. */ +} BLAST_DbSummary; + +BLAST_DbSummary* LIBCALL +Blast_DbSummaryFree (BLAST_DbSummary* dbinfo); + +/** Retrieves necessary information from a sequence source and fills the + * BLAST_DbSummary structure. + */ +BLAST_DbSummary* Blast_GetDbSummary(const BlastSeqSrc* seq_src); + +/** Formats the BLAST parameters for the BLAST report. + * One char* is returned, newlines are indicated by tildes ('~'). + */ +char* +Blast_GetParametersBuffer(Uint1 program_number, + const BlastScoringOptions* score_options, + const BlastScoreBlk* sbp, const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics); + +/** Fills the summary returns */ +void Blast_SummaryReturnFill(Uint1 program_number, + const BlastScoringOptions* score_options, + const BlastScoreBlk* sbp, + const LookupTableOptions* lookup_options, + const BlastInitialWordOptions* word_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastQueryInfo* query_info, + const BlastSeqSrc* seq_src, + const BlastDiagnostics* diagnostics, + BLAST_SummaryReturn** sum_returns_out); + +#ifdef __cplusplus +} +#endif +#endif /* !__BLAST_FORMAT__ */ + diff --git a/algo/blast/api/blast_seq.c b/algo/blast/api/blast_seq.c index c4bb1a2c..2eb93b5d 100644 --- a/algo/blast/api/blast_seq.c +++ b/algo/blast/api/blast_seq.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: blast_seq.c,v 1.39 2004/04/16 14:44:07 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_seq.c,v 1.40 2004/05/14 17:20:50 dondosha Exp $"; /* * =========================================================================== * @@ -33,7 +33,7 @@ Author: Ilya Dondoshansky Contents: Functions converting between SeqLocs and structures used in BLAST. ****************************************************************************** - * $Revision: 1.39 $ + * $Revision: 1.40 $ * */ #include <seqport.h> @@ -274,7 +274,7 @@ static Int4 BLAST_SetUpQueryInfo(SeqLocPtr slp, Uint1 program, Uint4 max_length = 0; if (translate) - num_frames = 6; + num_frames = NUM_FRAMES; else if (is_na) num_frames = 2; else @@ -302,14 +302,14 @@ static Int4 BLAST_SetUpQueryInfo(SeqLocPtr slp, Uint1 program, } if ((context_offsets = (Int4*) - malloc((total_contexts+1)*sizeof(Int4))) == NULL) + calloc((total_contexts+1), sizeof(Int4))) == NULL) return -1; if ((query_info->eff_searchsp_array = - (Int8*) malloc(total_contexts*sizeof(Int8))) == NULL) + (Int8*) calloc(total_contexts, sizeof(Int8))) == NULL) return -1; if ((query_info->length_adjustments = - (Int4*) malloc(total_contexts*sizeof(Int4))) == NULL) + (Int4*) calloc(total_contexts, sizeof(Int4))) == NULL) return -1; context_offsets[0] = 0; @@ -542,7 +542,7 @@ BLAST_GetSequence(SeqLocPtr slp, BlastQueryInfo* query_info, } } - if (num_frames == 6) { + if (num_frames == NUM_FRAMES) { /* Sequence must be translated in 6 frames. This can only happen for query - subject sequences are translated later. */ Int4 gc; @@ -639,7 +639,7 @@ Int2 BLAST_SetUpQuery(Uint1 program_number, SeqLocPtr query_slp, num_frames = 1; } else { encoding = NCBI4NA_ENCODING; - num_frames = 6; + num_frames = NUM_FRAMES; } if ((status=BLAST_GetSequence(query_slp, *query_info, query_options, diff --git a/algo/blast/api/blast_seqalign.c b/algo/blast/api/blast_seqalign.c index ea6baeb6..9e32cb19 100644 --- a/algo/blast/api/blast_seqalign.c +++ b/algo/blast/api/blast_seqalign.c @@ -1,4 +1,4 @@ -/* $Id: blast_seqalign.c,v 1.31 2004/04/19 15:03:20 papadopo Exp $ +/* $Id: blast_seqalign.c,v 1.35 2004/06/08 17:47:24 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,10 +32,10 @@ Author: Ilya Dondoshansky Contents: Conversion of BLAST results to the SeqAlign form ****************************************************************************** - * $Revision: 1.31 $ + * $Revision: 1.35 $ * */ -static char const rcsid[] = "$Id: blast_seqalign.c,v 1.31 2004/04/19 15:03:20 papadopo Exp $"; +static char const rcsid[] = "$Id: blast_seqalign.c,v 1.35 2004/06/08 17:47:24 dondosha Exp $"; #include <algo/blast/api/blast_seqalign.h> @@ -46,14 +46,12 @@ extern ScorePtr MakeBlastScore (ScorePtr PNTR old, CharPtr scoretype, Nlm_FloatHi prob, Int4 score); static ScorePtr -GetScoreSetFromBlastHsp(Uint1 program_number, BlastHSP* hsp, - BlastScoreBlk* sbp, BlastScoringOptions* score_options) +GetScoreSetFromBlastHsp(BlastHSP* hsp) { ScorePtr score_set=NULL; double prob; Int4 score; char* scoretype; - Blast_KarlinBlk* kbp; score = hsp->score; if (score > 0) @@ -77,16 +75,9 @@ GetScoreSetFromBlastHsp(Uint1 program_number, BlastHSP* hsp, MakeBlastScore(&score_set, scoretype, prob, 0); } - if (!score_options->gapped_calculation) { - kbp = sbp->kbp[hsp->context]; - } else { - kbp = sbp->kbp_gap[hsp->context]; - } - /* Calculate bit score from the raw score */ - prob = ((hsp->score*kbp->Lambda) - kbp->logK)/NCBIMATH_LN2; - if (prob >= 0.) - MakeBlastScore(&score_set, "bit_score", prob, 0); + if (hsp->bit_score >= 0.) + MakeBlastScore(&score_set, "bit_score", hsp->bit_score, 0); if (hsp->num_ident > 0) MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident); @@ -121,10 +112,8 @@ static Int2 AddGiListToScoreSet(ScorePtr score_set, ValNodePtr gi_list) ************************************************************************/ static DenseDiagPtr -BLAST_HSPToDenseDiag(Uint1 program_number, DenseDiagPtr* old, - BlastHSP* hsp, Boolean reverse, - Int4 query_length, Int4 subject_length, BlastScoreBlk* sbp, - BlastScoringOptions* score_options) +BLAST_HSPToDenseDiag(DenseDiagPtr* old, BlastHSP* hsp, Boolean reverse, + Int4 query_length, Int4 subject_length) { DenseDiagPtr ddp, new; @@ -180,8 +169,7 @@ BLAST_HSPToDenseDiag(Uint1 program_number, DenseDiagPtr* old, new->starts[1] = subject_length - hsp->subject.offset - hsp->subject.length; } } - new->scores = - GetScoreSetFromBlastHsp(program_number, hsp, sbp, score_options); + new->scores = GetScoreSetFromBlastHsp(hsp); /* Go to the end of the chain, and then attach "new" */ if (*old) @@ -210,10 +198,8 @@ BLAST_HSPToDenseDiag(Uint1 program_number, DenseDiagPtr* old, * ************************************************************************/ static StdSeg* -BLAST_HSPToStdSeg(Uint1 program_number, StdSeg** old, - BlastHSP* hsp, Int4 query_length, Int4 subject_length, SeqIdPtr sip, - Boolean reverse, BlastScoreBlk* sbp, - BlastScoringOptions* score_options) +BLAST_HSPToStdSeg(StdSeg** old, BlastHSP* hsp, Int4 query_length, + Int4 subject_length, SeqIdPtr sip, Boolean reverse) { StdSeg* ssp,* new; SeqIdPtr query_sip, subject_sip; @@ -288,8 +274,7 @@ BLAST_HSPToStdSeg(Uint1 program_number, StdSeg** old, } new->loc = slp; - new->scores = - GetScoreSetFromBlastHsp(program_number, hsp, sbp, score_options); + new->scores = GetScoreSetFromBlastHsp(hsp); /* Go to the end of the chain, and then attach "new" */ if (*old) @@ -327,9 +312,8 @@ BLAST_HSPToStdSeg(Uint1 program_number, StdSeg** old, static Int2 BLAST_UngappedHSPToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list, SeqIdPtr query_id, - SeqIdPtr subject_id, BlastScoreBlk* sbp, Int4 query_length, - Int4 subject_length, BlastScoringOptions* score_options, - SeqAlignPtr* seqalign_ptr) + SeqIdPtr subject_id, Int4 query_length, + Int4 subject_length, SeqAlignPtr* seqalign_ptr) { BlastHSP* hsp; DenseDiagPtr ddp_head=NULL, ddp; @@ -361,13 +345,12 @@ BLAST_UngappedHSPToSeqAlign(Uint1 program_number, sip->next = SeqIdDup(subject_id); if (getdensediag) { - ddp = BLAST_HSPToDenseDiag(program_number, &ddp_head, hsp, FALSE, - query_length, subject_length, sbp, score_options); + ddp = BLAST_HSPToDenseDiag(&ddp_head, hsp, FALSE, query_length, + subject_length); ddp->id = sip; } else { - ssp = BLAST_HSPToStdSeg(program_number, &ssp_head, hsp, - query_length, subject_length, sip, FALSE, sbp, - score_options); + ssp = BLAST_HSPToStdSeg(&ssp_head, hsp, query_length, + subject_length, sip, FALSE); ssp->ids = sip; } sip = NULL; /* This SeqIdPtr is now on the SeqAlign. */ @@ -454,8 +437,8 @@ Boolean GapCollectDataForSeqalign(GapEditBlock* edit_block, index=0; for (i = 0, curr=curr_in; curr && i < numseg; curr=curr->next, i++) { switch(curr->op_type) { - case GAPALIGN_DECLINE: - case GAPALIGN_SUB: + case eGapAlignDecline: + case eGapAlignSub: if (strand1 != Seq_strand_minus) { if(translate1 == FALSE) begin1 = get_current_pos(start1, curr->num); @@ -494,7 +477,7 @@ Boolean GapCollectDataForSeqalign(GapEditBlock* edit_block, break; - case GAPALIGN_DEL: + case eGapAlignDel: begin1 = -1; if (strand2 != Seq_strand_minus) { if(translate2 == FALSE) @@ -528,7 +511,7 @@ Boolean GapCollectDataForSeqalign(GapEditBlock* edit_block, break; - case GAPALIGN_INS: + case eGapAlignIns: if (strand1 != Seq_strand_minus) { if(translate1 == FALSE) begin1 = get_current_pos(start1, curr->num); @@ -581,7 +564,7 @@ static void GapCorrectUASequence(GapEditBlock* edit_block) for (curr=edit_block->esp; curr; curr = curr->next) { - if(curr->op_type == GAPALIGN_DECLINE && last_indel == TRUE) { + if(curr->op_type == eGapAlignDecline && last_indel == TRUE) { /* This is invalid condition and regions should be exchanged */ @@ -596,7 +579,7 @@ static void GapCorrectUASequence(GapEditBlock* edit_block) last_indel = FALSE; - if(curr->op_type == GAPALIGN_INS || curr->op_type == GAPALIGN_DEL) { + if(curr->op_type == eGapAlignIns || curr->op_type == eGapAlignDel) { last_indel = TRUE; curr_last2 = curr_last; } @@ -769,14 +752,14 @@ GapEditBlockToSeqAlign(GapEditBlock* edit_block, SeqIdPtr subject_id, SeqIdPtr q for (curr=edit_block->esp; curr; curr = curr->next) { numseg++; - if(/*edit_block->discontinuous && */curr->op_type == GAPALIGN_DECLINE) + if(/*edit_block->discontinuous && */curr->op_type == eGapAlignDecline) is_disc_align = TRUE; } start1 = edit_block->start1; start2 = edit_block->start2; - /* If no GAPALIGN_DECLINE regions exists output seqalign will be + /* If no eGapAlignDecline regions exists output seqalign will be regular Den-Seg or Std-seg */ if(is_disc_align == FALSE) { /* Please note, that edit_block passed only for data like @@ -812,11 +795,11 @@ GapEditBlockToSeqAlign(GapEditBlock* edit_block, SeqIdPtr subject_id, SeqIdPtr q for (numseg = 0, curr = curr_last; curr; curr = curr->next, numseg++) { - if(curr->op_type == GAPALIGN_DECLINE) { + if(curr->op_type == eGapAlignDecline) { if(numseg != 0) { /* End of aligned area */ break; } else { - while(curr && curr->op_type == GAPALIGN_DECLINE) { + while(curr && curr->op_type == eGapAlignDecline) { numseg++; curr = curr->next; } @@ -935,7 +918,7 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, slp2 = NULL; switch (curr->op_type) { - case 0: /* deletion of three nucleotides. */ + case eGapAlignDel: /* deletion of three nucleotides. */ first_shift = FALSE; @@ -960,7 +943,7 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, break; - case 6: /* insertion of three nucleotides. */ + case eGapAlignIns: /* insertion of three nucleotides. */ /* If gap is followed after frameshift - we have to add this element for the alignment to be correct */ @@ -1070,7 +1053,7 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, break; - case 3: /* Substitution. */ + case eGapAlignSub: /* Substitution. */ first_shift = FALSE; @@ -1090,14 +1073,8 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, /* Nucleotide scale shifted by op_type */ seq_int2 = SeqIntNew(); - /* Adjusting last segment and new start point in - nucleotide coordinates */ - /* if(seq_int2_last != NULL) { - seq_int2_last->to = seq_int2_last->to - (3 - curr->op_type); - start2 = start2 - (3 - curr->op_type); - } */ - - seq_int2->from = get_current_pos(&start2, curr->num*curr->op_type); + seq_int2->from = + get_current_pos(&start2, curr->num*(Uint1)curr->op_type); seq_int2->to = start2 - 1; /* Chop off three bases and one residue at a time. @@ -1125,10 +1102,10 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, seq_int2_last = seq_int2; /* Will be used to adjust "to" value */ break; - case 1: /* gap of two nucleotides. */ - case 2: /* Gap of one nucleotide. */ - case 4: /* Insertion of one nucleotide. */ - case 5: /* Insertion of two nucleotides. */ + case eGapAlignDel2: /* gap of two nucleotides. */ + case eGapAlignDel1: /* Gap of one nucleotide. */ + case eGapAlignIns1: /* Insertion of one nucleotide. */ + case eGapAlignIns2: /* Insertion of two nucleotides. */ if(first_shift == TRUE) { /* Second frameshift in a row */ /* Protein coordinates */ @@ -1147,7 +1124,8 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, /* Nucleotide scale shifted by op_type */ seq_int2 = SeqIntNew(); - seq_int2->from = get_current_pos(&start2, curr->op_type); + seq_int2->from = + get_current_pos(&start2, (Uint1)curr->op_type); seq_int2->to = start2 - 1; if(seq_int2->to >= original_length2) { @@ -1181,10 +1159,8 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, /* If this substitution is following simple frameshift we do not need to start new segment, but may continue old one */ - /* printf("curr_num = %d (%d)\n", curr->num, curr->op_type); */ - if(seq_int2_last != NULL) { - get_current_pos(&start2, curr->num*(curr->op_type-3)); + get_current_pos(&start2, curr->num*((Uint1)curr->op_type-3)); if(strand2 != Seq_strand_minus) { seq_int2_last->to = start2 - 1; } else { @@ -1206,13 +1182,14 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, seq_int1_last++; } - } else if (curr->op_type > 3) { + } else if ((Uint1)curr->op_type > 3) { /* Protein piece is empty */ ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(id1)); /* Simulating insertion of nucleotides */ seq_int2 = SeqIntNew(); - seq_int2->from = get_current_pos(&start2, - curr->num*(curr->op_type-3)); + seq_int2->from = + get_current_pos(&start2, + curr->num*((Uint1)curr->op_type-3)); seq_int2->to = start2 - 1; if(seq_int2->to >= original_length2) { @@ -1279,9 +1256,8 @@ BLAST_OOFEditBlockToSeqAlign(Uint1 program, GapEditBlock* edit_block, static Int2 BLAST_GapInfoToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list, - SeqIdPtr query_id, SeqIdPtr subject_id, BlastScoreBlk* sbp, - Int4 query_length, BlastScoringOptions* score_options, - SeqAlignPtr* head_seqalign) + SeqIdPtr query_id, SeqIdPtr subject_id, Int4 query_length, + Boolean is_ooframe, SeqAlignPtr* head_seqalign) { Int2 status = 0; BlastHSP** hsp_array; @@ -1294,7 +1270,7 @@ BLAST_GapInfoToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list, for (index=0; index<hsp_list->hspcnt; index++) { hsp_array[index]->gap_info->original_length1 = query_length; - if (score_options->is_ooframe) { + if (is_ooframe) { seqalign = BLAST_OOFEditBlockToSeqAlign(program_number, hsp_array[index]->gap_info, query_id, subject_id); @@ -1310,9 +1286,7 @@ BLAST_GapInfoToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list, last_seqalign->next = seqalign; last_seqalign = last_seqalign->next; } - seqalign->score = - GetScoreSetFromBlastHsp(program_number, hsp_array[index], sbp, - score_options); + seqalign->score = GetScoreSetFromBlastHsp(hsp_array[index]); } return status; @@ -1320,9 +1294,8 @@ BLAST_GapInfoToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list, Int2 BLAST_ResultsToSeqAlign(Uint1 program_number, BlastHSPResults* results, SeqLocPtr query_slp, - BlastSeqSrc* bssp, SeqLocPtr subject_slp, - BlastScoringOptions* score_options, BlastScoreBlk* sbp, - Boolean is_gapped, SeqAlignPtr* head_seqalign) + BlastSeqSrc* seq_src, Boolean is_gapped, Boolean is_ooframe, + SeqAlignPtr* head_seqalign) { Int4 query_index, subject_index; SeqLocPtr slp = query_slp; @@ -1331,13 +1304,16 @@ Int2 BLAST_ResultsToSeqAlign(Uint1 program_number, BlastHSPList* hsp_list; SeqAlignPtr seqalign = NULL, last_seqalign = NULL; Int4 subject_length = 0; + ListNode* seqid_wrap = NULL; + char* bad_id_str = NULL; /* In case an unknown id is returned from sequence + source */ *head_seqalign = NULL; + if (!results) + return 0; - if (!bssp) { - subject_id = SeqLocId(subject_slp); - subject_length = SeqLocLen(subject_slp); - } + if (!seq_src) + return -1; for (query_index = 0; slp && query_index < results->num_queries; ++query_index, slp = slp->next) { @@ -1351,22 +1327,30 @@ Int2 BLAST_ResultsToSeqAlign(Uint1 program_number, hsp_list = hit_list->hsplist_array[subject_index]; if (!hsp_list) continue; - if (bssp) { - char* id_str = BLASTSeqSrcGetSeqIdStr(bssp, (void*) &hsp_list->oid); - subject_id = SeqIdParse(id_str); - subject_length = BLASTSeqSrcGetSeqLen(bssp, (void*) &hsp_list->oid); - sfree(id_str); + bad_id_str = NULL; + seqid_wrap = BLASTSeqSrcGetSeqId(seq_src, (void*) &hsp_list->oid); + if (seqid_wrap->choice == BLAST_SEQSRC_C_SEQID) { + subject_id = (SeqId*) seqid_wrap->ptr; + ListNodeFree(seqid_wrap); + } else { + /* Should not happen: wrong type of subject id returned; + Create a fake local id. */ + bad_id_str = strdup("lcl|unknown"); + subject_id = SeqIdParse(bad_id_str); } + subject_length = + BLASTSeqSrcGetSeqLen(seq_src, (void*) &hsp_list->oid); if (is_gapped) { BLAST_GapInfoToSeqAlign(program_number, hsp_list, query_id, - subject_id, sbp, SeqLocLen(slp), score_options, &seqalign); + subject_id, SeqLocLen(slp), is_ooframe, &seqalign); } else { BLAST_UngappedHSPToSeqAlign(program_number, hsp_list, query_id, - subject_id, sbp, SeqLocLen(slp), subject_length, - score_options, &seqalign); + subject_id, SeqLocLen(slp), subject_length, &seqalign); } - if (bssp) + /* If unknown seqid was allocated here, free it, since it has been + duplicated in the Seq-align */ + if (bad_id_str) SeqIdSetFree(subject_id); if (seqalign) { diff --git a/algo/blast/api/blast_seqalign.h b/algo/blast/api/blast_seqalign.h index d78377dd..c6d009d7 100644 --- a/algo/blast/api/blast_seqalign.h +++ b/algo/blast/api/blast_seqalign.h @@ -1,4 +1,4 @@ -/* $Id: blast_seqalign.h,v 1.13 2004/03/12 15:18:53 coulouri Exp $ +/* $Id: blast_seqalign.h,v 1.16 2004/06/08 17:47:24 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,7 +32,7 @@ Author: Ilya Dondoshansky Contents: Functions to convert BLAST results to the SeqAlign form ****************************************************************************** - * $Revision: 1.13 $ + * $Revision: 1.16 $ * */ #ifndef __BLAST_SEQALIGN__ #define __BLAST_SEQALIGN__ @@ -55,17 +55,14 @@ extern "C" { * @param program_number Type of BLAST program [in] * @param results The BLAST results [in] * @param query_slp List of query SeqLoc's [in] - * @param bssp Pointer to the BLAST database wrapper structure [in] - * @param subject_slp Subject SeqLoc (for two sequences search) [in] - * @param score_options Scoring options block [in] - * @param sbp Scoring and statistical information [in] + * @param seq_src Pointer to the BLAST database wrapper structure [in] * @param is_gapped Is this a gapped alignment search? [in] + * @param is_ooframe Is this a search with out-of-frame gapping? [in] * @param head_seqalign List of SeqAlign's [out] */ Int2 BLAST_ResultsToSeqAlign(Uint1 program_number, BlastHSPResults* results, - SeqLocPtr query_slp, BlastSeqSrc* bssp, SeqLocPtr subject_slp, - BlastScoringOptions* score_options, BlastScoreBlk* sbp, - Boolean is_gapped, SeqAlignPtr* head_seqalign); + SeqLocPtr query_slp, BlastSeqSrc* seq_src, + Boolean is_gapped, Boolean is_ooframe, SeqAlignPtr* head_seqalign); Boolean GapCollectDataForSeqalign(GapEditBlock* edit_block, GapEditScript* curr_in, Int4 numseg, diff --git a/algo/blast/api/blast_tabular.c b/algo/blast/api/blast_tabular.c new file mode 100644 index 00000000..7963dade --- /dev/null +++ b/algo/blast/api/blast_tabular.c @@ -0,0 +1,238 @@ +/* $Id: blast_tabular.c,v 1.3 2004/06/14 20:43:30 dondosha Exp $ +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's offical duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/***************************************************************************** + +File name: blast_tabular.c + +Author: Ilya Dondoshansky + +Contents: On-the-fly tabular formatting of BLAST results + +Detailed Contents: + +****************************************************************************** + * $Revision: 1.3 $ + * */ + +static char const rcsid[] = "$Id: blast_tabular.c,v 1.3 2004/06/14 20:43:30 dondosha Exp $"; + +#include <algo/blast/api/blast_tabular.h> +#include <algo/blast/core/blast_util.h> +#include <algo/blast/core/blast_setup.h> +#include <algo/blast/core/blast_engine.h> +#include <algo/blast/core/blast_traceback.h> +#include <algo/blast/api/blast_format.h> +#include <txalign.h> + +BlastTabularFormatData* +Blast_TabularFormatDataInit(Uint1 program, BlastHSPStream* hsp_stream, + BlastSeqSrc* seq_src, BLAST_SequenceBlk* query, BlastQueryInfo* query_info, + const BlastScoringOptions* score_options, BlastScoreBlk* sbp, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastDatabaseOptions* db_options, + SeqLoc* query_slp, FILE* outfp) +{ + BlastTabularFormatData* tf_data = + (BlastTabularFormatData*) calloc(1, sizeof(BlastTabularFormatData)); + tf_data->perform_traceback = + (score_options->gapped_calculation && + ext_options->eTbackExt != eSkipTbck); + + tf_data->program = program; + tf_data->hsp_stream = hsp_stream; + tf_data->query = query; + tf_data->gen_code_string = db_options->gen_code_string; + tf_data->query_slp = query_slp; + tf_data->outfp = outfp; + /* Sequence source must be copied, to guarantee multi-thread safety. */ + tf_data->seq_src = BlastSeqSrcCopy(seq_src); + /* Effective lengths must be duplicated in query info structure, because + they might be changing in the preliminary search. */ + tf_data->query_info = BlastQueryInfoDup(query_info); + + /* If traceback will have to be performed before tabular output, + do the preparation for it here. */ + if (tf_data->perform_traceback) { + BLAST_GapAlignSetUp(program, seq_src, + score_options, eff_len_options, ext_options, hit_options, + query_info, sbp, &tf_data->score_params, &tf_data->ext_params, + &tf_data->hit_params, &tf_data->eff_len_params, &tf_data->gap_align); + } + + return tf_data; +} + +void BlastTabularFormatDataFree(BlastTabularFormatData* tf_data) +{ + /* Free only the structures that have been initialized internally */ + tf_data->query_info = BlastQueryInfoFree(tf_data->query_info); + tf_data->score_params = BlastScoringParametersFree(tf_data->score_params); + tf_data->ext_params = BlastExtensionParametersFree(tf_data->ext_params); + tf_data->hit_params = BlastHitSavingParametersFree(tf_data->hit_params); + tf_data->eff_len_params = + BlastEffectiveLengthsParametersFree(tf_data->eff_len_params); + tf_data->gap_align = BLAST_GapAlignStructFree(tf_data->gap_align); + tf_data->seq_src = BlastSeqSrcFree(tf_data->seq_src); + sfree(tf_data); +} + +/* This function might serve as a starting point for a callback function + * that prints results before the traceback stage, e.g. the on-the-fly + * tabular output, a la the -D3 option of the old megablast. + */ +void* Blast_TabularFormatThread(void* data) +{ + BlastTabularFormatData* tf_data; + Uint1 program; + BlastHSPList* hsp_list = NULL; + BlastSeqSrc* seq_src; + BLAST_SequenceBlk* query = NULL; + BlastQueryInfo* query_info = NULL; + BlastScoringParameters* score_params = NULL; + BlastExtensionParameters* ext_params = NULL; + BlastHitSavingParameters* hit_params = NULL; + BlastEffectiveLengthsParameters* eff_len_params = NULL; + Uint1* gen_code_string = NULL; + BlastGapAlignStruct* gap_align = NULL; + Int4 query_index, index; + char* query_buffer = NULL; + char* subject_buffer = NULL; + Int4 q_start=0, q_end=0, s_start=0, s_end=0; + SeqLocPtr slp; + char bit_score_buff[10], eval_buff[10]; + char* eval_buff_ptr = NULL; + BlastHSP* hsp; + SeqId** query_id_array = NULL; + Int4 align_length = 0; + Int4 num_gaps = 0, num_gap_opens = 0, num_mismatches = 0; + double perc_ident = 0; + GetSeqArg seq_arg; + Boolean one_seq_update_params; + + tf_data = (BlastTabularFormatData*) data; + if (!tf_data || !tf_data->query_slp || !tf_data->hsp_stream || + !tf_data->seq_src || !tf_data->outfp) + return NULL; + + program = tf_data->program; + seq_src = tf_data->seq_src; + + if (tf_data->perform_traceback) { + query = tf_data->query; + query_info = tf_data->query_info; + score_params = tf_data->score_params; + ext_params = tf_data->ext_params; + hit_params = tf_data->hit_params; + eff_len_params = tf_data->eff_len_params; + gap_align = tf_data->gap_align; + gen_code_string = tf_data->gen_code_string; + + memset((void*) &seq_arg, 0, sizeof(seq_arg)); + seq_arg.encoding = Blast_TracebackGetEncoding(program); + } + + query_id_array = + (SeqId**) malloc(ValNodeLen(tf_data->query_slp)*sizeof(SeqId*)); + + for (index = 0, slp = tf_data->query_slp; slp; ++index, slp = slp->next) { + query_id_array[index] = SeqLocId(slp); + } + + one_seq_update_params = (BLASTSeqSrcGetTotLen(seq_src) == 0); + + while (BlastHSPStreamRead(tf_data->hsp_stream, &hsp_list) + != kBlastHSPStream_Eof) { + if (!hsp_list) { + /* This should not happen, but just in case */ + continue; + } + + /* Perform traceback if necessary */ + if (tf_data->perform_traceback) { + BlastSequenceBlkClean(seq_arg.seq); + seq_arg.oid = hsp_list->oid; + if (BLASTSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0) + continue; + + if (one_seq_update_params) { + Int2 status; + /* This is not a database search, so effective search spaces + need to be recalculated based on this subject sequence length */ + if ((status = BLAST_OneSubjectUpdateParameters(program, + seq_arg.seq->length, + score_params->options, + query_info, gap_align->sbp, + ext_params, hit_params, NULL, + eff_len_params)) != 0) { + continue; + } + } + + Blast_TracebackFromHSPList(program, hsp_list, query, + seq_arg.seq, query_info, gap_align, gap_align->sbp, score_params, + ext_params->options, hit_params, gen_code_string); + BLASTSeqSrcRetSequence(seq_src, (void*)&seq_arg); + /* Recalculate the bit scores, since they might have changed. */ + Blast_HSPListGetBitScores(hsp_list, + score_params->options->gapped_calculation, gap_align->sbp); + } + subject_buffer = + BLASTSeqSrcGetSeqIdStr(seq_src, (void*) &hsp_list->oid); + + for (index = 0; index < hsp_list->hspcnt; ++index) { + hsp = hsp_list->hsp_array[index]; + query_index = + Blast_GetQueryIndexFromContext(hsp->context, program); + Blast_SeqIdGetDefLine(query_id_array[query_index], NULL, + &query_buffer, TRUE, FALSE, TRUE, FALSE); + + eval_buff_ptr = eval_buff; + ScoreAndEvalueToBuffers(hsp->bit_score, hsp->evalue, + bit_score_buff, &eval_buff_ptr, FALSE); + + /* Calculate percentage of identities */ + Blast_HSPCalcLengthAndGaps(hsp, &align_length, &num_gaps, + &num_gap_opens); + perc_ident = ((double)hsp->num_ident)/align_length * 100; + num_mismatches = align_length - hsp->num_ident - num_gaps; + + Blast_HSPGetAdjustedOffsets(hsp, &q_start, &q_end, &s_start, &s_end); + + fprintf(tf_data->outfp, + "%s\t%s\t%.2f\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%ld\t%s\t%s\n", + query_buffer, subject_buffer, perc_ident, + (long) align_length, (long) num_mismatches, + (long) num_gap_opens, (long) q_start, (long) q_end, + (long) s_start, (long) s_end, eval_buff, bit_score_buff); + } + fflush(tf_data->outfp); + } + /* Deallocate the formatting thread data structure */ + BlastTabularFormatDataFree(tf_data); + return NULL; +} diff --git a/algo/blast/api/blast_tabular.h b/algo/blast/api/blast_tabular.h new file mode 100644 index 00000000..7befa54f --- /dev/null +++ b/algo/blast/api/blast_tabular.h @@ -0,0 +1,101 @@ +/* $Id: blast_tabular.h,v 1.2 2004/06/14 20:43:30 dondosha Exp $ +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's offical duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/***************************************************************************** + +File name: blast_tabular.h + +Author: Ilya Dondoshansky + +Contents: Functions needed for formatting of BLAST results + +****************************************************************************** + * $Revision: 1.2 $ + * */ +#ifndef __BLAST_TABULAR__ +#define __BLAST_TABULAR__ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NCBI_C_TOOLKIT +#define NCBI_C_TOOLKIT +#endif + +#include <ncbi.h> +#include <asn.h> +#include <algo/blast/core/blast_hits.h> +#include <algo/blast/core/lookup_wrap.h> +#include <algo/blast/core/blast_seqsrc.h> +#include <algo/blast/core/blast_hspstream.h> +#include <algo/blast/core/blast_gapalign.h> +#include <objloc.h> + +/** Data structure containing all information necessary for production of the + * tabular output. + */ +typedef struct BlastTabularFormatData { + Uint1 program; /**< Type of BLAST program */ + BlastHSPStream* hsp_stream; /**< Source of the BLAST results */ + BlastSeqSrc* seq_src; /**< Source of the subject sequences */ + BLAST_SequenceBlk* query; /**< Query sequence */ + BlastQueryInfo* query_info; /**< Query information, including context + offsets and effective lengths. */ + BlastScoringParameters* score_params; + BlastExtensionParameters* ext_params; + BlastHitSavingParameters* hit_params; + BlastEffectiveLengthsParameters* eff_len_params; + Uint1* gen_code_string; + BlastGapAlignStruct* gap_align; + SeqLoc* query_slp; /**< Source of query sequences identifiers */ + FILE* outfp; /**< Output stream */ + Boolean perform_traceback; /**< Must gapped extension with traceback be + performed before formatting? */ +} BlastTabularFormatData; + +/** Function initializing the BlastTabularFormatData data structure fields. */ +BlastTabularFormatData* +Blast_TabularFormatDataInit(Uint1 program, BlastHSPStream* hsp_stream, + BlastSeqSrc* seq_src, BLAST_SequenceBlk* query, BlastQueryInfo* query_info, + const BlastScoringOptions* scoring_options, BlastScoreBlk* sbp, + const BlastEffectiveLengthsOptions* eff_len_options, + const BlastExtensionOptions* ext_options, + const BlastHitSavingOptions* hit_options, + const BlastDatabaseOptions* db_options, SeqLoc* query_slp, FILE* outfp); + +/** Free the tabular formatting data structure and all its internally + * allocated substructures. + */ +void BlastTabularFormatDataFree(BlastTabularFormatData* tf_data); + +/** Driver for the thread producing tabular output. */ +void* Blast_TabularFormatThread(void* data); + +#ifdef __cplusplus +} +#endif +#endif /* !__BLAST_TABULAR__ */ + diff --git a/algo/blast/api/hspstream_queue.c b/algo/blast/api/hspstream_queue.c new file mode 100644 index 00000000..ea5dac83 --- /dev/null +++ b/algo/blast/api/hspstream_queue.c @@ -0,0 +1,182 @@ +/* $Id: hspstream_queue.c,v 1.2 2004/06/08 17:46:35 dondosha Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * =========================================================================== + * + * Author: Ilya Dondoshansky + * + */ + +/** @file hspstream_queue.c + * Implementation of the BlastHSPStream interface for producing BLAST results + * on the fly. + */ + +static char const rcsid[] = + "$Id: hspstream_queue.c,v 1.2 2004/06/08 17:46:35 dondosha Exp $"; + + +#include <algo/blast/core/blast_hits.h> +#include <algo/blast/api/hspstream_queue.h> +#include <ncbithr.h> + +/** Default hit saving stream methods */ + +static BlastHSPStream* +BlastHSPListQueueFree(BlastHSPStream* hsp_stream) +{ + BlastHSPListQueueData* stream_data = + (BlastHSPListQueueData*) GetData(hsp_stream); + ListNode* node; + + NlmSemaDestroy(stream_data->m_resultsSema); + NlmMutexDestroy(stream_data->m_resultsMutex); + + for (node = stream_data->m_queueStart; node; node = node->next) { + node->ptr = (void*) Blast_HSPListFree((BlastHSPList*)node->ptr); + } + stream_data->m_queueStart = ListNodeFree(stream_data->m_queueStart); + sfree(stream_data); + sfree(hsp_stream); + return NULL; +} + +static int +BlastHSPListQueueRead(BlastHSPStream* hsp_stream, + BlastHSPList** hsp_list_out) +{ + BlastHSPListQueueData* stream_data = + (BlastHSPListQueueData*) GetData(hsp_stream); + int status = kBlastHSPStream_Error; + + /* Lock the mutex */ + NlmMutexLockEx(&stream_data->m_resultsMutex); + + if (!stream_data->m_writingDone) { + while (!stream_data->m_writingDone && !stream_data->m_queueStart) { + /* Decrement the semaphore count to 0, then wait for it to be + * incremented. Note that mutex must be locked whenever the + * contents of the stream are checked, but it must be unlocked + * for the semaphore wait. */ + NlmMutexUnlock(stream_data->m_resultsMutex); + NlmSemaWait(stream_data->m_resultsSema); + NlmMutexLockEx(&stream_data->m_resultsMutex); + } + } + + if (!stream_data->m_queueStart) { + /* Nothing in the queue, but no more writing to the queue is expected. */ + *hsp_list_out = NULL; + status = kBlastHSPStream_Eof; + } else { + ListNode* start_node = stream_data->m_queueStart; + + *hsp_list_out = (BlastHSPList*) start_node->ptr; + + stream_data->m_queueStart = start_node->next; + start_node->next = NULL; + ListNodeFree(start_node); + if (!stream_data->m_queueStart) + stream_data->m_queueEnd = NULL; + status = kBlastHSPStream_Success; + } + + NlmMutexUnlock(stream_data->m_resultsMutex); + + return status; +} + +static int +BlastHSPListQueueWrite(BlastHSPStream* hsp_stream, + BlastHSPList** hsp_list) +{ + BlastHSPListQueueData* stream_data = + (BlastHSPListQueueData*) GetData(hsp_stream); + + /* If input is empty, don't do anything, but return success */ + if (*hsp_list == NULL) + return kBlastHSPStream_Success; + + /* If stream is closed for writing, return error */ + if (stream_data->m_writingDone) + return kBlastHSPStream_Error; + + NlmMutexLockEx(&stream_data->m_resultsMutex); + stream_data->m_queueEnd = + ListNodeAddPointer(&stream_data->m_queueEnd, 0, (void*)(*hsp_list)); + if (!stream_data->m_queueStart) + stream_data->m_queueStart = stream_data->m_queueEnd; + /* Free the caller from this pointer's ownership. */ + *hsp_list = NULL; + /* Increment the semaphore count. */ + NlmSemaPost(stream_data->m_resultsSema); + NlmMutexUnlock(stream_data->m_resultsMutex); + + return kBlastHSPStream_Success; +} + +static void +BlastHSPListQueueClose(BlastHSPStream* hsp_stream) +{ + BlastHSPListQueueData* stream_data = + (BlastHSPListQueueData*) GetData(hsp_stream); + NlmMutexLockEx(&stream_data->m_resultsMutex); + stream_data->m_writingDone = TRUE; + /* Increment the semaphore count so the reading thread can get out of + * the waiting state and check the m_writingDone variable. */ + NlmSemaPost(stream_data->m_resultsSema); + NlmMutexUnlock(stream_data->m_resultsMutex); +} + +static BlastHSPStream* +BlastHSPListQueueNew(BlastHSPStream* hsp_stream, void* args) +{ + BlastHSPStreamFunctionPointerTypes fnptr; + + fnptr.dtor = &BlastHSPListQueueFree; + SetMethod(hsp_stream, eDestructor, fnptr); + fnptr.method = &BlastHSPListQueueRead; + SetMethod(hsp_stream, eRead, fnptr); + fnptr.method = &BlastHSPListQueueWrite; + SetMethod(hsp_stream, eWrite, fnptr); + fnptr.closeFn = &BlastHSPListQueueClose; + SetMethod(hsp_stream, eClose, fnptr); + + SetData(hsp_stream, args); + return hsp_stream; +} + +BlastHSPStream* Blast_HSPListQueueInit() +{ + BlastHSPListQueueData* stream_data = + (BlastHSPListQueueData*) calloc(1, sizeof(BlastHSPListQueueData)); + BlastHSPStreamNewInfo info; + + /* At the start of the search there is nothing in the results queue, so + * initialize the semaphore count with 0. */ + stream_data->m_resultsSema = NlmSemaInit(0); + info.constructor = &BlastHSPListQueueNew; + info.ctor_argument = (void*)stream_data; + + return BlastHSPStreamNew(&info); +} diff --git a/algo/blast/api/hspstream_queue.h b/algo/blast/api/hspstream_queue.h new file mode 100644 index 00000000..1b00b0e7 --- /dev/null +++ b/algo/blast/api/hspstream_queue.h @@ -0,0 +1,64 @@ +/* $Id: hspstream_queue.h,v 1.2 2004/06/08 17:46:35 dondosha Exp $ + * =========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. + * + * =========================================================================== + * + * Author: Ilya Dondoshansky + * + */ + +/** @file hspstream_queue.h + * Implementation of the BlastHSPStream interface for producing results on the + * fly. + */ + +#ifndef HSPSTREAM_QUEUE_H +#define HSPSTREAM_QUEUE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <ncbithr.h> +#include <algo/blast/core/blast_options.h> +#include <algo/blast/core/blast_hits.h> +#include <algo/blast/core/blast_seqsrc.h> +#include <algo/blast/core/blast_hspstream.h> + +/** Data structure for the queue implementation of BlastHSPStream */ +typedef struct BlastHSPListQueueData { + ListNode* m_queueStart; + ListNode* m_queueEnd; + Boolean m_writingDone; + TNlmMutex m_resultsMutex; + TNlmSemaphore m_resultsSema; +} BlastHSPListQueueData; + +/** Function to initialize the queue implementation of BlastHSPStream */ +BlastHSPStream* Blast_HSPListQueueInit(void); + +#ifdef __cplusplus +} +#endif + +#endif /* HSPSTREAM_QUEUE_H */ diff --git a/algo/blast/api/multiseq_src.c b/algo/blast/api/multiseq_src.c index 31018649..c8e57990 100644 --- a/algo/blast/api/multiseq_src.c +++ b/algo/blast/api/multiseq_src.c @@ -1,4 +1,4 @@ -/* $Id: multiseq_src.c,v 1.5 2004/04/28 19:50:02 dondosha Exp $ +/* $Id: multiseq_src.c,v 1.6 2004/06/08 17:46:35 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -31,7 +31,7 @@ * */ -static char const rcsid[] = "$Id: multiseq_src.c,v 1.5 2004/04/28 19:50:02 dondosha Exp $"; +static char const rcsid[] = "$Id: multiseq_src.c,v 1.6 2004/06/08 17:46:35 dondosha Exp $"; #include <algo/blast/api/multiseq_src.h> #include <algo/blast/core/blast_util.h> @@ -57,7 +57,7 @@ static MultiSeqInfo* MultiSeqInfoNew(const SeqLoc* seqloc_list, Uint1 program) index < num_seqs; ++index, seqloc_ptr = seqloc_ptr->next) { retval->seqloc_array[index] = seqloc_ptr; BLAST_SetUpSubject(program, seqloc_ptr, &retval->seqblk_array[index]); - max_length = MAX(max_length, retval->seqblk_array[index]->length); + max_length = MAX(max_length, (Uint4)retval->seqblk_array[index]->length); } retval->max_length = max_length; diff --git a/algo/blast/api/seqsrc_readdb.c b/algo/blast/api/seqsrc_readdb.c index 1688c3cf..935dcb55 100644 --- a/algo/blast/api/seqsrc_readdb.c +++ b/algo/blast/api/seqsrc_readdb.c @@ -1,4 +1,4 @@ -/* $Id: seqsrc_readdb.c,v 1.27 2004/04/28 19:39:01 dondosha Exp $ +/* $Id: seqsrc_readdb.c,v 1.29 2004/06/07 17:14:58 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -30,7 +30,7 @@ * */ -static char const rcsid[] = "$Id: seqsrc_readdb.c,v 1.27 2004/04/28 19:39:01 dondosha Exp $"; +static char const rcsid[] = "$Id: seqsrc_readdb.c,v 1.29 2004/06/07 17:14:58 dondosha Exp $"; #include <algo/blast/api/seqsrc_readdb.h> #include <algo/blast/core/blast_def.h> @@ -197,8 +197,10 @@ static Int2 ReaddbRetSequence(void* readdb_handle, void* args) ASSERT(readdb_args); - if (readdb_args->seq->sequence_start_allocated) + if (readdb_args->seq->sequence_start_allocated) { sfree(readdb_args->seq->sequence_start); + readdb_args->seq->sequence_start_allocated = FALSE; + } return 0; } @@ -215,21 +217,26 @@ static char* ReaddbGetSeqIdStr(void* readdb_handle, void* args) ReadDBFILEPtr rdfp = (ReadDBFILEPtr) readdb_handle; Int4* oid = (Int4*) args; SeqIdPtr sip = NULL; + char* descr = NULL; char *seqid_str = NULL; if (!rdfp || !oid) return NULL; - if ( !(seqid_str = (char*) malloc(sizeof(char)*SEQIDLEN_MAX))) - return NULL; - - if (!readdb_get_descriptor(rdfp, *oid, &sip, NULL)) { + if (!readdb_get_descriptor(rdfp, *oid, &sip, &descr)) { sfree(seqid_str); return NULL; } - SeqIdWrite(sip, seqid_str, PRINTID_FASTA_LONG, SEQIDLEN_MAX-1); - + if (sip->choice != SEQID_GENERAL || + strcmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID")) { + if ( !(seqid_str = (char*) malloc(sizeof(char)*SEQIDLEN_MAX))) + return NULL; + SeqIdWrite(sip, seqid_str, PRINTID_FASTA_LONG, SEQIDLEN_MAX-1); + sfree(descr); + } else { + seqid_str = strtok(descr, " \t\n\r"); + } sip = SeqIdSetFree(sip); return seqid_str; @@ -445,6 +452,7 @@ BlastSeqSrc* ReaddbSeqSrcNew(BlastSeqSrc* retval, void* args) /* Initialize the BlastSeqSrc structure fields with user-defined function * pointers and rdfp */ SetDeleteFnPtr(retval, &ReaddbSeqSrcFree); + SetCopyFnPtr(retval, &ReaddbSeqSrcCopy); SetDataStructure(retval, (void*) rdfp); SetGetNumSeqs(retval, &ReaddbGetNumSeqs); SetGetMaxSeqLen(retval, &ReaddbGetMaxLength); @@ -478,9 +486,11 @@ BlastSeqSrc* ReaddbSeqSrcNew(BlastSeqSrc* retval, void* args) while (rdfp && rdfp->stop < rargs->final_db_seq) rdfp = rdfp->next; /* Set last sequence for this and all subsequent rdfp's to the one - in the arguments, making the subsequent rdfp's ranges empty. */ + in the arguments, making the subsequent rdfp's ranges empty. + Note that final_db_seq in arguments is 1 beyond the last sequence + number to search. */ for ( ; rdfp; rdfp = rdfp->next) - rdfp->stop = rargs->final_db_seq; + rdfp->stop = rargs->final_db_seq - 1; } return retval; @@ -495,6 +505,20 @@ BlastSeqSrc* ReaddbSeqSrcFree(BlastSeqSrc* bssp) return NULL; } +BlastSeqSrc* ReaddbSeqSrcCopy(BlastSeqSrc* bssp) +{ + ReadDBFILE* rdfp = NULL; + + if (!bssp) + return NULL; + + rdfp = readdb_attach((ReadDBFILEPtr)GetDataStructure(bssp)); + + SetDataStructure(bssp, (void*) rdfp); + + return bssp; +} + BlastSeqSrc* ReaddbBlastSeqSrcInit(const char* dbname, Boolean is_prot, int first_seq, int last_seq, void* extra_arg) diff --git a/algo/blast/api/seqsrc_readdb.h b/algo/blast/api/seqsrc_readdb.h index 0121a380..482222af 100644 --- a/algo/blast/api/seqsrc_readdb.h +++ b/algo/blast/api/seqsrc_readdb.h @@ -1,4 +1,4 @@ -/* $Id: seqsrc_readdb.h,v 1.9 2004/02/18 19:38:20 dondosha Exp $ +/* $Id: seqsrc_readdb.h,v 1.10 2004/06/07 17:15:18 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -69,6 +69,13 @@ BlastSeqSrc* ReaddbSeqSrcNew(BlastSeqSrc* bssp, void* args); */ BlastSeqSrc* ReaddbSeqSrcFree(BlastSeqSrc* bssp); +/** Readdb sequence source copier: + * creates a new copy of the ReadDBFILE structure by calling readdb_attach. + * @param bssp BlastSeqSrc structure to copy [in] + * @return New BlastSeqSrc structure + */ +BlastSeqSrc* ReaddbSeqSrcCopy(BlastSeqSrc* bssp); + /** Initialize the sequence source structure. * @param dbname BLAST database name [in] * @param is_prot Is this a protein or nucleotide database? [in] diff --git a/algo/blast/api/twoseq_api.c b/algo/blast/api/twoseq_api.c index 9f3294ab..1a866bae 100644 --- a/algo/blast/api/twoseq_api.c +++ b/algo/blast/api/twoseq_api.c @@ -1,4 +1,4 @@ -/* $Id: twoseq_api.c,v 1.6 2004/05/05 15:30:13 dondosha Exp $ +/* $Id: twoseq_api.c,v 1.13 2004/06/08 17:47:24 dondosha Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -40,10 +40,15 @@ #include <algo/blast/core/blast_util.h> #include <algo/blast/core/blast_engine.h> #include <algo/blast/core/mb_lookup.h> +#include <algo/blast/core/blast_filter.h> +#include <algo/blast/core/hspstream_collector.h> #include <algo/blast/api/multiseq_src.h> #include <algo/blast/api/blast_seqalign.h> #include <algo/blast/api/blast_seq.h> #include <algo/blast/api/twoseq_api.h> +#include <algo/blast/api/blast_returns.h> +/* For the AdjustOffSetsInSeqalign function */ +#include <sequtil.h> Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options) { @@ -64,6 +69,9 @@ Int2 BLAST_SummaryOptionsInit(BLAST_SummaryOptions **options) new_options->gapped_calculation = TRUE; new_options->nucleotide_match = 1; new_options->nucleotide_mismatch = -3; + new_options->word_threshold = 0; + new_options->init_seed_method = eDefaultSeedType; + *options = new_options; return 0; } @@ -87,7 +95,7 @@ BLAST_FillOptions(Uint1 program_number, BlastEffectiveLengthsOptions* eff_len_options, PSIBlastOptions* psi_options, BlastDatabaseOptions* db_options, - BlastSeqSrc* seq_src, Int4 query_length, + Int4 query_length, Int4 subject_length, RPSInfo *rps_info) { Boolean do_megablast = FALSE; @@ -107,11 +115,12 @@ BLAST_FillOptions(Uint1 program_number, /* If one of the sequences is large enough, set up a megablast search */ - if (query_length > MEGABLAST_CUTOFF || + if (basic_options->hint == eFast || + query_length > MEGABLAST_CUTOFF || subject_length > MEGABLAST_CUTOFF) { do_megablast = TRUE; if (basic_options->gapped_calculation) - greedy_align = TRUE; /* one-pass, no ungapped */ + greedy_align = 1; /* one-pass, no ungapped */ } /* For a megablast search or a blastn search with @@ -122,20 +131,21 @@ BLAST_FillOptions(Uint1 program_number, /* If megablast was turned on but the input indicates a sensitive search is desired, switch to discontiguous - megablast (hardwired to the 12-of-21 optimal template) */ + megablast (hardwired to the 11-of-21 optimal template) */ - if (word_size == 0 && do_megablast && - basic_options->hint == eSensitive) { - word_size = 12; + if (do_megablast && basic_options->hint == eSensitive) { + if (word_size == 0) + word_size = 11; do_discontig = TRUE; do_ag_blast = FALSE; } } + BLAST_FillLookupTableOptions(lookup_options, program_number, do_megablast, - 0, /* default threshold */ + basic_options->word_threshold, word_size, do_ag_blast, 0, /* no variable wordsize */ @@ -158,24 +168,27 @@ BLAST_FillOptions(Uint1 program_number, BLAST_FillInitialWordOptions(word_options, program_number, - greedy_align, + (Boolean)greedy_align, 0, /* default for ungapped extensions */ 0, /* no variable wordsize */ do_ag_blast, do_megablast, 0); /* default ungapped X dropoff */ + /* If we need to enforce a single-hit method, reset window size to 0. + To enforce two-hit method, set window size to a default non-zero + value */ + if (basic_options->init_seed_method == eOneHit) + word_options->window_size = 0; + else if (basic_options->init_seed_method == eTwoHits) + word_options->window_size = BLAST_WINDOW_SIZE_PROT; + BLAST_FillExtensionOptions(ext_options, program_number, greedy_align, - 0, /* default gapped X dropoff */ + basic_options->gap_x_dropoff, 0); /* default final X dropoff */ - if (greedy_align == 1) { - ext_options->algorithm_type = EXTEND_GREEDY; - word_options->ungapped_extension = FALSE; - } - if (basic_options->matrix == NULL) matrix = "BLOSUM62"; else @@ -183,12 +196,12 @@ BLAST_FillOptions(Uint1 program_number, BLAST_FillScoringOptions(score_options, program_number, - greedy_align, + (Boolean)greedy_align, basic_options->nucleotide_mismatch, basic_options->nucleotide_match, matrix, - 0, /* default gap open penalty */ - 0); /* default gap extend penalty */ + basic_options->gap_open, + basic_options->gap_extend); score_options->gapped_calculation = basic_options->gapped_calculation; @@ -197,23 +210,95 @@ BLAST_FillOptions(Uint1 program_number, 0); /* default number of alignments saved */ hit_options->percent_identity = 0; /* no percent identity cutoff */ - - BLAST_FillEffectiveLengthsOptions(eff_len_options, - 1, /* one sequence */ - subject_length, /* this long */ - 0); /* default search space */ + + eff_len_options->db_length = basic_options->db_length; + return 0; } Int2 -BLAST_TwoSequencesSearch(const BLAST_SummaryOptions *basic_options, - BioseqPtr bsp1, BioseqPtr bsp2, SeqAlign **seqalign_out) +BLAST_TwoSequencesSearch(BLAST_SummaryOptions *basic_options, + BioseqPtr bsp1, BioseqPtr bsp2, + SeqAlign **seqalign_out) { + Uint1 program_type = blast_type_undefined; SeqLocPtr query_slp = NULL; /* sequence variables */ SeqLocPtr subject_slp = NULL; + Boolean seq1_is_aa, seq2_is_aa; + Int2 status = 0; + + /* sanity checks */ + + *seqalign_out = NULL; + if (bsp1 == NULL || bsp2 == NULL) + return 0; + + seq1_is_aa = ISA_aa(bsp1->mol); + seq2_is_aa = ISA_aa(bsp2->mol); + + /* Find program type consistent with the sequences. */ + if (!seq1_is_aa && !seq2_is_aa) { + if (basic_options->program == eTblastx) + program_type = blast_type_tblastx; + else + program_type = blast_type_blastn; + } else if (seq1_is_aa && seq2_is_aa) { + program_type = blast_type_blastp; + } else if (!seq1_is_aa && seq2_is_aa) { + program_type = blast_type_blastx; + } else if (seq1_is_aa && !seq2_is_aa) { + program_type = blast_type_tblastn; + } + + /* Check if program type in options is consistent with the one determined + from sequences. */ + if (basic_options->program == eChoose) + basic_options->program = program_type; + else if (basic_options->program != program_type) + return -1; + + /* Convert the bioseqs into seqlocs. */ + + ValNodeAddPointer(&query_slp, SEQLOC_WHOLE, + SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI))); + if (!query_slp) + return -1; + ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE, + SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI))); + if (!subject_slp) + return -1; + + status = BLAST_TwoSeqLocSets(basic_options, query_slp, subject_slp, + seqalign_out, NULL, NULL); + SeqLocFree(query_slp); + SeqLocFree(subject_slp); + + return status; +} + +static Int4 SeqLocListLen(SeqLoc* seqloc) +{ + Int4 length = 0; + + for ( ; seqloc; seqloc = seqloc->next) + length += SeqLocLen(seqloc); + + return length; +} + +/** Compares one list of SeqLoc's against another list of SeqLoc's using the + * BLAST algorithm. + */ +Int2 +BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *basic_options, + SeqLoc* query_seqloc, SeqLoc* subject_seqloc, + SeqAlign **seqalign_out, + SeqLoc** filter_out, + BLAST_SummaryReturn* *extra_returns) +{ + Uint1 program_type; BlastSeqSrc *seq_src = NULL; BLAST_SequenceBlk *query = NULL; - Uint1 program_type; BlastQueryInfo* query_info = NULL; ListNode* lookup_segments = NULL; /* query filtering structures */ @@ -229,39 +314,31 @@ BLAST_TwoSequencesSearch(const BLAST_SummaryOptions *basic_options, BlastEffectiveLengthsOptions* eff_len_options = NULL; BlastScoreBlk* sbp = NULL; BlastHSPResults* results = NULL; - BlastReturnStat* return_stats = NULL; + BlastDiagnostics* diagnostics = NULL; PSIBlastOptions* psi_options = NULL; BlastDatabaseOptions* db_options = NULL; RPSInfo *rps_info = NULL; - - Int2 status; - - /* sanity checks */ - - *seqalign_out = NULL; - if (bsp1 == NULL || bsp2 == NULL) - return 0; - - if (bsp1->mol != bsp2->mol) - return 0; - - /* decide which blast program to execute */ + Int2 status = 0; + BlastHSPStream* hsp_stream; switch(basic_options->program) { - case eBlastp: - program_type = blast_type_blastp; - break; case eBlastn: - program_type = blast_type_blastn; - break; - case eChoose: - if (ISA_aa(bsp1->mol)) - program_type = blast_type_blastp; - else - program_type = blast_type_blastn; - break; + program_type = blast_type_blastn; + break; + case eBlastp: + program_type = blast_type_blastp; + break; + case eBlastx: + program_type = blast_type_blastx; + break; + case eTblastn: + program_type = blast_type_tblastn; + break; + case eTblastx: + program_type = blast_type_tblastx; + break; default: - return -1; + return -1; } /* fill in the engine-specific options */ @@ -273,36 +350,35 @@ BLAST_TwoSequencesSearch(const BLAST_SummaryOptions *basic_options, if (status != 0) goto bail_out; + if (program_type == blast_type_tblastn || + program_type == blast_type_tblastx) { + if ((status = BLAST_GeneticCodeFind(db_options->genetic_code, + &db_options->gen_code_string))) + return status; + } + + seq_src = MultiSeqSrcInit(subject_seqloc, program_type); + if (seq_src == NULL) + goto bail_out; + status = BLAST_FillOptions(program_type, basic_options, lookup_options, query_options, word_options, ext_options, hit_options, score_options, eff_len_options, psi_options, db_options, - NULL, bsp1->length, bsp2->length, rps_info); + SeqLocListLen(query_seqloc), + BLASTSeqSrcGetMaxSeqLen(seq_src), + rps_info); if (status != 0) goto bail_out; - /* Convert the bioseqs into seqlocs. */ - - ValNodeAddPointer(&query_slp, SEQLOC_WHOLE, - SeqIdDup(SeqIdFindBest(bsp1->id, SEQID_GI))); - ValNodeAddPointer(&subject_slp, SEQLOC_WHOLE, - SeqIdDup(SeqIdFindBest(bsp2->id, SEQID_GI))); - if (query_slp == NULL || subject_slp == NULL) { - status = -1; - goto bail_out; - } /* convert the seqlocs into SequenceBlks, and fill in query_info */ - status = BLAST_SetUpQuery(program_type, query_slp, query_options, + status = BLAST_SetUpQuery(program_type, query_seqloc, query_options, &query_info, &query); if (status != 0) goto bail_out; - seq_src = MultiSeqSrcInit(subject_slp, program_type); - if (seq_src == NULL) - goto bail_out; - /* perform final setup */ status = BLAST_ValidateOptions(program_type, ext_options, @@ -311,37 +387,58 @@ BLAST_TwoSequencesSearch(const BLAST_SummaryOptions *basic_options, goto bail_out; status = BLAST_MainSetUp(program_type, query_options, score_options, - hit_options, query, query_info, &lookup_segments, + hit_options, query, query_info, 1.0, &lookup_segments, &filter_loc, &sbp, NULL); if (status != 0) goto bail_out; - return_stats = (BlastReturnStat*) calloc(1, sizeof(BlastReturnStat)); - if (return_stats == NULL) - goto bail_out; + if (extra_returns) { + if ((diagnostics = Blast_DiagnosticsInit()) == NULL) + goto bail_out; + } - Blast_HSPResultsInit(query_info->num_queries, &results); LookupTableWrapInit(query, lookup_options, lookup_segments, sbp, &lookup_wrap, NULL); + /* Initialize the HSPList collector stream. Results should not be sorted + before reading from it. */ + hsp_stream = + Blast_HSPListCollectorInit(program_type, hit_options, + query_info->num_queries, FALSE); /* finally, do the search */ status = BLAST_SearchEngine(program_type, query, query_info, seq_src, sbp, score_options, lookup_wrap, word_options, ext_options, hit_options, eff_len_options, - psi_options, db_options, results, return_stats); + psi_options, db_options, hsp_stream, diagnostics, &results); + + hsp_stream = BlastHSPStreamFree(hsp_stream); if (status != 0) goto bail_out; /* Convert results to the SeqAlign form */ - status = BLAST_ResultsToSeqAlign(program_type, results, query_slp, NULL, - subject_slp, score_options, sbp, score_options->gapped_calculation, - seqalign_out); + status = BLAST_ResultsToSeqAlign(program_type, results, query_seqloc, + seq_src, score_options->gapped_calculation, + score_options->is_ooframe, seqalign_out); bail_out: - if (return_stats) - sfree(return_stats); + AdjustOffSetsInSeqAlign(*seqalign_out, query_seqloc, subject_seqloc); + + if (!status && extra_returns) { + Blast_SummaryReturnFill(program_type, score_options, sbp, + lookup_options, word_options, ext_options, + hit_options, eff_len_options, query_info, + seq_src, diagnostics, extra_returns); + } + + if (filter_out) { + *filter_out = + BlastMaskLocToSeqLoc(program_type, filter_loc, query_seqloc); + } + + Blast_DiagnosticsFree(diagnostics); + BlastMaskLocFree(filter_loc); BlastSeqSrcFree(seq_src); LookupTableWrapFree(lookup_wrap); ListNodeFreeData(lookup_segments); @@ -358,8 +455,9 @@ bail_out: BlastEffectiveLengthsOptionsFree(eff_len_options); PSIBlastOptionsFree(psi_options); BlastDatabaseOptionsFree(db_options); - SeqLocFree(query_slp); - SeqLocFree(subject_slp); return status; } + + + diff --git a/algo/blast/api/twoseq_api.h b/algo/blast/api/twoseq_api.h index f449b699..4ba5f768 100644 --- a/algo/blast/api/twoseq_api.h +++ b/algo/blast/api/twoseq_api.h @@ -1,4 +1,4 @@ -/* $Id: twoseq_api.h,v 1.2 2004/03/24 19:14:21 papadopo Exp $ +/* $Id: twoseq_api.h,v 1.3 2004/05/14 17:24:03 dondosha Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -52,9 +52,13 @@ * megablast with word size 12 is used. */ enum blast_type { - eChoose = 0, /**< blastn for nuc. sequences, blastp otherwise */ + eChoose = 0, /**< Choose type of search by sequences molecule type: + n-n=blastn, p-p=blastp, n-p=blastx, p-n=tblastn */ eBlastn = 1, /**< blastn or megablast (determined automatically) */ - eBlastp = 2 /**< blastp search on two protein sequences */ + eBlastp = 2, /**< blastp search between protein sequences */ + eBlastx = 3, /**< blastx for nucleotide vs protein sequences */ + eTblastn = 4, /**< tblastn for protein vs nucleotide sequences */ + eTblastx = 5 /**< tblastx for translated nucleotide sequences */ }; /** @@ -66,6 +70,14 @@ enum blast_hint { eFast = 1 /**< trade off sensitivity for speed */ }; +typedef enum seed_type { + eDefaultSeedType = 0, /**< BLAST will decide which method to use based on + program and other information. */ + eOneHit = 1, /**< Require only one initial hit for extension */ + eTwoHits = 2 /**< Require more than one hit within a window + for extension */ +} seed_type; + /** * The main user-visible setup structure for the API. This * only makes a (small) subset of the complete options available @@ -99,8 +111,43 @@ typedef struct { for matching letters (default 1) */ Int4 nucleotide_mismatch; /**< For nucleotide searches, the penalty for mismatching letters (default -3) */ + Int4 gap_open; /**< Cost of opening a gap. Default=0, invokes + default values: 5 for nucleotide; + depends on matrix for protein search.*/ + Int4 gap_extend; /**< Cost of extending a gap. Default=0, + invokes default values: 2 for nucleotide; + depends on matrix for protein search.*/ + Int4 gap_x_dropoff; /**< Dropoff value for the gapped extension. + Default=0, invokes default values. */ + double db_length; /**< Database length to use in statistical + calculations. + Default=0 means "database length" is set + to the subject sequence length for each + subject sequence. */ + Int4 word_threshold; /**< Threshold for finding neighboring words + in protein searches. */ + seed_type init_seed_method; /**< Single-hit or multiple-hit choice of + initial seeds for extension. */ } BLAST_SummaryOptions; +/** Small structure containing the just those Karlin-Altschul parameters needed + * for the BLAST formatting */ +typedef struct BLAST_KAParameters { + double Lambda; + double K; + double H; +} BLAST_KAParameters; + +/** Structure holding all calculated data returned from a BLAST search other + * than the alignment. + */ +typedef struct BLAST_SummaryReturn { + BLAST_KAParameters* ka_params; /**< Ungapped Karlin-Altschul parameters */ + BLAST_KAParameters* ka_params_gap;/**< Gapped Karlin-Altschul parameters */ + char* params_buffer; /**< Buffer holding the bottom of BLAST report. */ +} BLAST_SummaryReturn; + + /** * Allocate storage for an API setup structure and set the * default options for it. @@ -129,9 +176,31 @@ BLAST_SummaryOptions* BLAST_SummaryOptionsFree(BLAST_SummaryOptions *options); * If search failed or no alignments were found, set to NULL [out] * @return 0 for a successful search, nonzero if search failed */ -Int2 BLAST_TwoSequencesSearch(const BLAST_SummaryOptions *options, +Int2 BLAST_TwoSequencesSearch(BLAST_SummaryOptions *options, Bioseq *bsp1, Bioseq *bsp2, SeqAlign **seqalign_out); +/** + * Perform a BLAST search on the two input sequences and return + * the list of alignments the search generates + * @param options Structure describing how the search will be configured [in] + * @param seqloc1 The first list of sequences (queries) to be compared. + * Filtering is applied only to these sequences [in] + * @param seqloc2 The second list of sequences (subjects) to be compared [in] + * @param seqalign_out The list of alignments generated by the search. + * Alignments are sorted by query; then by subject among + * same query alignments. + * If search failed or no alignments were found, + * set to NULL [out] + * @param filter_out Masking locations [out] + * @param extra_returns Data needed to print the bottom of BLAST report [out] + * @return 0 for a successful search, nonzero if search failed + */ +Int2 BLAST_TwoSeqLocSets(const BLAST_SummaryOptions *options, + SeqLoc* seqloc1, SeqLoc* seqloc2, + SeqAlign **seqalign_out, + SeqLoc** filter_out, + BLAST_SummaryReturn* *extra_returns); + #endif /* !_TWOSEQ_API_H_ */ |