diff options
author | Aaron M. Ucko <ucko@debian.org> | 2005-06-17 02:24:46 +0000 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2005-06-17 02:24:46 +0000 |
commit | d76c50353c9e74f6915ca6352afb29ae53d45777 (patch) | |
tree | 9045ebb0b61573bfa4ff3d8778899c5a2cb76d6c /demo | |
parent | 402b112099aa816a02fd502b7f0261a99fe7126a (diff) |
Load /tmp/.../ncbi-tools6-6.1.20050605 into
branches/upstream/current.
Diffstat (limited to 'demo')
-rw-r--r-- | demo/.BLAST_VERSION | 2 | ||||
-rw-r--r-- | demo/bl2seq.c | 50 | ||||
-rw-r--r-- | demo/blast_driver.c | 136 | ||||
-rw-r--r-- | demo/blastall.c | 11 | ||||
-rw-r--r-- | demo/blastpgp.c | 85 | ||||
-rw-r--r-- | demo/copymat.c | 9 | ||||
-rw-r--r-- | demo/fastacmd.c | 18 | ||||
-rw-r--r-- | demo/formatrpsdb.c | 10 | ||||
-rw-r--r-- | demo/megablast.c | 141 | ||||
-rw-r--r-- | demo/nps2gps.c | 997 | ||||
-rw-r--r-- | demo/rpsblast.c | 271 | ||||
-rw-r--r-- | demo/tbl2asn.c | 119 | ||||
-rw-r--r-- | demo/trna2tbl.c | 238 |
13 files changed, 1739 insertions, 348 deletions
diff --git a/demo/.BLAST_VERSION b/demo/.BLAST_VERSION index 0d3ad67a..0b6e4313 100644 --- a/demo/.BLAST_VERSION +++ b/demo/.BLAST_VERSION @@ -1 +1 @@ -2.2.10 +2.2.11 diff --git a/demo/bl2seq.c b/demo/bl2seq.c index a2cc5f1a..e0866598 100644 --- a/demo/bl2seq.c +++ b/demo/bl2seq.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: bl2seq.c,v 6.75 2005/03/16 00:43:40 dondosha Exp $"; +static char const rcsid[] = "$Id: bl2seq.c,v 6.77 2005/06/02 20:45:32 dondosha Exp $"; /************************************************************************** * * @@ -27,6 +27,12 @@ static char const rcsid[] = "$Id: bl2seq.c,v 6.75 2005/03/16 00:43:40 dondosha E *************************************************************************** * * $Log: bl2seq.c,v $ +* Revision 6.77 2005/06/02 20:45:32 dondosha +* Use BlastFormattingInfo structure for formatting +* +* Revision 6.76 2005/05/02 17:00:27 coulouri +* change default to new engine +* * Revision 6.75 2005/03/16 00:43:40 dondosha * Correction to previous commit to make reported deflines the same as before * @@ -550,8 +556,8 @@ static Args myargs [] = { "F", NULL, NULL, TRUE, 'U', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_LCASE */ { "Input sequences in the form of accession.version", "F", NULL, NULL, FALSE, 'A', ARG_BOOLEAN, 0.0, 0, NULL}, /* ARG_ACCN */ - {"Force use of old engine", - "T", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL} /* ARG_FORCE_OLD */ + {"Force use of the legacy BLAST engine", + "F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL} /* ARG_FORCE_OLD */ }; /** @@ -807,7 +813,7 @@ Int2 Main_new(void) BioseqPtr query_bsp=NULL, subject_bsp=NULL; BioseqPtr bsp1=NULL, bsp2=NULL; BioseqPtr fake_bsp=NULL, fake_subject_bsp=NULL; - BlastFormattingOptions* format_options; + BlastFormattingInfo* format_info = NULL; BLAST_SummaryOptions* options=NULL; Blast_SummaryReturn* extra_returns=NULL; Boolean believe_query= FALSE; @@ -818,7 +824,7 @@ Int2 Main_new(void) DbtagPtr dbtagptr; EBlastProgramType program_number; Int2 status; /* return value */ - Int4 align_view=0; /* Used for formatting */ + EAlignView align_view = eAlignViewPairwise; /* Used for formatting */ SeqAlignPtr seqalign=NULL; SeqEntryPtr sep=NULL, sep1=NULL; SeqLocPtr slp1, slp2; /* Used for actual search. */ @@ -826,14 +832,16 @@ Int2 Main_new(void) SeqLocPtr lcase_mask=NULL; /* For lower-case masking info from query FASTA. */ SeqLoc* repeat_mask = NULL; /* Repeat mask locations */ Uint1 strand_option = 0; /* FIXME */ + SBlastOptions* search_options = NULL; /* Needed for formatting. */ strand_option = (Uint1) myargs[ARG_STRAND].intvalue; entrez_lookup = (Boolean) myargs[ARG_ACCN].intvalue; seqannot_output = (myargs[ARG_ASNOUT].strvalue != NULL); believe_query = (seqannot_output || entrez_lookup); + /* Non-zero value for -m option means tabular output. */ if (myargs[ARG_FORMAT].intvalue != 0) - align_view = 9; /* Means tabular output. */ + align_view = eAlignViewTabularWithComments; BlastProgram2Number(myargs[ARG_PROGRAM].strvalue, &program_number); @@ -914,17 +922,17 @@ Int2 Main_new(void) asnout = AsnIoClose(asnout); } - if ((status = BlastFormattingOptionsNew(program_number, - myargs[ARG_OUT].strvalue, 0, 1, align_view, - &format_options)) != 0) - return status; - - if (myargs[ARG_HTML].intvalue) { - format_options->html = TRUE; - format_options->align_options += TXALIGN_HTML; - format_options->print_options += TXALIGN_HTML; - } - format_options->believe_query = believe_query; + /* Pass NULL for the database name, since there is no database. */ + BlastFormattingInfoNewBasic(align_view, options, slp1, + myargs[ARG_OUT].strvalue, + &search_options, &format_info); + + /* Always show gis in the output, hence pass TRUE for respective + argument. */ + BlastFormattingInfoSetUpOptions(format_info, 0, 1, + (Boolean) myargs[ARG_HTML].intvalue, + (Boolean) myargs[ARG_USEMEGABLAST].intvalue, + TRUE, believe_query); if (mask_at_hash) { /* Masking locations in SeqLoc form are no longer needed */ @@ -933,14 +941,14 @@ Int2 Main_new(void) /* Format the results */ status = - BLAST_FormatResults(seqalign, NULL, myargs[ARG_PROGRAM].strvalue, - 1, slp1, filter_loc, format_options, FALSE, NULL, + BLAST_FormatResults(seqalign, 1, slp1, filter_loc, format_info, extra_returns); - status = Blast_PrintOutputFooter(program_number, format_options, NULL, extra_returns); + status = Blast_PrintOutputFooter(format_info, extra_returns); - format_options = BlastFormattingOptionsFree(format_options); + format_info = BlastFormattingInfoFree(format_info); extra_returns = Blast_SummaryReturnFree(extra_returns); + search_options = SBlastOptionsFree(search_options); if (entrez_lookup) { BioseqFree(query_bsp); diff --git a/demo/blast_driver.c b/demo/blast_driver.c index 48ac0c75..f9b9a759 100644 --- a/demo/blast_driver.c +++ b/demo/blast_driver.c @@ -1,4 +1,4 @@ -/* $Id: blast_driver.c,v 1.92 2005/04/27 20:01:00 dondosha Exp $ +/* $Id: blast_driver.c,v 1.96 2005/06/02 20:59:38 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,10 +32,10 @@ Author: Ilya Dondoshansky Contents: Main function for running BLAST ****************************************************************************** - * $Revision: 1.92 $ + * $Revision: 1.96 $ * */ -static char const rcsid[] = "$Id: blast_driver.c,v 1.92 2005/04/27 20:01:00 dondosha Exp $"; +static char const rcsid[] = "$Id: blast_driver.c,v 1.96 2005/06/02 20:59:38 dondosha Exp $"; #include <ncbi.h> #include <sqnutils.h> @@ -247,7 +247,6 @@ s_FillOptions(SBlastOptions* options) Boolean greedy_with_ungapped = FALSE; Boolean is_gapped = FALSE; EBlastProgramType program_number = options->program; - Boolean use_pssm = FALSE; /* The following options are for blastn only */ if (program_number == eBlastTypeBlastn) { @@ -268,7 +267,7 @@ s_FillOptions(SBlastOptions* options) BLAST_FillLookupTableOptions(lookup_options, program_number, mb_lookup, myargs[ARG_THRESHOLD].intvalue, (Int2)myargs[ARG_WORDSIZE].intvalue, - variable_wordsize, use_pssm); + variable_wordsize); /* Fill the rest of the lookup table options */ lookup_options->mb_template_length = (Uint1) myargs[ARG_TEMPL_LEN].intvalue; @@ -391,9 +390,9 @@ Int2 Nlm_Main(void) Int2 status = 0; SeqLoc* lcase_mask = NULL; SeqLoc* query_slp = NULL; - FILE *infp, *outfp; + FILE *infp, *outfp = NULL; SBlastOptions* options = NULL; - BlastFormattingOptions* format_options = NULL; + BlastFormattingInfo* format_info = NULL; Int2 ctr = 1; Int4 num_queries=0; int tabular_output = FALSE; @@ -403,6 +402,7 @@ Int2 Nlm_Main(void) Blast_SummaryReturn* sum_returns = NULL; Boolean phi_blast = FALSE; ValNode* phivnps = NULL; + Blast_SummaryReturn* full_sum_returns = NULL; if (! GetArgs (buf, NUMARG, myargs)) return (1); @@ -439,8 +439,7 @@ Int2 Nlm_Main(void) program_number == eBlastTypeTblastx || program_number == eBlastTypePhiBlastn); - phi_blast = (program_number == eBlastTypePhiBlastn || - program_number == eBlastTypePhiBlastp); + phi_blast = Blast_ProgramIsPhiBlast(program_number); if ((dbname = myargs[ARG_DB].strvalue) == NULL) { Int4 letters_read; @@ -475,59 +474,50 @@ Int2 Nlm_Main(void) s_FillOptions(options); - tabular_output = myargs[ARG_TABULAR].intvalue; - - if (tabular_output) { - if ((outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) { - ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output file %s\n", - myargs[ARG_OUT].strvalue); - return (1); - } - } else { - if ((status = BlastFormattingOptionsNew(program_number, - myargs[ARG_OUT].strvalue, - myargs[ARG_DESCRIPTIONS].intvalue, - myargs[ARG_ALIGNMENTS].intvalue, - myargs[ARG_FORMAT].intvalue, &format_options)) != 0) - return status; - - if (myargs[ARG_HTML].intvalue) { - format_options->html = TRUE; - format_options->align_options += TXALIGN_HTML; - format_options->print_options += TXALIGN_HTML; - } - - if (myargs[ARG_SHOWGI].intvalue) { - format_options->align_options += TXALIGN_SHOW_GI; - format_options->print_options += TXALIGN_SHOW_GI; - } - - if (dbname) { - BLAST_PrintOutputHeader(format_options, - (Boolean)myargs[ARG_GREEDY].intvalue, blast_program, dbname, !db_is_na); - } - - if (myargs[ARG_UNGAPPED].intvalue != 0) - format_options->print_options += TXALIGN_SHOW_NO_OF_SEGS; - - } - if ((infp = FileOpen(myargs[ARG_QUERY].strvalue, "r")) == NULL) { ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open input file %s\n", myargs[ARG_QUERY].strvalue); return (1); } - if (tabular_output) { - believe_defline = TRUE; - } - if (s_ParseIntervalLocationArgument(myargs[ARG_QUERY_LOC].strvalue, &q_from, &q_to)) { ErrPostEx(SEV_FATAL, 1, 0, "Invalid query sequence location\n"); return -1; } + tabular_output = myargs[ARG_TABULAR].intvalue; + + /* For on-the-fly tabular and for ASN.1 output, the believe query defline + option must be set to true, otherwise it should be false. */ + believe_defline = + (tabular_output || myargs[ARG_FORMAT].intvalue == eAlignViewAsnText || + myargs[ARG_FORMAT].intvalue == eAlignViewAsnBinary); + + + if (tabular_output) { + if ((outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) { + ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output file %s\n", + myargs[ARG_OUT].strvalue); + return (1); + } + } else { + BlastFormattingInfoNew(myargs[ARG_FORMAT].intvalue, options, + blast_program, dbname, + myargs[ARG_OUT].strvalue, &format_info); + + BlastFormattingInfoSetUpOptions(format_info, + myargs[ARG_DESCRIPTIONS].intvalue, + myargs[ARG_ALIGNMENTS].intvalue, + (Boolean) myargs[ARG_HTML].intvalue, + (Boolean) myargs[ARG_GREEDY].intvalue, + (Boolean) myargs[ARG_SHOWGI].intvalue, + believe_defline); + + if (dbname) + BLAST_PrintOutputHeader(format_info); + } + /* Get the query (queries), loop if necessary. */ while (1) { SeqAlign* seqalign = NULL; @@ -539,7 +529,7 @@ Int2 Nlm_Main(void) if ((Boolean)myargs[ARG_LCASE].intvalue) { letters_read = BLAST_GetQuerySeqLoc(infp, query_is_na, - (Uint1)myargs[ARG_STRAND].intvalue, 0, q_from, q_to, &lcase_mask, + (Uint1)myargs[ARG_STRAND].intvalue, 10000, q_from, q_to, &lcase_mask, &query_slp, &ctr, &num_queries, believe_defline); } else { letters_read = @@ -622,9 +612,8 @@ Int2 Nlm_Main(void) if (!status) { if (phi_blast) { status = - PHIBlastFormatResults(program_number, phivnps, dbname, - query_slp, format_options, - sum_returns); + PHIBlastFormatResults(phivnps, query_slp, format_info, + sum_returns); phivnps = PHIBlastResultsFree(phivnps); } else if (!tabular_output) { if (myargs[ARG_ASNOUT].strvalue) { @@ -636,40 +625,39 @@ Int2 Nlm_Main(void) /* Format the results */ status = - BLAST_FormatResults(seqalign, dbname, blast_program, - num_queries, query_slp, - filter_loc, format_options, - (Boolean)myargs[ARG_FRAMESHIFT].intvalue, - NULL, sum_returns); + BLAST_FormatResults(seqalign, num_queries, query_slp, + filter_loc, format_info, sum_returns); seqalign = SeqAlignSetFree(seqalign); } - Blast_PrintOutputFooter(program_number, format_options, dbname, - sum_returns); } - /* Clean the summary returns substructures. */ + /* Update the cumulative summary returns structure and clean the returns + substructures for the current search iteration. */ + Blast_SummaryReturnUpdate(sum_returns, &full_sum_returns); Blast_SummaryReturnClean(sum_returns); filter_loc = Blast_ValNodeMaskListFree(filter_loc); query_slp = SeqLocSetFree(query_slp); } /* End loop on sets of queries */ + if (infp) + FileClose(infp); + subject_slp = SeqLocSetFree(subject_slp); - options = SBlastOptionsFree(options); + + /* Print the footer with cumulative summary information. */ + Blast_PrintOutputFooter(format_info, full_sum_returns); + sum_returns = Blast_SummaryReturnFree(sum_returns); + full_sum_returns = Blast_SummaryReturnFree(full_sum_returns); - if (!tabular_output) { - if(format_options->html && myargs[ARG_FORMAT].intvalue < 7) { - fprintf(format_options->outfp, "</PRE>\n</BODY>\n</HTML>\n"); - } - BlastFormattingOptionsFree(format_options); - } else { - FileClose(outfp); - } + if (!tabular_output) + format_info = BlastFormattingInfoFree(format_info); + else + FileClose(outfp); + + options = SBlastOptionsFree(options); - if (infp) - FileClose(infp); - return status; } diff --git a/demo/blastall.c b/demo/blastall.c index 202ff6b3..00fad90f 100644 --- a/demo/blastall.c +++ b/demo/blastall.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: blastall.c,v 6.150 2005/02/07 15:30:39 dondosha Exp $"; +static char const rcsid[] = "$Id: blastall.c,v 6.151 2005/05/05 14:41:32 coulouri Exp $"; -/* $Id: blastall.c,v 6.150 2005/02/07 15:30:39 dondosha Exp $ +/* $Id: blastall.c,v 6.151 2005/05/05 14:41:32 coulouri Exp $ ************************************************************************** * * * COPYRIGHT NOTICE * @@ -28,6 +28,9 @@ static char const rcsid[] = "$Id: blastall.c,v 6.150 2005/02/07 15:30:39 dondosh ************************************************************************** * * $Log: blastall.c,v $ + * Revision 6.151 2005/05/05 14:41:32 coulouri + * plug object manager entity id leak - rt ticket 15084082 + * * Revision 6.150 2005/02/07 15:30:39 dondosha * Removed restriction on the value of longest intron option * @@ -1847,8 +1850,6 @@ Int2 Main (void) } } /* show text align, loop over seqalign/seqannots for concat */ ObjMgrClearHold(); - - ObjMgrFreeCache(0); } /* if outfp */ for (sap_iter=0; sap_iter < num_queries; sap_iter++) { /* upper bound is num_queries, take care not to do this unless concat */ @@ -1965,6 +1966,8 @@ Int2 Main (void) if (!options->is_megablast_search) BlastDeleteUserErrorString(err_ticket); + + ObjMgrFreeCache(0); } /* while(TRUE) - main loop of the program over all FASTA entries */ #ifdef BLAST_CS_API diff --git a/demo/blastpgp.c b/demo/blastpgp.c index c0320b17..2c69bfff 100644 --- a/demo/blastpgp.c +++ b/demo/blastpgp.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: blastpgp.c,v 6.127 2005/04/04 14:57:47 papadopo Exp $"; +static char const rcsid[] = "$Id: blastpgp.c,v 6.130 2005/05/18 17:35:49 papadopo Exp $"; -/* $Id: blastpgp.c,v 6.127 2005/04/04 14:57:47 papadopo Exp $ */ +/* $Id: blastpgp.c,v 6.130 2005/05/18 17:35:49 papadopo Exp $ */ /************************************************************************** * * * COPYRIGHT NOTICE * @@ -26,8 +26,18 @@ static char const rcsid[] = "$Id: blastpgp.c,v 6.127 2005/04/04 14:57:47 papadop * appreciated. * * * ************************************************************************** - * $Revision: 6.127 $ + * $Revision: 6.130 $ * $Log: blastpgp.c,v $ + * Revision 6.130 2005/05/18 17:35:49 papadopo + * add warnings if new composition-based statistics options are selected + * + * Revision 6.129 2005/05/17 17:51:20 papadopo + * make the -t argument a string and handle values of 'T' or 'F' manually (required for backward compatibility) + * + * Revision 6.128 2005/05/16 17:41:00 papadopo + * From Alejandro Schaffer: Added support for compositional adjustment + * of matrices via enhanced -t flag, which now is integer to allow 4 options. + * * Revision 6.127 2005/04/04 14:57:47 papadopo * remove requirement for a fasta format query file if restarting from scoremat * @@ -528,6 +538,8 @@ star_callback(Int4 sequence_number, Int4 number_of_positive_hits) return 0; } +#define EVALUE_EXPAND 10 + #define YES_TO_DECLINE_TO_ALIGN #define NUMARG (sizeof(myargs)/sizeof(myargs[0])) @@ -617,8 +629,13 @@ static Args myargs[] = { NULL, NULL, NULL, TRUE, 'l', ARG_STRING, 0.0, 0, NULL}, {"Use lower case filtering of FASTA sequence", /* 41 */ "F", NULL,NULL,TRUE,'U',ARG_BOOLEAN, 0.0,0,NULL}, - { "Use composition based statistics", /* 42 */ - "T", NULL, NULL, FALSE, 't', ARG_BOOLEAN, 0.0, 0, NULL}, + { "Use composition based statistics\n" /* 42 */ + "0 or F or f: no composition-based statistics\n" + "1 or T or t: Composition-based statistics as in NAR 29:2994--3005, 2001\n" + "2: Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties in round 1\n" + "3: Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally in round 1\n", + + "1", NULL, NULL, FALSE, 't', ARG_STRING, 0.0, 0, NULL}, { "ASN.1 Scoremat input of checkpoint data:\n" "0: no scoremat input\n" "1: Restart is from ASCII scoremat checkpoint file,\n" @@ -990,7 +1007,33 @@ PGPBlastOptionsPtr PGPReadBlastOptions(void) if (myargs[34].floatvalue) options->searchsp_eff = (Nlm_FloatHi) myargs[34].floatvalue; - options->tweak_parameters = (Boolean) myargs[42].intvalue; + switch (myargs[42].strvalue[0]) { + case 'F': + case 'f': + case '0': + options->tweak_parameters = NO_COMP_ADJUSTMENT; + break; + case 'T': + case 't': + case '1': + options->tweak_parameters = COMP_BASED_STATISTICS; + break; + case '2': + ErrPostEx(SEV_WARNING, 1, 0, "this argument for composition-" + "based statistics is currently experimental\n"); + options->tweak_parameters = COMP_MATRIX_ADJUSTMENT; + break; + case '3': + ErrPostEx(SEV_WARNING, 1, 0, "this argument for composition-" + "based statistics is currently experimental\n"); + options->tweak_parameters = COMP_BASED_STATISTICS | + COMP_MATRIX_ADJUSTMENT; + break; + default: + ErrPostEx(SEV_FATAL, 1, 0, "invalid argument for composition-" + "based statistics; see -t options\n"); + break; + } options->smith_waterman = (Boolean) myargs[33].intvalue; if (bop->options->tweak_parameters) { @@ -998,6 +1041,14 @@ PGPBlastOptionsPtr PGPReadBlastOptions(void) hitlist_size */ bop->options->original_expect_value = bop->options->expect_value; bop->options->hitlist_size *= 2; + if (bop->options->tweak_parameters > 1) { + if ((NULL == myargs[29].strvalue) && (NULL == myargs[39].strvalue)) { + /*round 1 and not recovering from checkpoint*/ + bop->options->expect_value = EVALUE_EXPAND * bop->options->expect_value; + } + else + bop->options->tweak_parameters = 1; + } } @@ -1089,6 +1140,16 @@ Boolean PGPFormatHeader(PGPBlastOptionsPtr bop) fprintf(bop->outfp, "\n"); BlastPrintReference(html, 90, bop->outfp); fprintf(bop->outfp, "\n"); + if (bop->options->tweak_parameters > 1) { + CAdjustmentPrintReference(html, 90, bop->outfp); + fprintf(bop->outfp, "\n"); + } + if ((1== bop->options->tweak_parameters) || + ((1 < bop->options->tweak_parameters) && (bop->options->maxNumPasses > 1))) { + CBStatisticsPrintReference(html, 90, (1 == bop->options->tweak_parameters), + (bop->options->maxNumPasses > 1), bop->outfp); + fprintf(bop->outfp, "\n"); + } AcknowledgeBlastQuery(bop->query_bsp, 70, bop->outfp, bop->believe_query, html); PrintDbInformation(bop->blast_database, TRUE, 70, bop->outfp, html); @@ -1677,6 +1738,7 @@ Int2 Main (void) BioseqBlastEngineCore for the second pass. */ bop->options->original_expect_value = bop->options->expect_value; + search->pbp->cutoff_e = bop->options->expect_value; bop->options->hitlist_size *= 2; } @@ -1846,9 +1908,14 @@ Int2 Main (void) head = SeqAlignSetFree(head); /* Here we will print out footer of BLAST output */ - - if(!bop->is_xml_output && !tabular_output) { - PGPFormatFooter(bop, search); + + /*need to temporarily adjust cutoff_e for printing*/ + if(!bop->is_xml_output && !tabular_output) { + if ((bop->options->tweak_parameters > 1) && (1 == thisPassNum)) + search->pbp->cutoff_e = EVALUE_EXPAND * bop->options->expect_value; + PGPFormatFooter(bop, search); + if ((bop->options->tweak_parameters > 1) && (1 == thisPassNum)) + search->pbp->cutoff_e = bop->options->original_expect_value; } /* PGPOneQueryCleanup */ diff --git a/demo/copymat.c b/demo/copymat.c index b9a5b96b..4f802e39 100644 --- a/demo/copymat.c +++ b/demo/copymat.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: copymat.c,v 6.41 2005/02/14 14:11:55 camacho Exp $"; +static char const rcsid[] = "$Id: copymat.c,v 6.42 2005/05/20 18:57:51 camacho Exp $"; /* * =========================================================================== @@ -36,6 +36,9 @@ Contents: main routines for copymatrices program to convert score matrices output by makematrices into a single byte-encoded file. $Log: copymat.c,v $ +Revision 6.42 2005/05/20 18:57:51 camacho +Update to use new signature to BLAST_FillLookupTableOptions + Revision 6.41 2005/02/14 14:11:55 camacho Changes to use SBlastScoreMatrix @@ -608,8 +611,8 @@ Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles, sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1); RPSPsiMatrixAttach(sbp, posMatrix); LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options); - BLAST_FillLookupTableOptions(lookup_options, eBlastTypeBlastp, FALSE, - (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue, FALSE, TRUE); /* add last arg for psi-blast?? */ + BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE, + (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue, FALSE); BlastSeqLocNew(&lookup_segment, 0, all_length); diff --git a/demo/fastacmd.c b/demo/fastacmd.c index 4387f4f3..87f6d742 100644 --- a/demo/fastacmd.c +++ b/demo/fastacmd.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: fastacmd.c,v 6.34 2004/12/04 03:39:48 camacho Exp $"; +static char const rcsid[] = "$Id: fastacmd.c,v 6.35 2005/05/05 15:54:06 dondosha Exp $"; -/* $Id: fastacmd.c,v 6.34 2004/12/04 03:39:48 camacho Exp $ +/* $Id: fastacmd.c,v 6.35 2005/05/05 15:54:06 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -31,12 +31,15 @@ static char const rcsid[] = "$Id: fastacmd.c,v 6.34 2004/12/04 03:39:48 camacho * * Initial Version Creation Date: 05/20/1997 * -* $Revision: 6.34 $ +* $Revision: 6.35 $ * * File Description: * FASTA retrievel system using ISAM indexes * * $Log: fastacmd.c,v $ +* Revision 6.35 2005/05/05 15:54:06 dondosha +* Enhanced comment for the -s option and fixed typo in -i option description +* * Revision 6.34 2004/12/04 03:39:48 camacho * Set range of data on -D option * @@ -173,11 +176,12 @@ static Args myargs [] = { " T - protein \n" " F - nucleotide", "G", NULL,NULL,TRUE,'p',ARG_STRING,0.0,0,NULL}, - { "Search string: GIs, accessions and loci may be used delimited\n" - " by comma.", /* 2 */ + { "Comma-delimited search string(s).\n" + " GIs, accessions, loci, or fullSeq-id strings may be used,\n" + " e.g. 555, AC147927, 'gnl|dbname|tag'", /* 2 */ NULL, NULL, NULL, TRUE, 's', ARG_STRING, 0.0, 0, NULL}, - { "Input file wilth GIs/accessions/loci for batch\n" - " retrieval",/* 3 */ + { "Input file with GIs/accessions/loci for batch\n" + " retrieval", /* 3 */ NULL, NULL, NULL, TRUE, 'i', ARG_STRING, 0.0, 0, NULL}, { "Retrieve duplicate accessions", /* 4 */ "F", NULL, NULL, TRUE, 'a', ARG_BOOLEAN, 0.0, 0, NULL}, diff --git a/demo/formatrpsdb.c b/demo/formatrpsdb.c index aabda5dd..0a53c78f 100644 --- a/demo/formatrpsdb.c +++ b/demo/formatrpsdb.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: formatrpsdb.c,v 1.15 2005/02/22 14:17:31 camacho Exp $"; +static char const rcsid[] = "$Id: formatrpsdb.c,v 1.16 2005/05/20 18:57:51 camacho Exp $"; /***************************************************************************** @@ -38,6 +38,9 @@ static char const rcsid[] = "$Id: formatrpsdb.c,v 1.15 2005/02/22 14:17:31 camac *************************************************************************** $Log: formatrpsdb.c,v $ + Revision 1.16 2005/05/20 18:57:51 camacho + Update to use new signature to BLAST_FillLookupTableOptions + Revision 1.15 2005/02/22 14:17:31 camacho Fix bioseq data type @@ -633,12 +636,11 @@ Int2 RPSAddFirstSequence(RPS_DbInfo *info, } if (BLAST_FillLookupTableOptions(info->lookup_options, - eBlastTypeBlastp, + eBlastTypePsiBlast, FALSE, /* no megablast */ threshold, /* neighboring threshold */ BLAST_WORDSIZE_PROT, - FALSE, /* no variable words */ - TRUE /* use a PSSM */ + FALSE /* no variable words */ ) != 0) { ErrPostEx(SEV_ERROR, 0, 0, "Cannot set lookup table options"); return 1; diff --git a/demo/megablast.c b/demo/megablast.c index dc400340..32e0d167 100644 --- a/demo/megablast.c +++ b/demo/megablast.c @@ -1,5 +1,5 @@ -static char const rcsid[] = "$Id: megablast.c,v 6.158 2005/04/27 14:55:09 papadopo Exp $"; -/* $Id: megablast.c,v 6.158 2005/04/27 14:55:09 papadopo Exp $ +static char const rcsid[] = "$Id: megablast.c,v 6.162 2005/06/02 20:45:05 dondosha Exp $"; +/* $Id: megablast.c,v 6.162 2005/06/02 20:45:05 dondosha Exp $ ************************************************************************** * * * COPYRIGHT NOTICE * @@ -26,6 +26,20 @@ static char const rcsid[] = "$Id: megablast.c,v 6.158 2005/04/27 14:55:09 papado * * ************************************************************************** * $Log: megablast.c,v $ + * Revision 6.162 2005/06/02 20:45:05 dondosha + * 1. Print footer only once after the loop over sets of queries; + * 2. Format XML output in a valid XML; + * 3. Use BlastFormattingInfo structure for formatting. + * + * Revision 6.161 2005/05/20 18:57:51 camacho + * Update to use new signature to BLAST_FillLookupTableOptions + * + * Revision 6.160 2005/05/02 17:44:07 coulouri + * bring default evalue and output type in line with other blast programs + * + * Revision 6.159 2005/05/02 17:00:28 coulouri + * change default to new engine + * * Revision 6.158 2005/04/27 14:55:09 papadopo * change signature of BlastFillHitSavingOptions * @@ -528,6 +542,7 @@ static char const rcsid[] = "$Id: megablast.c,v 6.158 2005/04/27 14:55:09 papado #include <algo/blast/api/blast_api.h> #include <algo/blast/api/blast_seq.h> #include <algo/blast/api/repeats_filter.h> +#include <algo/blast/core/blast_setup.h> #endif #define DEFLINE_BUF 255 @@ -581,7 +596,6 @@ MegaBlastPrintEndpoints(VoidPtr ptr) BLAST_HSPPtr hsp; Int2 context; Char context_sign; - Uint4 header_index = 0; Int4 subject_gi, score; FILE *fp = (FILE *) search->output; @@ -611,7 +625,7 @@ MegaBlastPrintEndpoints(VoidPtr ptr) !StringICmp(db_tag->db, "TI")) && db_tag->tag->id != 0) { subject_buffer = (CharPtr) Malloc(16); - sprintf(subject_buffer, "%ld", db_tag->tag->id); + sprintf(subject_buffer, "%ld", (long) db_tag->tag->id); } else { subject_buffer = StringTokMT(subject_descr, " \t", &subject_descr); subject_descr = subject_buffer; @@ -680,9 +694,8 @@ MegaBlastPrintEndpoints(VoidPtr ptr) query_buffer = StringTokMT(title, " ", &title); else { Int4 query_gi; - Boolean numeric_query_id = - GetAccessionFromSeqId(query_bsp->id, &query_gi, - &query_buffer); + GetAccessionFromSeqId(query_bsp->id, &query_gi, + &query_buffer); } BioseqUnlock(query_bsp); } else { @@ -720,7 +733,7 @@ MegaBlastPrintEndpoints(VoidPtr ptr) hsp->subject.end += s_shift; if (numeric_sip_type) - fprintf(fp, "'%ld'=='%c%s' (%d %d %d %d) %d\n", subject_gi, + fprintf(fp, "'%ld'=='%c%s' (%d %d %d %d) %d\n", (long) subject_gi, context_sign, query_buffer, hsp->subject.offset, q_start, hsp->subject.end, q_end, score); else @@ -859,9 +872,8 @@ MegaBlastPrintSegments(VoidPtr ptr) query_buffer = StringTokMT(title, " ", &title); else { Int4 query_gi; - Boolean numeric_query_id = - GetAccessionFromSeqId(query_bsp->id, &query_gi, - &query_buffer); + GetAccessionFromSeqId(query_bsp->id, &query_gi, + &query_buffer); } BioseqUnlock(query_bsp); } else { @@ -876,7 +888,7 @@ MegaBlastPrintSegments(VoidPtr ptr) if (numeric_sip_type) sprintf(buffer, "\n#'>%ld'=='%c%s' (%d %d %d %d) %d\na {\n s %d\n b %d %d\n e %d %d\n", - subject_gi, strand, query_buffer, + (long) subject_gi, strand, query_buffer, s_start, q_start, s_end, q_end, score, score, s_start, q_start, s_end, q_end); else @@ -1030,7 +1042,7 @@ static Args myargs [] = { { "Query File", NULL, NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, /* ARG_QUERY */ { "Expectation value", - "1000000.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},/* ARG_EVALUE */ + "10.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},/* ARG_EVALUE */ { "alignment view options:\n0 = pairwise,\n1 = query-anchored showing identities,\n2 = query-anchored no identities,\n3 = flat query-anchored, show identities,\n4 = flat query-anchored, no identities,\n5 = query-anchored no identities and blunt ends,\n6 = flat query-anchored, no identities and blunt ends,\n7 = XML Blast output,\n8 = tabular, \n9 tabular with comment lines,\n10 ASN, text\n11 ASN, binary", "0", NULL, NULL, FALSE, 'm', ARG_INT, 0.0, 0, NULL}, /* ARG_FORMAT */ { "BLAST report Output File", @@ -1050,7 +1062,7 @@ static Args myargs [] = { { "Number of database sequence to show alignments for (B)", "250", NULL, NULL, FALSE, 'b', ARG_INT, 0.0, 0, NULL}, /* ARG_ALIGNMENTS */ { "Type of output:\n0 - alignment endpoints and score,\n1 - all ungapped segments endpoints,\n2 - traditional BLAST output,\n3 - tab-delimited one line format", - "0", NULL, NULL, FALSE, 'D', ARG_INT, 0.0, 0, NULL}, /* ARG_OUTTYPE */ + "2", NULL, NULL, FALSE, 'D', ARG_INT, 0.0, 0, NULL}, /* ARG_OUTTYPE */ { "Number of processors to use", "1", NULL, NULL, FALSE, 'a', ARG_INT, 0.0, 0, NULL}, /* ARG_THREADS */ { "ASN.1 SeqAlign file; must be used in conjunction with -D2 option", @@ -1108,8 +1120,8 @@ static Args myargs [] = { "0", NULL, NULL, FALSE, 'H', ARG_INT, 0.0, 0, NULL}, /* ARG_MAXHSP */ #endif #if MB_ALLOW_NEW - {"Force use of old engine", - "T", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL} /* ARG_FORCE_OLD */ + {"Force use of the legacy BLAST engine", + "F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL} /* ARG_FORCE_OLD */ #endif }; @@ -1119,7 +1131,7 @@ static Args myargs [] = { static Int2 Main_old (void) { - AsnIoPtr aip, xml_aip; + AsnIoPtr aip, xml_aip = NULL; BioseqPtr query_bsp, PNTR query_bsp_array; BioSourcePtr source; BLAST_MatrixPtr matrix; @@ -1762,11 +1774,9 @@ BLAST_FillOptions(SBlastOptions* options) BlastExtensionOptions* ext_options = options->ext_options; BlastHitSavingOptions* hit_options = options->hit_options ; BlastScoringOptions* score_options = options->score_options; - BlastEffectiveLengthsOptions* eff_len_options = options->eff_len_options; Boolean ag_blast = FALSE, variable_wordsize = FALSE, mb_lookup = TRUE; Boolean greedy=TRUE; /* greedy alignment should be done. */ double lambda=0; - Int2 status; const Uint1 program_number = eBlastTypeBlastn; if (myargs[ARG_DYNAMIC].intvalue != 0) @@ -1795,7 +1805,7 @@ BLAST_FillOptions(SBlastOptions* options) } BLAST_FillLookupTableOptions(lookup_options, program_number, mb_lookup, - 0, myargs[ARG_WORDSIZE].intvalue, variable_wordsize, FALSE); + 0, myargs[ARG_WORDSIZE].intvalue, variable_wordsize); /* Fill the rest of the lookup table options */ lookup_options->mb_template_length = (Uint1) myargs[ARG_TEMPL_LEN].intvalue; @@ -1850,25 +1860,24 @@ BLAST_FillOptions(SBlastOptions* options) static Int2 Main_new(void) { const Boolean query_is_na = TRUE; - const Boolean db_is_na = TRUE; Boolean believe_query = FALSE; char* program_name = "blastn"; Int2 status = 0; Int4 start=0, end=0; /* start and end of sequence to be searched as specified by ARG_QUERYLOC */ SeqLoc* lcase_mask = NULL; - SeqLoc* filter_loc=NULL; /* All masking locations */ SeqLoc* query_slp = NULL; FILE *infp=NULL, *outfp=NULL; SBlastOptions* options = NULL; - BlastFormattingOptions* format_options; Int2 ctr = 1; Int4 num_queries_total=0; /* total number of queries read. */ Boolean tabular_output = FALSE; BlastTabularFormatData* tf_data = NULL; int num_threads; char* dbname = myargs[ARG_DB].strvalue; + BlastFormattingInfo* format_info = NULL; Blast_SummaryReturn* sum_returns = NULL; - Int4 align_view; + Blast_SummaryReturn* full_sum_returns = NULL; + EAlignView align_view = eAlignViewMax; if (myargs[ARG_OUTTYPE].intvalue == 3) tabular_output = TRUE; @@ -1886,19 +1895,19 @@ static Int2 Main_new(void) BLAST_FillOptions(options); - align_view = myargs[ARG_FORMAT].intvalue; + if (myargs[ARG_FORMAT].intvalue >= eAlignViewPairwise && + myargs[ARG_FORMAT].intvalue < eAlignViewMax) + align_view = (EAlignView) myargs[ARG_FORMAT].intvalue; + else + ErrPostEx(SEV_FATAL, 1, 0, "Unsupported value for the -m option"); - /* If tabular output is requested, set believe_query to TRUE, - to guarantee meaningful sequence ids in the output. */ - if (tabular_output) - believe_query = TRUE; - else - believe_query = (Boolean) myargs[ARG_BELIEVEQUERY].intvalue; + believe_query = (Boolean) myargs[ARG_BELIEVEQUERY].intvalue; /* If ASN.1 output is requested and believe_query is not set to TRUE, exit with an error. */ if (!believe_query && (myargs[ARG_ASNOUT].strvalue || - align_view == 10 || align_view == 11)) { + align_view == eAlignViewAsnText || + align_view == eAlignViewAsnBinary)) { ErrPostEx(SEV_FATAL, 1, 0, "-J option must be TRUE to produce ASN.1 output; before " "changing -J to TRUE please also ensure that all query " @@ -1907,31 +1916,30 @@ static Int2 Main_new(void) } if (!tabular_output) { - if ((status = BlastFormattingOptionsNew(options->program, - myargs[ARG_OUT].strvalue, - myargs[ARG_DESCRIPTIONS].intvalue, - myargs[ARG_ALIGNMENTS].intvalue, - align_view, &format_options)) != 0) - return status; - format_options->html = (Boolean) myargs[ARG_HTML].intvalue; - if (myargs[ARG_SHOWGIS].intvalue == 0) - { /* These are default in new api, so we turn off if not requested. */ - format_options->align_options -= TXALIGN_SHOW_GI; - format_options->print_options -= TXALIGN_SHOW_GI; - } - - if (dbname) { - BLAST_PrintOutputHeader(format_options, TRUE, program_name, dbname, - !db_is_na); - } - } - else - { /* tabular output requires raw FILE*. */ + BlastFormattingInfoNew(myargs[ARG_FORMAT].intvalue, options, + program_name, dbname, + myargs[ARG_OUT].strvalue, &format_info); + + /* Pass TRUE for the "is megablast" argument. Since megablast is always + gapped, pass FALSE for the "is ungapped" argument. */ + BlastFormattingInfoSetUpOptions(format_info, + myargs[ARG_DESCRIPTIONS].intvalue, + myargs[ARG_ALIGNMENTS].intvalue, + (Boolean) myargs[ARG_HTML].intvalue, + TRUE, + (Boolean) myargs[ARG_SHOWGIS].intvalue, + believe_query); + + if (dbname) + BLAST_PrintOutputHeader(format_info); + } else { /* tabular output requires raw FILE*. */ if ((outfp = FileOpen(myargs[ARG_OUT].strvalue, "w")) == NULL) { ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open output file %s\n", myargs[ARG_OUT].strvalue); return (1); } + /* If tabular output is requested, automatically set believe_query to + TRUE, to guarantee meaningful sequence ids in the output. */ believe_query = TRUE; } @@ -2027,29 +2035,33 @@ static Int2 Main_new(void) /* Format the results; note that seqalign and filter locations are freed inside. */ status = - BLAST_FormatResults(seqalign, myargs[ARG_DB].strvalue, - program_name, num_queries, query_slp, - filter_loc, format_options, FALSE, NULL, - sum_returns); + BLAST_FormatResults(seqalign, num_queries, query_slp, + filter_loc, format_info, sum_returns); seqalign = SeqAlignSetFree(seqalign); - Blast_PrintOutputFooter(options->program, format_options, dbname, - sum_returns); } + /* Update the cumulative summary returns structure and clean the returns + substructures for the current search iteration. */ + Blast_SummaryReturnUpdate(sum_returns, &full_sum_returns); Blast_SummaryReturnClean(sum_returns); filter_loc = Blast_ValNodeMaskListFree(filter_loc); query_slp = SeqLocSetFree(query_slp); } /* End loop on sets of queries */ - options = SBlastOptionsFree(options); + if (infp) + FileClose(infp); + + Blast_PrintOutputFooter(format_info, full_sum_returns); + sum_returns = Blast_SummaryReturnFree(sum_returns); + full_sum_returns = Blast_SummaryReturnFree(full_sum_returns); if (!tabular_output) { - if (align_view < 7 && myargs[ARG_LOGINFO].intvalue) - BlastPrintLogReport(format_options, num_queries_total); - format_options = BlastFormattingOptionsFree(format_options); + if (align_view < eAlignViewXml && myargs[ARG_LOGINFO].intvalue) + BlastPrintLogReport(format_info); + format_info = BlastFormattingInfoFree(format_info); } else { @@ -2059,9 +2071,8 @@ static Int2 Main_new(void) FileClose(outfp); } - if (infp) - FileClose(infp); - + options = SBlastOptionsFree(options); + return status; } #endif diff --git a/demo/nps2gps.c b/demo/nps2gps.c new file mode 100644 index 00000000..e2d9d292 --- /dev/null +++ b/demo/nps2gps.c @@ -0,0 +1,997 @@ +/* nps2gps.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: nps2gps.c +* +* Author: Jonathan Kans +* +* Version Creation Date: 5/12/05 +* +* $Revision: 1.3 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* ========================================================================== +*/ + +#include <ncbi.h> +#include <objall.h> +#include <objsset.h> +#include <objsub.h> +#include <objfdef.h> +#include <sqnutils.h> +#include <seqport.h> +#include <explore.h> +#include <subutil.h> +#include <toasn3.h> + +#define NPS2GPSAPP_VER "1.2" + +CharPtr NPS2GPSAPPLICATION = NPS2GPSAPP_VER; + +typedef struct n2gdata { + CharPtr results; + CharPtr outfile; + Boolean failure; +} N2GData, PNTR N2GPtr; + +typedef struct npsseqs { + BioseqPtr nuc; + BioseqPtr prot; +} NpsSeqs, PNTR NpsSeqsPtr; + +static void FindNucProtSeqs ( + BioseqPtr bsp, + Pointer userdata +) + +{ + NpsSeqsPtr nsp; + + if (bsp == NULL) return; + nsp = (NpsSeqsPtr) userdata; + if (nsp == NULL) return; + + if (ISA_na (bsp->mol)) { + nsp->nuc = bsp; + } else if (ISA_aa (bsp->mol)) { + nsp->prot = bsp; + } +} + +static void LclMakeNucProtCDS ( + BioseqSetPtr bssp, + Pointer userdata +) + +{ + CodeBreakPtr cbp; + SeqFeatPtr cds; + CdRegionPtr crp; + SeqFeatPtr mrna; + NpsSeqs ns; + Boolean partial5, partial3; + SeqFeatPtr sfp; + SeqIdPtr sip; + SeqLocPtr slp; + Int4 start, stop; + SeqFeatPtr temp; + + ns.nuc = NULL; + ns.prot = NULL; + if (VisitBioseqsInSet (bssp, (Pointer) &ns, FindNucProtSeqs) != 2) return; + if (ns.nuc == NULL || ns.prot == NULL) return; + + cds = SeqMgrGetCDSgivenProduct (ns.prot, NULL); + mrna = SeqMgrGetRNAgivenProduct (ns.nuc, NULL); + if (cds == NULL || mrna == NULL) return; + + CheckSeqLocForPartial (cds->location, &partial5, &partial3); + + start = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_START); + stop = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_STOP); + + if (start < 0 || start >= ns.nuc->length || + stop < 0 || stop >= ns.nuc->length) return; + + sip = SeqIdFindBest (ns.nuc->id, 0); + if (sip == NULL) return; + + /* copy cds feature fields to paste into new cds feature */ + temp = AsnIoMemCopy (cds, + (AsnReadFunc) SeqFeatAsnRead, + (AsnWriteFunc) SeqFeatAsnWrite); + if (temp == NULL) return; + + sfp = CreateNewFeatureOnBioseq (ns.nuc, SEQFEAT_CDREGION, NULL); + if (sfp == NULL) return; + + sfp->location = SeqLocFree (sfp->location); + if (StringISearch (cds->except_text, "ribosomal slippage") == NULL && + StringISearch (cds->except_text, "ribosome slippage") == NULL && + StringISearch (cds->except_text, "trans splicing") == NULL && + StringISearch (cds->except_text, "trans-splicing") == NULL && + StringISearch (cds->except_text, "artificial frameshift") == NULL) { + sfp->location = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3); + } else { + slp = SeqLocFindNext (cds->location, NULL); + while (slp != NULL) { + start = GetOffsetInLoc (slp, mrna->location, SEQLOC_START); + stop = GetOffsetInLoc (slp, mrna->location, SEQLOC_STOP); + sfp->location = AddIntervalToLocation (sfp->location, sip, start, + stop, partial5, partial3); + slp = SeqLocFindNext (cds->location, slp); + } + sfp->location = SeqLocMergeEx (ns.nuc, sfp->location, NULL, FALSE, TRUE, FALSE, FALSE); + } + SetSeqFeatProduct (sfp, ns.prot); + + /* paste fields from temp copy of original cds */ + crp = (CdRegionPtr) temp->data.value.ptrvalue; + sfp->data.value.ptrvalue = (Pointer) crp; + + sfp->partial = temp->partial; + sfp->excpt = temp->excpt; + sfp->comment = temp->comment; + sfp->qual = temp->qual; + sfp->title = temp->title; + sfp->ext = temp->ext; + sfp->cit = temp->cit; + sfp->exp_ev = temp->exp_ev; + sfp->xref = temp->xref; + sfp->dbxref = temp->dbxref; + sfp->pseudo = temp->pseudo; + sfp->except_text = temp->except_text; + + MemFree (temp); /* do not SeqFeatFree */ + + /* update code break locations */ + for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) { + CheckSeqLocForPartial (cbp->loc, &partial5, &partial3); + start = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_START); + stop = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_STOP); + if (start < 0 || start >= ns.nuc->length || + stop < 0 || stop >= ns.nuc->length) continue; + cbp->loc = SeqLocFree (cbp->loc); + cbp->loc = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3);; + } +} + +/* copy gene from contig onto nuc-prot, single interval on cdna bioseq */ + +static void LclCopyGene ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + BioseqPtr bsp; + SeqMgrFeatContext gcontext; + SeqFeatPtr gene, copy, temp; + GeneRefPtr grp, xref; + Boolean partial5, partial3; + + /* input mrna features are multi-interval on contig */ + + if (sfp->data.choice != SEQFEAT_RNA) return; + + /* find cdna product of mrna */ + + bsp = BioseqFindFromSeqLoc (sfp->product); + if (bsp == NULL) return; + + /* check for gene xref */ + + xref = SeqMgrGetGeneXref (sfp); + if (xref != NULL) { + if (SeqMgrGeneIsSuppressed (xref)) return; + + /* copy gene xref for new gene feature */ + + grp = AsnIoMemCopy (xref, + (AsnReadFunc) GeneRefAsnRead, + (AsnWriteFunc) GeneRefAsnWrite); + if (grp == NULL) return; + + /* make new gene feature on full-length of cdna */ + + copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL); + if (copy == NULL) return; + + copy->data.value.ptrvalue = grp; + return; + } + + /* overlapping gene should be single interval on contig */ + + gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext); + if (gene == NULL) return; + + CheckSeqLocForPartial (gene->location, &partial5, &partial3); + + /* copy gene feature fields to paste into new gene feature */ + + temp = AsnIoMemCopy (gene, + (AsnReadFunc) SeqFeatAsnRead, + (AsnWriteFunc) SeqFeatAsnWrite); + if (temp == NULL) return; + + /* make new gene feature on full-length of cdna */ + + copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL); + if (copy == NULL) { + SeqFeatFree (temp); + return; + } + + /* paste fields from temp copy of original gene */ + + copy->data.value.ptrvalue = temp->data.value.ptrvalue; + copy->partial = temp->partial; + copy->excpt = temp->excpt; + copy->comment = temp->comment; + copy->qual = temp->qual; + copy->title = temp->title; + copy->ext = temp->ext; + copy->cit = temp->cit; + copy->exp_ev = temp->exp_ev; + copy->xref = temp->xref; + copy->dbxref = temp->dbxref; + copy->pseudo = temp->pseudo; + copy->except_text = temp->except_text; + + SetSeqLocPartial (copy->location, partial5, partial3); + + SeqLocFree (temp->location); + MemFree (temp); /* do not SeqFeatFree */ +} + +static void LclAddMrnaTitles ( + SeqLocPtr slp, + Pointer userdata +) + +{ + BioseqPtr bsp; + SeqMgrFeatContext ccontext; + CharPtr cdslabel = NULL; + SeqMgrFeatContext gcontext; + CharPtr genelabel = NULL; + size_t len; + CharPtr organism; + SeqFeatPtr sfp; + CharPtr str; + + if (slp == NULL) return; + bsp = BioseqFindFromSeqLoc (slp); + if (bsp == NULL) return; + if (! ISA_na (bsp->mol)) return; + organism = (CharPtr) userdata; + if (BioseqGetTitle (bsp) != NULL) return; + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gcontext); + if (sfp != NULL) { + genelabel = gcontext.label; + if (StringHasNoText (genelabel)) { + genelabel = NULL; + } + } + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext); + if (sfp != NULL) { + cdslabel = ccontext.label; + if (StringHasNoText (cdslabel)) { + cdslabel = NULL; + } + } + len = StringLen (organism) + StringLen (genelabel) + StringLen (cdslabel) + + StringLen (" mRNA, complete cds.") + 10; + str = (CharPtr) MemNew (len * sizeof (Char)); + if (str == NULL) return; + str [0] = '\0'; + + if (! StringHasNoText (organism)) { + StringCat (str, organism); + } + if (cdslabel != NULL) { + StringCat (str, " "); + StringCat (str, cdslabel); + } + if (genelabel != NULL) { + StringCat (str, " ("); + StringCat (str, genelabel); + StringCat (str, ")"); + } + if (cdslabel != NULL && genelabel != NULL) { + if (ccontext.partialL || ccontext.partialR) { + StringCat (str, " mRNA, partial cds."); + } else { + StringCat (str, " mRNA, complete cds."); + } + } else if (genelabel != NULL) { + StringCat (str, " mRNA."); + } + SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); +} + +static Boolean LIBCALLBACK DummyACMProc ( + SeqFeatPtr sfp, + SeqMgrFeatContextPtr context +) + +{ + return TRUE; +} + +static Boolean AmbiguousCdsMrna (BioseqPtr bsp) + +{ + Int2 count; + SeqMgrFeatContext fcontext; + SeqFeatPtr sfp; + + if (bsp == NULL) return TRUE; + + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext); + while (sfp != NULL) { + count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0, + CHECK_INTERVALS, NULL, DummyACMProc); + if (count > 1) return TRUE; + sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext); + } + + return FALSE; +} + +static SeqIdPtr MakeIdFromLocusTag (SeqFeatPtr mrna) + +{ + Char buf [64], suffix [8]; + SeqFeatPtr gene; + GeneRefPtr grp; + + if (mrna == NULL) return NULL; + suffix [0] = '\0'; + grp = SeqMgrGetGeneXref (mrna); + if (grp != NULL) { + if (SeqMgrGeneIsSuppressed (grp)) return NULL; + } + if (grp == NULL) { + gene = SeqMgrGetOverlappingGene (mrna->location, NULL); + if (gene != NULL) { + grp = (GeneRefPtr) gene->data.value.ptrvalue; + if (gene->id.choice == 1) { + (gene->id.value.intvalue)++; + sprintf (suffix, "_%ld", (long) gene->id.value.intvalue); + } + } + } + if (grp != NULL) { + if (StringDoesHaveText (grp->locus_tag)) { + /* StringCpy (buf, "lcl|"); */ + StringCpy (buf, "gnl|MTRACK|"); + StringCat (buf, grp->locus_tag); + StringCat (buf, suffix); + return MakeSeqID (buf); + } + } + return NULL; +} + +static void InstantiateMrnaIntoProt (SeqFeatPtr cds, SeqFeatPtr mrna, Int2Ptr ctrp, Boolean ambig) + +{ + ByteStorePtr bs; + Int4 len; + BioseqPtr mbsp, pbsp; + MolInfoPtr mip; + SeqEntryPtr msep, psep; + Boolean partial5, partial3; + CharPtr rnaseq; + ValNodePtr vnp; + + if (cds == NULL || mrna == NULL) return; + if (cds->product == NULL || mrna->product != NULL) return; + + pbsp = BioseqFindFromSeqLoc (cds->product); + if (pbsp == NULL) return; + psep = SeqMgrGetSeqEntryForData (pbsp); + if (psep == NULL) return; + + rnaseq = GetSequenceByFeature (mrna); + if (rnaseq == NULL) return; + len = (Int4) StringLen (rnaseq); + + bs = BSNew (len + 2); + if (bs == NULL) return; + BSWrite (bs, (VoidPtr) rnaseq, len); + MemFree (rnaseq); + + mbsp = BioseqNew (); + if (mbsp == NULL) return; + mbsp->repr = Seq_repr_raw; + mbsp->mol = Seq_mol_rna; + mbsp->seq_data_type = Seq_code_iupacna; + mbsp->seq_data = bs; + mbsp->length = BSLen (bs); + BioseqPack (mbsp); + + /* + if (! ambig) { + mbsp->id = MakeIdFromLocusTag (mrna); + } + */ + /* now adds _# suffix to general Seq-id if ambiguous */ + mbsp->id = MakeIdFromLocusTag (mrna); + if (mbsp->id == NULL) { + mbsp->id = MakeNewProteinSeqIdEx (mrna->location, NULL, NULL, ctrp); + } + CheckSeqLocForPartial (mrna->location, &partial5, &partial3); + SeqMgrAddToBioseqIndex (mbsp); + + msep = SeqEntryNew (); + if (msep == NULL) return; + msep->choice = 1; + msep->data.ptrvalue = (Pointer) mbsp; + SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) mbsp, msep); + + mip = MolInfoNew (); + if (mip == NULL) return; + mip->biomol = MOLECULE_TYPE_MRNA; + if (partial5 && partial3) { + mip->completeness = 5; + } else if (partial5) { + mip->completeness = 3; + } else if (partial3) { + mip->completeness = 4; + } + vnp = CreateNewDescriptor (msep, Seq_descr_molinfo); + if (vnp != NULL) { + vnp->data.ptrvalue = (Pointer) mip; + } + + SetSeqFeatProduct (mrna, mbsp); + + /* add cDNA to protein, promoting to nuc-prot set component of genomic product set */ + + AddSeqEntryToSeqEntry (psep, msep, FALSE); +} + +static void RemoveFeatIDs (SeqFeatPtr sfp, Pointer userdata) + +{ + if (sfp == NULL) return; + if (sfp->id.choice == 1) { + sfp->id.choice = 0; + sfp->id.value.intvalue = 0; + } +} + +typedef struct loopdata { + Boolean ambig; + Int2 count; + SeqFeatPtr mrna; +} LoopData, PNTR LoopDataPtr; + +static Boolean LIBCALLBACK FindSingleMrnaProc ( + SeqFeatPtr sfp, + SeqMgrFeatContextPtr context +) + +{ + LoopDataPtr ldp; + + ldp = (LoopDataPtr) context->userdata; + + if (sfp->id.choice == 0) { + (ldp->count)++; + ldp->mrna = sfp; + } else { + ldp->ambig = TRUE; + } + + return TRUE; +} + +static void LoopThroughCDSs (BioseqPtr bsp, Int2Ptr ctrp, N2GPtr ngp) + +{ + Int2 count; + SeqMgrFeatContext fcontext; + Boolean goOn; + LoopData ld; + SeqFeatPtr sfp; + + /* loop through CDS features, finding single unused mRNA partner */ + + goOn = TRUE; + while (goOn) { + goOn = FALSE; + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext); + while (sfp != NULL) { + if (sfp->id.choice == 0) { + ld.ambig = FALSE; + ld.count = 0; + ld.mrna = NULL; + count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0, + CHECK_INTERVALS, (Pointer) &ld, FindSingleMrnaProc); + if (ld.count == 1 && ld.mrna != NULL) { + InstantiateMrnaIntoProt (sfp, ld.mrna, ctrp, ld.ambig); + sfp->id.choice = 1; + ld.mrna->id.choice = 1; + goOn = TRUE; + } + } + sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext); + } + } + + /* if any CDSs were not promoted, record failure */ + + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext); + while (sfp != NULL) { + if (sfp->id.choice == 0) { + if (ngp != NULL) { + ngp->failure = TRUE; + } + } + sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext); + } +} + +static void NPStoGPS (Uint2 entityID, CharPtr filename, N2GPtr ngp) + +{ + BioSourcePtr biop; + BioseqPtr bsp; + BioseqSetPtr bssp; + Int2 ctr = 1; + SeqMgrFeatContext gcontext, mcontext; + SeqFeatPtr gene, mrna, sfp, lastsfp; + GeneRefPtr grp; + SeqEntryPtr old, top; + CharPtr organism; + OrgRefPtr orp; + Uint2 parenttype; + Pointer parentptr; + SeqAnnotPtr sap, annot, lastsap; + SeqDescrPtr sdp; + + top = GetTopSeqEntryForEntityID (entityID); + if (top == NULL || top->choice != 2) return; + bssp = (BioseqSetPtr) top->data.ptrvalue; + if (bssp == NULL || bssp->_class != BioseqseqSet_class_nuc_prot) return; + + if (StringHasNoText (filename)) { + filename = "?"; + } + + bsp = FindNucBioseq (top); + if (bsp == NULL) { + Message (MSG_OK, "Unable to find nucleotide Bioseq in %s", filename); + if (ngp != NULL) { + ngp->failure = TRUE; + } + return; + } + + /* + if (AmbiguousCdsMrna (bsp)) { + Message (MSG_POSTERR, + "Unable to proceed because of unresolved CDS-mRNA ambiguity in %s", + filename); + if (ngp != NULL) { + ngp->failure = TRUE; + } + return; + } + */ + + GetSeqEntryParent (top, &parentptr, &parenttype); + + bssp->_class = BioseqseqSet_class_gen_prod_set; + + /* move feature table from nuc-prot set to nucleotide bioseq */ + + sap = bssp->annot; + bssp->annot = NULL; + annot = bsp->annot; + if (annot == NULL) { + bsp->annot = sap; + } else if (sap->next == NULL && annot->next == NULL && + sap->type == 1 && annot->type == 1 && + sap->data != NULL && annot->data != NULL) { + lastsfp = (SeqFeatPtr) annot->data; + while (lastsfp->next != NULL) { + lastsfp = lastsfp->next; + } + sfp = (SeqFeatPtr) sap->data; + lastsfp->next = sfp; + sap->data = NULL; + SeqAnnotFree (sap); + } else { + lastsap = bsp->annot; + while (lastsap->next != NULL) { + lastsap = lastsap->next; + } + lastsap->next = sap; + } + + VisitFeaturesInSep (top, NULL, RemoveFeatIDs); + + old = SeqEntrySetScope (top); + + ctr = (Int2) VisitBioseqsInSep (top, NULL, NULL) + 1; + + /* count mRNAs per overlapping gene */ + + mrna = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &mcontext); + while (mrna != NULL) { + grp = SeqMgrGetGeneXref (mrna); + if (grp == NULL) { + gene = SeqMgrGetOverlappingGene (mrna->location, NULL); + if (gene != NULL) { + if (gene->id.choice == 0) { + gene->id.choice = 1; + gene->id.value.intvalue = 1; + } else if (gene->id.choice == 1) { + (gene->id.value.intvalue)++; + } + } + } + mrna = SeqMgrGetNextFeature (bsp, mrna, 0, FEATDEF_mRNA, &mcontext); + } + + /* only leave genes with multiple mRNAs marked */ + + gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gcontext); + while (gene != NULL) { + if (gene->id.choice == 1) { + if (gene->id.value.intvalue == 1) { + /* if count was 1, clear id */ + gene->id.value.intvalue = 0; + gene->id.choice = 0; + } else { + /* if count was > 1, just reset count, do not clear id */ + gene->id.value.intvalue = 0; + } + } + gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &gcontext); + } + + /* + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext); + while (sfp != NULL) { + mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, + NULL, CHECK_INTERVALS, &mcontext); + if (mrna != NULL) { + InstantiateMrnaIntoProt (sfp, mrna, &ctr); + } + sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext); + } + */ + + /* make cDNA bioseq from mRNA feature, package with protein product */ + + LoopThroughCDSs (bsp, &ctr, ngp); + + SeqMgrLinkSeqEntry (top, parenttype, parentptr); + + SeqEntrySetScope (old); + + SeqMgrClearFeatureIndexes (bssp->idx.entityID, NULL); + + VisitFeaturesInSep (top, NULL, RemoveFeatIDs); + + /* need to reindex to get mRNA and CDS features from cDNA and protein */ + SeqMgrIndexFeatures (bssp->idx.entityID, NULL); + VisitSetsInSet (bssp, NULL, LclMakeNucProtCDS); + + /* need to reindex before copying genes, instantiating protein titles */ + SeqMgrIndexFeatures (bssp->idx.entityID, NULL); + + VisitFeaturesInSep (top, NULL, LclCopyGene); + + /* need to reindex before instantiating mRNA titles */ + SeqMgrIndexFeatures (bssp->idx.entityID, NULL); + + organism = NULL; + for (sdp = bssp->descr; sdp != NULL; sdp = sdp->next) { + if (sdp->choice != Seq_descr_source) continue; + biop = (BioSourcePtr) sdp->data.ptrvalue; + if (biop == NULL) continue; + orp = biop->org; + if (orp == NULL) continue; + if (! StringHasNoText (orp->taxname)) { + organism = orp->taxname; + } + } + + sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &mcontext); + while (sfp != NULL) { + LclAddMrnaTitles (sfp->product, organism); + sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_mRNA, &mcontext); + } + + SeqMgrClearFeatureIndexes (bssp->idx.entityID, NULL); + + move_cds (top); +} + +static void ProcessOneRecord ( + CharPtr filename, + Pointer userdata +) + +{ + AsnIoPtr aip; + BioseqPtr bsp; + ValNodePtr bsplist; + BioseqSetPtr bssp; + Pointer dataptr = NULL; + Uint2 datatype, entityID = 0; + SeqDescrPtr descr1, descr2; + Char file [FILENAME_MAX], id [42], path [PATH_MAX]; + FILE *fp; + N2GPtr ngp; + SeqEntryPtr nsep, sep; + CharPtr ptr, str; + SeqIdPtr sip; + + if (StringHasNoText (filename)) return; + ngp = (N2GPtr) userdata; + if (ngp == NULL) return; + + fp = FileOpen (filename, "r"); + if (fp == NULL) { + Message (MSG_POSTERR, "Failed to open '%s'", filename); + return; + } + + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE); + + FileClose (fp); + + entityID = ObjMgrRegister (datatype, dataptr); + + if (entityID < 1 || dataptr == NULL) { + Message (MSG_POSTERR, "Data read failed for input file '%s'", filename); + return; + } + + if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY || + datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) { + + sep = GetTopSeqEntryForEntityID (entityID); + + if (sep == NULL) { + sep = SeqEntryNew (); + if (sep != NULL) { + if (datatype == OBJ_BIOSEQ) { + bsp = (BioseqPtr) dataptr; + sep->choice = 1; + sep->data.ptrvalue = bsp; + SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep); + } else if (datatype == OBJ_BIOSEQSET) { + bssp = (BioseqSetPtr) dataptr; + sep->choice = 2; + sep->data.ptrvalue = bssp; + SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep); + } else { + sep = SeqEntryFree (sep); + } + } + sep = GetTopSeqEntryForEntityID (entityID); + } + + if (sep != NULL) { + bsplist = NULL; + + str = StringRChr (filename, DIRDELIMCHR); + if (str != NULL) { + str++; + } else { + str = filename; + } + StringCpy (file, str); + + bsp = FindNucBioseq (sep); + id [0] = '\0'; + if (bsp != NULL) { + sip = SeqIdFindWorst (bsp->id); + SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id)); + Message (MSG_POST, "Processing %s (%s)", id, file); + } + + SeqMgrIndexFeatures (entityID, NULL); + + /* remove pubs and biosources from nuc-prot set and nucleotide bioseq */ + + nsep = FindNucSeqEntry (sep); + descr1 = ExtractBioSourceAndPubs (sep); + descr2 = ExtractBioSourceAndPubs (nsep); + if (descr1 == NULL) { + descr1 = descr2; + } else if (descr2 != NULL) { + ValNodeLink (&descr1, descr2); + } + + /* convert from nuc-prot set to genomic-product set */ + + NPStoGPS (entityID, file, ngp); + + /* put pubs and biosources onto genomic-product set */ + + ReplaceBioSourceAndPubs (sep, descr1); + + if (ngp->failure) { + Message (MSG_POST, "Not saving %s", file); + } else if (StringDoesHaveText (ngp->outfile)) { + aip = AsnIoOpen (ngp->outfile, "w"); + if (aip != NULL) { + SeqEntryAsnWrite (sep, aip, NULL); + AsnIoClose (aip); + } + } else { + str = StringRChr (filename, DIRDELIMCHR); + if (str != NULL) { + *str = '\0'; + str++; + } else { + str = filename; + } + if (StringDoesHaveText (ngp->results)) { + StringCpy (path, ngp->results); + } else { + StringCpy (path, filename); + } + if (str != NULL) { + ptr = StringRChr (str, '.'); + if (ptr != NULL) { + *ptr = '\0'; + } + StringCpy (file, str); + StringCat (file, ".gps"); + FileBuildPath (path, NULL, file); + aip = AsnIoOpen (path, "w"); + if (aip != NULL) { + SeqEntryAsnWrite (sep, aip, NULL); + AsnIoClose (aip); + } + } + } + } + + } else { + + Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype); + } + + ObjMgrFree (datatype, dataptr); +} + +/* Args structure contains command-line arguments */ + +#define p_argInputPath 0 +#define r_argOutputPath 1 +#define i_argInputFile 2 +#define o_argOutputFile 3 +#define f_argFilter 4 +#define x_argSuffix 5 + + +Args myargs [] = { + {"Path to Files", NULL, NULL, NULL, + TRUE, 'p', ARG_STRING, 0.0, 0, NULL}, + {"Path for Results", NULL, NULL, NULL, + TRUE, 'r', ARG_STRING, 0.0, 0, NULL}, + {"Single Input File", "stdin", NULL, NULL, + TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, + {"Single Output File", "stdout", NULL, NULL, + TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, + {"Substring Filter", NULL, NULL, NULL, + TRUE, 'f', ARG_STRING, 0.0, 0, NULL}, + {"File Selection Suffix", ".ent", NULL, NULL, + TRUE, 'x', ARG_STRING, 0.0, 0, NULL}, +}; + +Int2 Main (void) + +{ + Char app [64]; + CharPtr directory, filter, infile, outfile, results, suffix; + N2GData ngd; + N2GPtr ngp; + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + ErrSetLogfile ("stderr", ELOG_APPEND); + UseLocalAsnloadDataAndErrMsg (); + ErrPathReset (); + + if (! AllObjLoad ()) { + Message (MSG_FATAL, "AllObjLoad failed"); + return 1; + } + if (! SubmitAsnLoad ()) { + Message (MSG_FATAL, "SubmitAsnLoad failed"); + return 1; + } + if (! FeatDefSetLoad ()) { + Message (MSG_FATAL, "FeatDefSetLoad failed"); + return 1; + } + if (! SeqCodeSetLoad ()) { + Message (MSG_FATAL, "SeqCodeSetLoad failed"); + return 1; + } + if (! GeneticCodeTableLoad ()) { + Message (MSG_FATAL, "GeneticCodeTableLoad failed"); + return 1; + } + + /* process command line arguments */ + + sprintf (app, "nps2gps %s", NPS2GPSAPPLICATION); + if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + MemSet ((Pointer) &ngd, 0, sizeof (N2GData)); + ngp = &ngd; + ngd.failure = FALSE; + + directory = (CharPtr) myargs [p_argInputPath].strvalue; + results = (CharPtr) myargs [r_argOutputPath].strvalue; + if (StringHasNoText (results)) { + results = NULL; + } + infile = (CharPtr) myargs [i_argInputFile].strvalue; + outfile = (CharPtr) myargs [o_argOutputFile].strvalue; + filter = (CharPtr) myargs [f_argFilter].strvalue; + suffix = (CharPtr) myargs [x_argSuffix].strvalue; + + /* process input file */ + + if (StringDoesHaveText (directory)) { + + ngd.results = results; + + DirExplore (directory, filter, suffix, ProcessOneRecord, (Pointer) &ngd); + + } else if (StringDoesHaveText (infile) && StringDoesHaveText (outfile)) { + + ngd.outfile = outfile; + + ProcessOneRecord (infile, (Pointer) &ngd); + } + + if (ngd.failure) return 1; + + return 0; +} + diff --git a/demo/rpsblast.c b/demo/rpsblast.c index 07b1bb59..4e134e67 100644 --- a/demo/rpsblast.c +++ b/demo/rpsblast.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: rpsblast.c,v 6.67 2005/04/27 14:55:09 papadopo Exp $"; +static char const rcsid[] = "$Id: rpsblast.c,v 6.70 2005/06/02 20:43:38 dondosha Exp $"; -/* $Id: rpsblast.c,v 6.67 2005/04/27 14:55:09 papadopo Exp $ +/* $Id: rpsblast.c,v 6.70 2005/06/02 20:43:38 dondosha Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -31,12 +31,23 @@ static char const rcsid[] = "$Id: rpsblast.c,v 6.67 2005/04/27 14:55:09 papadopo * * Initial Version Creation Date: 12/14/1999 * -* $Revision: 6.67 $ +* $Revision: 6.70 $ * * File Description: * Main file for RPS BLAST program * * $Log: rpsblast.c,v $ +* Revision 6.70 2005/06/02 20:43:38 dondosha +* 1. Added loop over concatenated query sets; +* 2. Removed unused variables and other clean-up; +* 3. Use BlastFormattingInfo structure for formatting. +* +* Revision 6.69 2005/05/20 18:57:51 camacho +* Update to use new signature to BLAST_FillLookupTableOptions +* +* Revision 6.68 2005/05/02 17:00:28 coulouri +* change default to new engine +* * Revision 6.67 2005/04/27 14:55:09 papadopo * change signature of BlastFillHitSavingOptions * @@ -260,7 +271,7 @@ static char const rcsid[] = "$Id: rpsblast.c,v 6.67 2005/04/27 14:55:09 papadopo #include <algo/blast/api/blast_api.h> #include <algo/blast/api/blast_seq.h> -#if PURIFY +#ifdef PURIFY #include "/am/purew/solaris2/new/../purify/purify-4.5-solaris2/purify.h" #endif @@ -378,8 +389,8 @@ static Args myargs[] = { " 0 in 'stop' refers to the end of the sequence", "0,0", NULL, NULL, TRUE, 'L', ARG_STRING, 0.0, 0, NULL}, /* OPT_FORCE_OLD_ENGINE */ - { "Force use of the original BLAST engine", - "T", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL}, + { "Force use of the legacy BLAST engine", + "F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL}, }; @@ -389,7 +400,7 @@ static Args myargs[] = { in blast_driver.c; unneeded setup is removed */ static Int2 -BLAST_FillOptions(SBlastOptions* options) +s_FillOptions(SBlastOptions* options) { Int2 status; EBlastProgramType program_number = options->program; @@ -406,8 +417,7 @@ BLAST_FillOptions(SBlastOptions* options) FALSE, /* megablast */ 0, /* default threshold */ 0, /* default wordsize */ - FALSE, /* no variable wordsize */ - FALSE);/* no PSSM (i.e. not psiblast) */ + FALSE);/* no variable wordsize */ BLAST_FillQuerySetUpOptions(query_setup_options, program_number, myargs[OPT_FILTER].strvalue, 0); @@ -485,32 +495,24 @@ Int2 Main_New(void) { Int2 status; Boolean query_is_na; - LookupTableOptions* lookup_options; char* blast_program = NULL; EBlastProgramType program_number; - BlastInitialWordOptions* word_options; - BlastScoringOptions* score_options; - BlastExtensionOptions* ext_options; - BlastHitSavingOptions* hit_options; char* dbname = NULL; FILE *infp; Int2 ctr = 1; Int4 letters_read; Int4 query_from = 0, query_to = 0; SBlastOptions* options = NULL; - QuerySetUpOptions* query_options=NULL; - BlastEffectiveLengthsOptions* eff_len_options=NULL; SeqLoc* query_slp = NULL; SeqAlign* seqalign = NULL; - BlastFormattingOptions* format_options; - PSIBlastOptions* psi_options = NULL; - BlastDatabaseOptions* db_options = NULL; + BlastFormattingInfo* format_info = NULL; Blast_SummaryReturn* sum_returns=Blast_SummaryReturnNew(); - Boolean believe_defline = FALSE; Int4 num_queries = 0; SeqLoc* lcase_mask = NULL; SeqLoc* filter_loc = NULL; Boolean mask_at_hash; + const int kMaxConcatLength = 40000; + Blast_SummaryReturn* full_sum_returns = NULL; /* select protein or nucleotide query, and choose the appropriate program type */ @@ -537,32 +539,9 @@ Int2 Main_New(void) dbname = myargs[OPT_DB].strvalue; - BLAST_FillOptions(options); + s_FillOptions(options); program_number = options->program; - if ((status = BlastFormattingOptionsNew(program_number, - myargs[OPT_OUTPUT_FILE].strvalue, - myargs[OPT_NUM_DESC].intvalue, - myargs[OPT_NUM_RESULTS].intvalue, - myargs[OPT_FORMAT].intvalue, &format_options)) != 0) { - ErrPostEx(SEV_FATAL, 1, 0, "Formatting setup failed"); - } - - if (myargs[OPT_HTML].intvalue) { - format_options->html = TRUE; - format_options->align_options += TXALIGN_HTML; - format_options->print_options += TXALIGN_HTML; - } - if (myargs[OPT_SHOW_GI].intvalue) { - format_options->align_options += TXALIGN_SHOW_GI; - format_options->print_options += TXALIGN_SHOW_GI; - } - - BLAST_PrintOutputHeader(format_options, FALSE, blast_program, - dbname, TRUE); - - believe_defline = myargs[OPT_BELIEVE_QUERY].intvalue; - /* Get the query */ if ((infp = FileOpen(myargs[OPT_QUERY_FILE].strvalue, "r")) == NULL) { @@ -576,88 +555,116 @@ Int2 Main_New(void) ErrPostEx(SEV_FATAL, 1, 0, "Invalid subject sequence location\n"); return -1; } + if ((query_from != 0 || query_to != 0) && (Boolean)myargs[OPT_PROT_QUERY].intvalue == FALSE) { ErrPostEx(SEV_FATAL, 1, 0, "No query range allowed for nucleotide query"); return -1; } - if ((Boolean)myargs[OPT_LCASE].intvalue) { - letters_read = BLAST_GetQuerySeqLoc(infp, query_is_na, 0, 0, - query_from, query_to, &lcase_mask, - &query_slp, &ctr, &num_queries, - believe_defline); - } else { - letters_read = BLAST_GetQuerySeqLoc(infp, query_is_na, 0, 0, - query_from, query_to, NULL, - &query_slp, &ctr, &num_queries, - believe_defline); - } - if (letters_read <= 0) { - ErrPostEx(SEV_FATAL, 1, 0, "Failure reading query\n"); - return -1; - } - - /* Call database search function. Pass NULL for tabular formatting structure - * pointer, because on-the-fly tabular formatting is not allowed for RPS - * BLAST. - */ - status = - Blast_DatabaseSearch(query_slp, dbname, lcase_mask, options, NULL, - &seqalign, &filter_loc, &mask_at_hash, sum_returns); - - /* Free the lower case mask in SeqLoc form. */ - lcase_mask = Blast_ValNodeMaskListFree(lcase_mask); - - /* If masking was done for lookup table only, free the masking locations, - because they will not be used for formatting. */ - if (mask_at_hash) + BlastFormattingInfoNew(myargs[OPT_FORMAT].intvalue, options, blast_program, + dbname, myargs[OPT_OUTPUT_FILE].strvalue, + &format_info); + + /* This is not megablast, so pass FALSE for the respective arguments. */ + BlastFormattingInfoSetUpOptions(format_info, myargs[OPT_NUM_DESC].intvalue, + myargs[OPT_NUM_RESULTS].intvalue, + (Boolean) myargs[OPT_HTML].intvalue, + FALSE, (Boolean) myargs[OPT_SHOW_GI].intvalue, + (Boolean) myargs[OPT_BELIEVE_QUERY].intvalue); + + BLAST_PrintOutputHeader(format_info); + + /* Loop over sets of queries. */ + while (1) { + + if ((Boolean)myargs[OPT_LCASE].intvalue) { + letters_read = + BLAST_GetQuerySeqLoc(infp, query_is_na, 0, kMaxConcatLength, + query_from, query_to, &lcase_mask, + &query_slp, &ctr, &num_queries, + myargs[OPT_BELIEVE_QUERY].intvalue); + } else { + letters_read = + BLAST_GetQuerySeqLoc(infp, query_is_na, 0, kMaxConcatLength, + query_from, query_to, NULL, + &query_slp, &ctr, &num_queries, + myargs[OPT_BELIEVE_QUERY].intvalue); + } + + if (letters_read <= 0) + break; + + /* Call database search function. Pass NULL for tabular formatting + * structure pointer, because on-the-fly tabular formatting is not + * allowed for RPS BLAST. + */ + status = + Blast_DatabaseSearch(query_slp, dbname, lcase_mask, options, NULL, + &seqalign, &filter_loc, &mask_at_hash, + sum_returns); + + /* Free the lower case mask in SeqLoc form. */ + lcase_mask = Blast_ValNodeMaskListFree(lcase_mask); + + /* If masking was done for lookup table only, free the masking locations, + because they will not be used for formatting. */ + if (mask_at_hash) + filter_loc = Blast_ValNodeMaskListFree(filter_loc); + + /* Post warning or error messages, no matter what the search status + was. */ + Blast_SummaryReturnsPostError(sum_returns); + + if (status != 0) { + ErrPostEx(SEV_FATAL, 1, 0, "BLAST search failed"); + return status; + } + + /* do initial cleanup */ + + /* format results */ + + if (myargs[OPT_ASNOUT].strvalue) { + AsnIoPtr asnout = AsnIoOpen(myargs[OPT_ASNOUT].strvalue, (char*)"w"); + GenericSeqAlignSetAsnWrite(seqalign, asnout); + asnout = AsnIoClose(asnout); + } + + status = + BLAST_FormatResults(seqalign, num_queries, query_slp, filter_loc, + format_info, sum_returns); + + /* finish cleanup */ filter_loc = Blast_ValNodeMaskListFree(filter_loc); - - /* Post warning or error messages, no matter what the search status was. */ - Blast_SummaryReturnsPostError(sum_returns); - - if (status != 0) { - ErrPostEx(SEV_FATAL, 1, 0, "BLAST search failed"); - return status; - } - - /* do initial cleanup */ - - /* format results */ - - if (myargs[OPT_ASNOUT].strvalue) { - AsnIoPtr asnout = AsnIoOpen(myargs[OPT_ASNOUT].strvalue, (char*)"w"); - GenericSeqAlignSetAsnWrite(seqalign, asnout); - asnout = AsnIoClose(asnout); + seqalign = SeqAlignSetFree(seqalign); + query_slp = SeqLocSetFree(query_slp); + + /* Update the cumulative summary returns structure and clean the returns + substructures for the current search iteration. */ + Blast_SummaryReturnUpdate(sum_returns, &full_sum_returns); + Blast_SummaryReturnClean(sum_returns); } - - status = BLAST_FormatResults(seqalign, dbname, blast_program, - num_queries, query_slp, - filter_loc, format_options, - FALSE, NULL, sum_returns); - - /* finish cleanup */ - filter_loc = Blast_ValNodeMaskListFree(filter_loc); - seqalign = SeqAlignSetFree(seqalign); - query_slp = SeqLocSetFree(query_slp); - options = SBlastOptionsFree(options); - /* Print the footer with summary information */ - Blast_PrintOutputFooter(program_number, format_options, dbname, sum_returns); + if (infp) + FileClose(infp); + + /* Print the footer with summary information. */ + Blast_PrintOutputFooter(format_info, full_sum_returns); sum_returns = Blast_SummaryReturnFree(sum_returns); + full_sum_returns = Blast_SummaryReturnFree(full_sum_returns); - if(format_options->html && myargs[OPT_FORMAT].intvalue < 7) { - fprintf(format_options->outfp, "</PRE>\n</BODY>\n</HTML>\n"); - } - BlastFormattingOptionsFree(format_options); + format_info = BlastFormattingInfoFree(format_info); + options = SBlastOptionsFree(options); + return status; } /*------------------- RPS BLAST USING OLD ENGINE ------------------------*/ -void PGPGetPrintOptions(Boolean gapped, Uint4Ptr align_options_out, +static void +s_PGPGetPrintOptions(Boolean gapped, Uint4Ptr align_options_out, Uint4Ptr print_options_out) { Uint4 print_options, align_options; @@ -708,7 +715,8 @@ void PGPGetPrintOptions(Boolean gapped, Uint4Ptr align_options_out, return; } -void RPSBlastOptionsFree(RPSBlastOptionsPtr rpsbop) +static void +s_RPSBlastOptionsFree(RPSBlastOptionsPtr rpsbop) { FileClose(rpsbop->outfp); @@ -723,11 +731,11 @@ void RPSBlastOptionsFree(RPSBlastOptionsPtr rpsbop) return; } -static RPSBlastOptionsPtr RPSReadBlastOptions(void) +static RPSBlastOptionsPtr +s_RPSReadBlastOptions(void) { RPSBlastOptionsPtr rpsbop; BLAST_OptionsBlkPtr options; - static Int4 count=0; CharPtr location = NULL; rpsbop = MemNew(sizeof(RPSBlastOptions)); @@ -791,8 +799,8 @@ static RPSBlastOptionsPtr RPSReadBlastOptions(void) rpsbop->asn1_mode = StringSave("wb"); } else - PGPGetPrintOptions(options->gapped_calculation, - &rpsbop->align_options, &rpsbop->print_options); + s_PGPGetPrintOptions(options->gapped_calculation, + &rpsbop->align_options, &rpsbop->print_options); if (!rpsbop->is_xml_output && @@ -882,9 +890,9 @@ static RPSBlastOptionsPtr RPSReadBlastOptions(void) return rpsbop; } -static void RPSViewSeqAlign(BioseqPtr query_bsp, - SeqAlignPtr seqalign, RPSBlastOptionsPtr rpsbop, - ValNodePtr other_returns) +static void +s_RPSViewSeqAlign(BioseqPtr query_bsp, SeqAlignPtr seqalign, + RPSBlastOptionsPtr rpsbop, ValNodePtr other_returns) { SeqAnnotPtr seqannot; BlastPruneSapStructPtr prune; @@ -978,7 +986,8 @@ static void RPSViewSeqAlign(BioseqPtr query_bsp, return; } -Boolean RPSFormatFooter(RPSBlastOptionsPtr rpsbop, ValNodePtr other_returns) +static Boolean +s_RPSFormatFooter(RPSBlastOptionsPtr rpsbop, ValNodePtr other_returns) { ValNodePtr mask_loc, vnp; BLAST_KarlinBlkPtr ka_params=NULL, ka_params_gap=NULL; @@ -1044,7 +1053,8 @@ Boolean RPSFormatFooter(RPSBlastOptionsPtr rpsbop, ValNodePtr other_returns) return TRUE; } -static SeqEntryPtr LIBCALLBACK RPSGetNextSeqEntry(SeqLocPtr PNTR slp, VoidPtr data) +static SeqEntryPtr LIBCALLBACK +s_RPSGetNextSeqEntry(SeqLocPtr PNTR slp, VoidPtr data) { SeqEntryPtr sep; static TNlmMutex read_mutex; @@ -1091,11 +1101,10 @@ static SeqEntryPtr LIBCALLBACK RPSGetNextSeqEntry(SeqLocPtr PNTR slp, VoidPtr da return sep; } -static Boolean LIBCALLBACK RPSResultsCallback(BioseqPtr query_bsp, - RPSBlastOptionsPtr rpsbop, - SeqAlignPtr seqalign, - ValNodePtr other_returns, - ValNodePtr error_returns, VoidPtr data) +static Boolean LIBCALLBACK +s_RPSResultsCallback(BioseqPtr query_bsp, RPSBlastOptionsPtr rpsbop, + SeqAlignPtr seqalign, ValNodePtr other_returns, + ValNodePtr error_returns, VoidPtr data) { if(rpsbop->is_xml_output == TRUE) { AsnIoPtr aip = AsnIoOpen(rpsbop->out_filename, "wx"); @@ -1116,8 +1125,8 @@ static Boolean LIBCALLBACK RPSResultsCallback(BioseqPtr query_bsp, rpsbop->believe_query, 0, 0, rpsbop->outfp, FALSE); } else { - RPSViewSeqAlign(query_bsp, seqalign, rpsbop, other_returns); - RPSFormatFooter(rpsbop, other_returns); + s_RPSViewSeqAlign(query_bsp, seqalign, rpsbop, other_returns); + s_RPSFormatFooter(rpsbop, other_returns); } return TRUE; @@ -1132,7 +1141,7 @@ Int2 Main_Old(void) if((fd = FileOpen(myargs[OPT_OUTPUT_FILE].strvalue, "w")) != NULL) FileClose(fd); - if((rpsbop = RPSReadBlastOptions()) == NULL) { + if((rpsbop = s_RPSReadBlastOptions()) == NULL) { ErrPostEx(SEV_FATAL, 1, 0, "Unable to create RPS Blast options"); return 1; } @@ -1140,9 +1149,11 @@ Int2 Main_Old(void) if (!rpsbop->is_xml_output && !rpsbop->is_tabular && !rpsbop->is_asn1_output) BlastPrintVersionInfo("RPS-BLAST", rpsbop->html, rpsbop->outfp); - RPSBlastSearchMT(rpsbop, RPSGetNextSeqEntry, NULL, - RPSResultsCallback, NULL); - RPSBlastOptionsFree(rpsbop); + RPSBlastSearchMT(rpsbop, s_RPSGetNextSeqEntry, NULL, + s_RPSResultsCallback, NULL); + s_RPSBlastOptionsFree(rpsbop); + + return 0; } /*-------------------------- TOP LEVEL DRIVER -------------------------*/ diff --git a/demo/tbl2asn.c b/demo/tbl2asn.c index b2164a6e..a5fa5057 100644 --- a/demo/tbl2asn.c +++ b/demo/tbl2asn.c @@ -29,7 +29,7 @@ * * Version Creation Date: 5/5/00 * -* $Revision: 6.107 $ +* $Revision: 6.111 $ * * File Description: * @@ -64,7 +64,7 @@ #include <pmfapi.h> #include <tax3api.h> -#define TBL2ASN_APP_VER "3.6" +#define TBL2ASN_APP_VER "3.8" CharPtr TBL2ASN_APPLICATION = TBL2ASN_APP_VER; @@ -1235,7 +1235,8 @@ static void ProcessOneNuc ( Uint2 entityID, BioseqPtr bsp, BioSourcePtr src, - TblArgsPtr tbl + TblArgsPtr tbl, + MolInfoPtr template_molinfo ) { @@ -1306,6 +1307,10 @@ static void ProcessOneNuc ( if (stp != NULL) { mip = ParseTitleIntoMolInfo (stp, mip); } + if (mip->biomol == 0 && template_molinfo != NULL) + { + mip->biomol = template_molinfo->biomol; + } if (mip->biomol == 0) { mip->biomol = MOLECULE_TYPE_GENOMIC; } @@ -1590,7 +1595,8 @@ static Uint2 ProcessOneAsn ( BioSourcePtr src, TblArgsPtr tbl, CharPtr localname, - SeqEntryPtr gsep + SeqEntryPtr gsep, + MolInfoPtr template_molinfo ) { @@ -1621,6 +1627,14 @@ static Uint2 ProcessOneAsn ( } else if (datatype == OBJ_BIOSEQSET) { bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr); SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) dataptr, gsep); + } else if (datatype == OBJ_SEQENTRY) { + sep = (SeqEntryPtr) dataptr; + bssp->seq_set = sep; + if (IS_Bioseq (sep)) { + SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) sep->data.ptrvalue, gsep); + } else if (IS_Bioseq_set (sep)) { + SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) sep->data.ptrvalue, gsep); + } else return 0; } else return 0; SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); @@ -1649,7 +1663,7 @@ static Uint2 ProcessOneAsn ( } } - ProcessOneNuc (entityID, bsp, src, tbl); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); return entityID; } @@ -1657,7 +1671,8 @@ static Uint2 ProcessOneAsn ( static Uint2 ProcessBulkSet ( FILE* fp, BioSourcePtr src, - TblArgsPtr tbl + TblArgsPtr tbl, + MolInfoPtr template_molinfo ) { @@ -1733,7 +1748,7 @@ while ((bsp = ReadDeltaFasta (fp, NULL)) != NULL) { } lastsep = sep; - ProcessOneNuc (entityID, bsp, src, tbl); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); } SeqMgrLinkSeqEntry (topsep, 0, NULL); @@ -1776,7 +1791,8 @@ static Uint2 ProcessDeltaSet ( BioSourcePtr src, TblArgsPtr tbl, CharPtr localname, - SeqEntryPtr gsep + SeqEntryPtr gsep, + MolInfoPtr template_molinfo ) { @@ -1847,7 +1863,7 @@ static Uint2 ProcessDeltaSet ( entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp); } - ProcessOneNuc (entityID, bsp, src, tbl); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); return entityID; } @@ -1962,7 +1978,7 @@ static Uint2 ProcessDeltaSet ( } } - ProcessOneNuc (entityID, deltabsp, src, tbl); + ProcessOneNuc (entityID, deltabsp, src, tbl, template_molinfo); return entityID; } @@ -2029,7 +2045,8 @@ static void ShowAlignmentNotes static Uint2 ProcessAlignSet ( FILE *fp, BioSourcePtr src, - TblArgsPtr tbl + TblArgsPtr tbl, + MolInfoPtr template_molinfo ) { @@ -2129,7 +2146,7 @@ static Uint2 ProcessAlignSet ( if (IS_Bioseq (sep)) { bsp = (BioseqPtr) sep->data.ptrvalue; entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp); - ProcessOneNuc (entityID, bsp, src, tbl); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); } else if (IS_Bioseq_set (sep)) { bssp = (BioseqSetPtr) sep->data.ptrvalue; bssp->_class = BioseqseqSet_class_phy_set; @@ -2137,7 +2154,7 @@ static Uint2 ProcessAlignSet ( for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) { if (IS_Bioseq (tmp)) { bsp = (BioseqPtr) tmp->data.ptrvalue; - ProcessOneNuc (entityID, bsp, src, tbl); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); } } } else return 0; @@ -3086,6 +3103,46 @@ static void ProcessSourceTable ( } } +static SeqDescrPtr GetDescriptorTypeAlreadyInList (Uint1 descr_choice, SeqDescrPtr list) + +{ + while (list != NULL && list->choice != descr_choice) + { + list = list->next; + } + return list; +} + +static void AddTemplateDescriptors +( + SeqDescrPtr PNTR current_list, + SeqDescrPtr new_list +) + +{ + SeqDescrPtr sdp_next, sdp; + + if (current_list == NULL || new_list == NULL) return; + + for (sdp = new_list; sdp != NULL; sdp = sdp->next) + { + if ((sdp->choice == Seq_descr_molinfo || sdp->choice == Seq_descr_source) + && GetDescriptorTypeAlreadyInList (Seq_descr_molinfo, *current_list) != NULL) + { + continue; + } + else + { + sdp_next = sdp->next; + sdp->next = NULL; + ValNodeLink (current_list, AsnIoMemCopy ((Pointer) sdp, + (AsnReadFunc) SeqDescrAsnRead, + (AsnWriteFunc) SeqDescrAsnWrite)); + sdp->next = sdp_next; + } + } +} + static void ProcessOneRecord ( SubmitBlockPtr sbp, PubdescPtr pdp, @@ -3132,6 +3189,7 @@ static void ProcessOneRecord ( SimpleSeqPtr ssp; CharPtr str; SeqEntryPtr tmp; + MolInfoPtr template_molinfo = NULL; fp = OpenOneFile (directory, base, suffix); if (fp == NULL) return; @@ -3154,16 +3212,25 @@ static void ProcessOneRecord ( localname = base; } + /* find MolInfo from template, if there is any */ + sdp = sdphead; + while (sdp != NULL && sdp->choice != Seq_descr_molinfo) { + sdp = sdp->next; + } + if (sdp != NULL) { + template_molinfo = (MolInfoPtr) sdp->data.ptrvalue; + } + /* read one or more ASN.1 or FASTA sequence files */ if (tbl->fastaset) { - entityID = ProcessBulkSet (fp, src, tbl); + entityID = ProcessBulkSet (fp, src, tbl, template_molinfo); } else if (tbl->deltaset) { - entityID = ProcessDeltaSet (fp, src, tbl, localname, gsep); + entityID = ProcessDeltaSet (fp, src, tbl, localname, gsep, template_molinfo); } else if (tbl->alignset) { - entityID = ProcessAlignSet (fp, src, tbl); + entityID = ProcessAlignSet (fp, src, tbl, template_molinfo); } else { - entityID = ProcessOneAsn (fp, src, tbl, localname, gsep); + entityID = ProcessOneAsn (fp, src, tbl, localname, gsep, template_molinfo); } FileClose (fp); @@ -3359,16 +3426,10 @@ static void ProcessOneRecord ( if (sdphead != NULL) { if (IS_Bioseq (sep)) { bsp = (BioseqPtr) sep->data.ptrvalue; - ValNodeLink (&(bsp->descr), - AsnIoMemCopy ((Pointer) sdphead, - (AsnReadFunc) SeqDescrAsnRead, - (AsnWriteFunc) SeqDescrAsnWrite)); + AddTemplateDescriptors (&(bsp->descr), sdphead); } else if (IS_Bioseq_set (sep)) { dssp = (BioseqSetPtr) sep->data.ptrvalue; - ValNodeLink (&(dssp->descr), - AsnIoMemCopy ((Pointer) sdphead, - (AsnReadFunc) SeqDescrAsnRead, - (AsnWriteFunc) SeqDescrAsnWrite)); + AddTemplateDescriptors (&(dssp->descr), sdphead); } } dp = DateCurr (); @@ -3412,11 +3473,9 @@ static void ProcessOneRecord ( VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref); } - if (SeqMgrFeaturesAreIndexed (entityID)) { - InstantiateProteinTitles (entityID, NULL); - } else { - Message (MSG_POSTERR, "Unable to instantiate protein titles due to dropped index"); - } + /* need to reindex so hypothetical protein titles pick up locus_tag */ + SeqMgrIndexFeatures (entityID, NULL); + InstantiateProteinTitles (entityID, NULL); if (tbl->genprodset) { /* need to reindex before instantiating mRNA titles */ diff --git a/demo/trna2tbl.c b/demo/trna2tbl.c new file mode 100644 index 00000000..0270cc82 --- /dev/null +++ b/demo/trna2tbl.c @@ -0,0 +1,238 @@ +/* trna2tbl.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: trna2tbl.c +* +* Author: Jonathan Kans +* +* Version Creation Date: 5/31/05 +* +* $Revision: 1.1 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* +* ========================================================================== +*/ + +/* +* To compile on Linux: +* +* gcc -o trna2tbl trna2tbl.c -lm +* +* To compile on Solaris: +* +* cc -xildoff -o trna2tbl trna2tbl.c -lgen -lm +* +* To compile on SGI: +* +* cc -mips1 -o trna2tbl trna2tbl.c -lm -lPW -lsun +* +* To compile on Darwin: +* +* cc -pipe -o trna2tbl trna2tbl.c -lc +* +*/ + +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +/* convenient defines and typedefs from NCBI toolkit */ + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef Pointer +typedef void * Pointer; +#endif + +#ifndef Char +typedef char Char, * CharPtr; +#endif + +#ifndef Bool +typedef unsigned char Bool, * BoolPtr; +#endif + +#ifndef TRUE +#define TRUE ((Bool)1) +#endif + +#ifndef FALSE +#define FALSE ((Bool)0) +#endif + +#ifndef Int2 +typedef short Int2, * Int2Ptr; +#endif + +#ifndef Int4 +typedef long Int4, * Int4Ptr; +#endif + +#ifndef MIN +#define MIN(a,b) ((a)>(b)?(b):(a)) +#endif + +/* useful portable character macros from NCBI toolkit (assumes ASCII) */ + +#define IS_DIGIT(c) ('0'<=(c) && (c)<='9') +#define IS_UPPER(c) ('A'<=(c) && (c)<='Z') +#define IS_LOWER(c) ('a'<=(c) && (c)<='z') +#define IS_ALPHA(c) (IS_UPPER(c) || IS_LOWER(c)) +#define TO_LOWER(c) ((Char)(IS_UPPER(c) ? (c)+' ' : (c))) +#define TO_UPPER(c) ((Char)(IS_LOWER(c) ? (c)-' ' : (c))) +#define IS_WHITESP(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t')) +#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_DIGIT(c)) +#define IS_PRINT(c) (' '<=(c) && (c)<='~') + +#define MAX_FIELDS 9 + +static void RunTrnaScan (void) + +{ + CharPtr aa; + CharPtr beg; + Char buf [256]; + Char cmmd [256]; + CharPtr end; + CharPtr field [MAX_FIELDS]; + CharPtr id; + Int2 idNotSent = TRUE; + Int2 inBody = FALSE; + CharPtr intronBeg; + CharPtr intronEnd; + long int intronStart; + long int intronStop; + Int2 numFields = 0; + CharPtr ptr; + CharPtr speed; + long int start; + long int stop; + Char str [80]; + +/* line by line processing of tRNAscan-SE output table */ + + while (fgets (buf, sizeof (buf), stdin) != NULL) { + + if (inBody) { + memset (field, 0, sizeof (field)); + +/* +* parse tab-delimited output line into array of fields, avoiding use of +* strtok so that empty columns (adjacent tabs) are properly assigned to +* field array +*/ + + ptr = buf; + for (numFields = 0; numFields < MAX_FIELDS && ptr != NULL; numFields++) { + field [numFields] = ptr; + ptr = strchr (ptr, '\t'); + if (ptr != NULL) { + *ptr = '\0'; + ptr++; + } + } + +/* interested in ID, start, stop, amino acid, and intron start and stop */ + + id = field [0]; + beg = field [2]; + end = field [3]; + aa = field [4]; + intronBeg = field [6]; + intronEnd = field [7]; + + if (numFields > 7 && + sscanf (beg, "%ld", &start) == 1 && + sscanf (end, "%ld", &stop) == 1 && + sscanf (intronBeg, "%ld", &intronStart) == 1 && + sscanf (intronEnd, "%ld", &intronStop) == 1) { + +/* first line of output gives SeqId from FASTA definition line */ + + if (idNotSent) { + fprintf (stdout, ">Features %s\n", id); + fflush (stdout); + idNotSent = FALSE; + } + +/* first line of feature has start (tab) stop (tab) feature key */ +/* multiple intervals would have lines of start (tab) stop */ + + if (intronStart == 0 && intronStop == 0) { + fprintf (stdout, "%ld\t%ld\ttRNA\n", (long) start, (long) stop); + fflush (stdout); + } else { + fprintf (stdout, "%ld\t%ld\ttRNA\n", (long) start, (long) (intronStart - 1)); + fprintf (stdout, "%ld\t%ld\t\n", (long) (intronStop + 1), (long) stop); + fflush (stdout); + } + +/* qualifier lines are (tab) (tab) (tab) qualifier key (tab) value */ + + if (strstr (aa, "Pseudo") != NULL) { + fprintf (stdout, "\t\t\tnote\ttRNA-Pseudo\n"); + fflush (stdout); + } else { + fprintf (stdout, "\t\t\tproduct\t%s\n", aa); + fflush (stdout); + } + +/* dash (formerly empty) gene qualifier to suppress /gene (e.g., if tRNA is in an intron) */ + + fprintf (stdout, "\t\t\tgene\t-\n"); + fflush (stdout); + } + } + +/* detect last line of table header, ignoring everything before data section */ + + if (strstr (buf, "-----") != NULL) { + inBody = TRUE; + } + } + + if (idNotSent) { + fprintf (stdout, ">Message\ntRNAscan-SE found no tRNA genes in this sequence\n"); + fflush (stdout); + } +} + +main (int argc, char *argv[]) + +{ + RunTrnaScan (); + return 0; +} + |