diff options
Diffstat (limited to 'demo')
-rw-r--r-- | demo/blast_driver.c | 72 | ||||
-rw-r--r-- | demo/blastall.c | 9 | ||||
-rw-r--r-- | demo/copymat.c | 254 | ||||
-rw-r--r-- | demo/debruijn.c | 113 | ||||
-rw-r--r-- | demo/formatdb.c | 14 | ||||
-rw-r--r-- | demo/megablast.c | 42 | ||||
-rw-r--r-- | demo/rtestval.c | 6 | ||||
-rw-r--r-- | demo/tbl2asn.c | 152 |
8 files changed, 474 insertions, 188 deletions
diff --git a/demo/blast_driver.c b/demo/blast_driver.c index 8249d58f..65befc58 100644 --- a/demo/blast_driver.c +++ b/demo/blast_driver.c @@ -1,4 +1,4 @@ -/* $Id: blast.c,v 1.9 2003/10/23 20:15:37 dondosha Exp $ +/* $Id: blast_driver.c,v 1.16 2004/01/07 14:40:20 papadopo Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -32,10 +32,10 @@ Author: Ilya Dondoshansky Contents: Main function for running BLAST ****************************************************************************** - * $Revision: 1.9 $ + * $Revision: 1.16 $ * */ -static char const rcsid[] = "$Id: blast.c,v 1.9 2003/10/23 20:15:37 dondosha Exp $"; +static char const rcsid[] = "$Id: blast_driver.c,v 1.16 2004/01/07 14:40:20 papadopo Exp $"; #include <ncbi.h> #include <sqnutils.h> @@ -45,9 +45,9 @@ static char const rcsid[] = "$Id: blast.c,v 1.9 2003/10/23 20:15:37 dondosha Exp #include <algo/blast/core/blast_message.h> #include <algo/blast/core/blast_util.h> #include <algo/blast/core/blast_engine.h> -#include "blast_seqalign.h" -#include "blast_format.h" -#include "seqsrc_readdb.h" +#include <algo/blast/api/blast_seqalign.h> +#include <algo/blast/api/blast_format.h> +#include <algo/blast/api/seqsrc_readdb.h> #define NUMARG (sizeof(myargs)/sizeof(myargs[0])) @@ -147,8 +147,9 @@ static Args myargs[] = { "0", NULL, NULL, FALSE, 'y', ARG_INT, 0.0, 0, NULL}, { "Do only ungapped alignment (always TRUE for tblastx)",/*ARG_UNGAPPED*/ "F", NULL, NULL, FALSE, 'u', ARG_BOOLEAN, 0.0, 0, NULL}, - { "Use greedy algorithm for gapped extensions", /* ARG_GREEDY */ - "F", NULL, NULL, FALSE, 'g', ARG_BOOLEAN, 0.0, 0, NULL}, + { "Use greedy algorithm for gapped extensions:\n 0 no, 1 one-step, " + "2 two-step, 3 two-step with ungapped", /* ARG_GREEDY */ + "F", NULL, NULL, FALSE, 'g', ARG_INT, 0.0, 0, NULL}, { "Gap open penalty (default: non-affine if greedy; 5 if dyn. prog.)", "0", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL}, /* ARG_GAPOPEN */ { "Gap extension penalty (default: non-affine if greedy; 2 otherwise)", @@ -256,6 +257,14 @@ BLAST_FillOptions(LookupTableOptions* lookup_options, if (myargs[ARG_STRIDE].intvalue) lookup_options->scan_step = myargs[ARG_STRIDE].intvalue; + + if (myargs[ARG_PHI].strvalue) { + lookup_options->phi_pattern = strdup(myargs[ARG_PHI].strvalue); + lookup_options->lut_type = + ((program_number == blast_type_blastn) ? + PHI_NA_LOOKUP : PHI_AA_LOOKUP); + hit_options->phi_align = TRUE; + } BLAST_FillQuerySetUpOptions(query_setup_options, program_number, myargs[ARG_FILTER].strvalue, myargs[ARG_STRAND].intvalue); @@ -272,6 +281,25 @@ BLAST_FillOptions(LookupTableOptions* lookup_options, BLAST_FillExtensionOptions(ext_options, program_number, greedy_extension, myargs[ARG_XDROP].intvalue, myargs[ARG_XDROP_FINAL].intvalue); + if (greedy_extension) { + switch (myargs[ARG_GREEDY].intvalue) { + case 1: + ext_options->algorithm_type = EXTEND_GREEDY; + word_options->ungapped_extension = FALSE; + break; + case 2: + ext_options->algorithm_type = EXTEND_GREEDY_NO_TRACEBACK; + word_options->ungapped_extension = FALSE; + break; + case 3: + ext_options->algorithm_type = EXTEND_GREEDY_NO_TRACEBACK; + word_options->ungapped_extension = TRUE; + break; + default: + break; + } + } + BLAST_FillScoringOptions(score_options, program_number, greedy_extension, myargs[ARG_MISMATCH].intvalue, myargs[ARG_MATCH].intvalue, myargs[ARG_MATRIX].strvalue, myargs[ARG_GAPOPEN].intvalue, @@ -304,10 +332,10 @@ BLAST_FillOptions(LookupTableOptions* lookup_options, BLAST_FillEffectiveLengthsOptions(eff_len_options, numseqs, totlen, (Int8) myargs[ARG_SEARCHSP].floatvalue); - if (myargs[ARG_DBGENCODE].intvalue && db_options && - (program_number == blast_type_tblastn || - program_number == blast_type_tblastx)) { - db_options->genetic_code = myargs[ARG_DBGENCODE].intvalue; + if (db_options && (program_number == blast_type_tblastn || + program_number == blast_type_tblastx)) { + if (myargs[ARG_DBGENCODE].intvalue) + db_options->genetic_code = myargs[ARG_DBGENCODE].intvalue; if ((status = BLAST_GeneticCodeFind(db_options->genetic_code, &db_options->gen_code_string))) return status; @@ -338,13 +366,13 @@ Int2 Nlm_Main(void) Int2 status; QuerySetUpOptions* query_options=NULL; BlastEffectiveLengthsOptions* eff_len_options=NULL; - BlastMask* lcase_mask = NULL; - BlastMask* filter_loc=NULL; /* All masking locations */ + BlastMaskLoc* lcase_mask = NULL; + BlastMaskLoc* filter_loc=NULL; /* All masking locations */ SeqLoc* query_slp = NULL; BlastScoreBlk* sbp = NULL; FILE *infp, *outfp; BlastQueryInfo* query_info; - BlastResults* results = NULL; + BlastHSPResults* results = NULL; Blast_Message* blast_message = NULL; SeqAlign* seqalign; BlastFormattingOptions* format_options; @@ -413,6 +441,8 @@ Int2 Nlm_Main(void) readdb_args = (ReaddbNewArgs*) malloc(sizeof(ReaddbNewArgs)); readdb_args->dbname = dbname = myargs[ARG_DB].strvalue; readdb_args->is_protein = !db_is_na; + readdb_args->first_db_seq = 0; + readdb_args->final_db_seq = 0; bssn_info.constructor = &ReaddbSeqSrcNew; bssn_info.ctor_argument = (void*) readdb_args; @@ -460,7 +490,7 @@ Int2 Nlm_Main(void) } if (translated_query) { - BlastMaskDNAToProtein(&lcase_mask, query_slp); + BlastMaskLocDNAToProtein(&lcase_mask, query_slp); } status = BLAST_SetUpQuery(program_number, query_slp, @@ -476,7 +506,7 @@ Int2 Nlm_Main(void) if (translated_query) { /* Filter locations were returned in protein coordinates; convert them back to nucleotide here */ - BlastMaskProteinToDNA(&filter_loc, query_slp); + BlastMaskLocProteinToDNA(&filter_loc, query_slp); } if (status) { @@ -490,8 +520,6 @@ Int2 Nlm_Main(void) LookupTableWrapInit(query, lookup_options, lookup_segments, sbp, &lookup_wrap); - return_stats = (BlastReturnStat*) calloc(1, sizeof(BlastReturnStat)); - if (bssp) { BLAST_DatabaseSearchEngine(program_number, query, query_info, bssp, sbp, score_options, lookup_wrap, @@ -512,7 +540,7 @@ Int2 Nlm_Main(void) /* Convert results to the SeqAlign form */ BLAST_ResultsToSeqAlign(program_number, results, query_slp, bssp, - subject_slp, score_options, sbp, hit_options->gapped_calculation, + subject_slp, score_options, sbp, score_options->gapped_calculation, &seqalign); results = BLAST_ResultsFree(results); @@ -530,6 +558,8 @@ Int2 Nlm_Main(void) blast_program, query_info->num_queries, query_slp, filter_loc, format_options, score_options->is_ooframe); + BlastMaskLocFree(filter_loc); + PrintOutputFooter(program_number, format_options, score_options, sbp, lookup_options, word_options, ext_options, hit_options, query_info, readdb_args, return_stats); @@ -540,7 +570,9 @@ Int2 Nlm_Main(void) query_slp = SeqLocSetFree(query_slp); } /* End loop on sets of queries */ + sfree(readdb_args); subject = BlastSequenceBlkFree(subject); + subject_slp = SeqLocSetFree(subject_slp); sfree(return_stats); LookupTableOptionsFree(lookup_options); BlastQuerySetUpOptionsFree(query_options); diff --git a/demo/blastall.c b/demo/blastall.c index 6f56f52f..41fcf1b7 100644 --- a/demo/blastall.c +++ b/demo/blastall.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: blastall.c,v 6.135 2003/08/21 15:37:54 dondosha Exp $"; +static char const rcsid[] = "$Id: blastall.c,v 6.136 2003/11/05 22:28:06 dondosha Exp $"; -/* $Id: blastall.c,v 6.135 2003/08/21 15:37:54 dondosha Exp $ +/* $Id: blastall.c,v 6.136 2003/11/05 22:28:06 dondosha Exp $ ************************************************************************** * * * COPYRIGHT NOTICE * @@ -28,6 +28,9 @@ static char const rcsid[] = "$Id: blastall.c,v 6.135 2003/08/21 15:37:54 dondosh ************************************************************************** * * $Log: blastall.c,v $ + * Revision 6.136 2003/11/05 22:28:06 dondosha + * No need to shift subsequence coordinates in tabular output, since they are already shifted in the seqalign + * * Revision 6.135 2003/08/21 15:37:54 dondosha * Corrections for out-of-frame tabular output and megablast XML output * @@ -1571,7 +1574,7 @@ Int2 Main (void) BlastPrintTabularResults(curr_seqalign, query_bsp, slp, number_of_alignments, blast_program, !options->gapped_calculation, options->is_ooframe, - believe_query, from, 0, global_fp, NULL, (align_view == 9)); + believe_query, 0, 0, global_fp, NULL, (align_view == 9)); SeqAlignSetFree(curr_seqalign); } diff --git a/demo/copymat.c b/demo/copymat.c index 08231e58..adbdd1e8 100644 --- a/demo/copymat.c +++ b/demo/copymat.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: copymat.c,v 6.25 2003/05/30 17:31:09 coulouri Exp $"; +static char const rcsid[] = "$Id: copymat.c,v 6.30 2004/01/30 20:34:45 coulouri Exp $"; /* * =========================================================================== @@ -36,6 +36,22 @@ Contents: main routines for copymatrices program to convert score matrices output by makematrices into a single byte-encoded file. $Log: copymat.c,v $ +Revision 6.30 2004/01/30 20:34:45 coulouri +fix minor nit to FileWrite call + +Revision 6.29 2004/01/26 19:40:48 coulouri +* Correct buffer overrun +* Use offset rather than pointer in LookupBackboneCell + +Revision 6.28 2003/11/24 18:18:47 coulouri +Correction to previous fix for 64-bit irix + +Revision 6.27 2003/11/21 18:01:15 ivanov +Added extern definition for impalaMakeFileNames() + +Revision 6.26 2003/11/20 15:44:32 camacho +Tom Madden's changes to use lookup table contruction code from algo/blast. + Revision 6.25 2003/05/30 17:31:09 coulouri add rcsid @@ -107,10 +123,39 @@ Changed a little format of RPS lookup tables file. #include <sequtil.h> #include <seqport.h> #include <tofasta.h> -#include <blast.h> -#include <blastpri.h> -#include <posit.h> -#include <profiles.h> + +#ifndef MAXLINELEN +# define MAXLINELEN 2000 +#endif +#ifndef MAX_NAME_LENGTH +# define MAX_NAME_LENGTH 500 +#endif +#ifndef PRO_ALPHABET_SIZE +# define PRO_ALPHABET_SIZE 26 +#endif +#ifndef SORT_THRESHOLD +# define SORT_THRESHOLD 20 +#endif +#ifndef RPS_MAGIC_NUMBER +# define RPS_MAGIC_NUMBER 7702 +#endif +/*factor used to multiply the gapped K parameter to make it + more accurate in most cases*/ +#ifndef PRO_K_MULTIPLIER +# define PRO_K_MULTIPLIER 1.2 +#endif +#include <algo/blast/core/blast_lookup.h> +#include <algo/blast/core/blast_options.h> + +typedef Int4 ScoreRow[PRO_ALPHABET_SIZE]; +extern Boolean LIBCALL +IMPALAPrintHelp PROTO((Boolean html, Int4 line_length, Char * programName, + FILE *outfp)); +extern void LIBCALL +impalaMakeFileNames PROTO((Char * matrixDbName, Char * auxiliaryFileName, + Char * mmapFileName, Char * seqFileName, + Char *matrixFileName, Char * ckptFileName, + Char *directoryPrefix)); #define NUMARG (sizeof(myargs)/sizeof(myargs[0])) @@ -333,6 +378,40 @@ static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles, return(totalLength); } +static Boolean RPSUpdateOffsets(LookupTable *lookup) +{ + Uint4 len; + Int4 index; + Int4 num_used; + Int4 offset_diff; + + len = lookup->backbone_size; + offset_diff = lookup->wordsize - 1; + + /* Walk through table, copying info into mod_lt[] */ + for(index = 0; index < len; index++) { + + if((num_used=lookup->thick_backbone[index].num_used) <= 3) + { + while (num_used > 0) + { + num_used--; + lookup->thick_backbone[index].payload.entries[num_used] += offset_diff; + } + } + else + { + while (num_used > 0) + { + num_used--; + lookup->overflow [ lookup->thick_backbone[index].payload.overflow_cursor + num_used] += offset_diff; + } + } + } + return TRUE; +} + + /* #define RPS_THRESHOLD 11 */ /* #define RPS_WORDSIZE 3 */ @@ -341,50 +420,44 @@ static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles, pointers relative to the start of "mod_lookup_table_memory" chunk RPS Blast will calculate real pointers in run time using these values */ -Boolean RPSUpdatePointers(LookupTablePtr lookup) +Boolean RPSUpdatePointers(LookupTable *lookup, Uint4 *new_overflow, Uint4 *new_overflow_size) { - ModLAEntry * mod_lt; Uint4 len; Int4 index; - ModLookupPositionPtr start_address; + Uint4 *start_address; long mlpp_address; - ModLookupPositionPtr *lpp; + Uint4 *new_overflow_cursor; + Int4 *src; + Int4 first_hit; + + len = lookup->backbone_size; - len = lookup->array_size; - mod_lt=lookup->mod_lt; - start_address = (ModLookupPositionPtr) lookup->mod_lookup_table_memory; + start_address = new_overflow_cursor = new_overflow; /* Walk through table, copying info into mod_lt[] */ for(index = 0; index < len; index++) { - if(mod_lt[index].num_used <= 3) + if(lookup->thick_backbone[index].num_used <= 3) continue; -#if 1 + src = &(lookup->overflow[lookup->thick_backbone[index].payload.overflow_cursor]); + MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(lookup->thick_backbone[index].num_used-1)); + + mlpp_address = (long) new_overflow_cursor; + + new_overflow_cursor += lookup->thick_backbone[index].num_used-1; + first_hit = src[0]; - /* Taking pointer to 4/8 bytes address */ - lpp= (ModLookupPositionPtr *) &mod_lt[index].entries[1]; - - mlpp_address = (long) *lpp; mlpp_address -= (long) start_address; /* Now this is new relative address - usually small */ - *lpp = (ModLookupPositionPtr) mlpp_address; - -#if defined(OS_UNIX_IRIX) - if(sizeof(ModLookupPositionPtr) == 8) { /* 64bit build */ - mlpp_address = mod_lt[index].entries[1]; - mod_lt[index].entries[1] = mod_lt[index].entries[2]; - mod_lt[index].entries[2] = mlpp_address; - } -#endif - -#else - mod_lt[index].entries[1] -= (int) start_address; - mod_lt[index].entries[2] = 0; /* Not used */ -#endif + lookup->thick_backbone[index].payload.entries[1] = (Int4) mlpp_address; + lookup->thick_backbone[index].payload.entries[0] = first_hit; } + + *new_overflow_size = new_overflow_cursor - new_overflow; + return TRUE; } @@ -392,17 +465,24 @@ Boolean RPSUpdatePointers(LookupTablePtr lookup) Write lookup table to the disk into file "*.loo", which will be used memory-mapped during RPS Blast search */ -Boolean RPSDumpLookupTable(LookupTablePtr lookup, FILE *fd) +Boolean RPSDumpLookupTable(LookupTable *lookup, FILE *fd) { + Uint4 *new_overflow; + Uint4 new_overflow_size; - RPSUpdatePointers(lookup); + RPSUpdateOffsets(lookup); - FileWrite(lookup->mod_lt, sizeof(ModLAEntry), 1+lookup->array_size, fd); - if(lookup->mod_lookup_table_size) { - FileWrite(lookup->mod_lookup_table_memory, - lookup->mod_lookup_table_size, - sizeof(ModLookupPosition), fd); - } + new_overflow = malloc(lookup->overflow_size*sizeof(Uint4)); + RPSUpdatePointers(lookup, new_overflow, &new_overflow_size); + + FileWrite(lookup->thick_backbone, sizeof(LookupBackboneCell), lookup->backbone_size, fd); + if(new_overflow_size) + FileWrite(new_overflow, + sizeof(Uint4), + new_overflow_size, + fd); + + sfree(new_overflow); return TRUE; } @@ -415,25 +495,63 @@ Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles, Int4Ptr seqlens, CharPtr filename, Nlm_FloatHi scalingFactor) { - LookupTablePtr lookup; - BlastAllWordPtr all_words; - Int4 start, i, header_size, all_length, magicNumber; + BlastScoreBlk *sbp; + DoubleInt *double_int; FILE *fd; + Int4 **posMatrix; + Int4 start, i, header_size, all_length, magicNumber; Int4Ptr offsets; - BLAST_ScorePtr PNTR posMatrix; Int4 num_lookups; + ListNode *lookup_segment=NULL; + LookupTable *lookup; + LookupTableWrap* lookup_wrap_ptr=NULL; + LookupTableOptions* lookup_options; + if((fd = FileOpen(filename, "wb")) == NULL) return FALSE; num_lookups = 1; /* Single lookup table for all set */ - lookup = lookup_new(PRO_ALPHABET_SIZE, 3, 0); - all_words = BlastPopulateAllWordArrays(3, PRO_ALPHABET_SIZE); + all_length = seqlens[numProfiles] - seqlens[0]; + + posMatrix = MemNew((all_length + 1) * sizeof(Int4 *)); + for (i = 0; i < all_length; i++) { + posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]); + } + + /* Last row is necessary */ + posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE); + + for(i = 0; i < PRO_ALPHABET_SIZE; i++) { + posMatrix[all_length][i] = -INT2_MAX; + } + + sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1); + sbp->posMatrix = posMatrix; + LookupTableOptionsNew(blast_type_blastp, &lookup_options); + BLAST_FillLookupTableOptions(lookup_options, blast_type_blastp, FALSE, + (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue, FALSE, FALSE, TRUE); /* add last arg for psi-blast?? */ + + + double_int = (DoubleInt*) calloc(1, sizeof(DoubleInt)); + double_int->i1 = 0; + double_int->i2 = all_length; + + ListNodeAddPointer(&lookup_segment, 0, double_int); + + /* Need query for psi-blast?? where to put the PSSM? */ + LookupTableWrapInit(NULL, lookup_options, lookup_segment, sbp, &lookup_wrap_ptr); + + sbp->posMatrix = NULL; + sbp = BlastScoreBlkFree(sbp); + lookup_options = LookupTableOptionsFree(lookup_options); + lookup_segment = ListNodeFreeData(lookup_segment); + + lookup = (LookupTable*) lookup_wrap_ptr->lut; /* Only Uint4 maximum length for lookup file allowed in current implementation */ - header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4); /* Beginning of file will be allocated for lookup offsets */ @@ -441,32 +559,11 @@ Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles, offsets = MemNew(sizeof(Int4) * (num_lookups + 1)); - all_length = seqlens[numProfiles] - seqlens[0]; - - posMatrix = MemNew((all_length + 1) * sizeof(BLAST_Score *)); - for (i = 0; i < all_length; i++) { - posMatrix[i] = (BLAST_Score *) &(combinedMatrix[i][0]); - } - - /* Last row is necessary */ - posMatrix[all_length] = MemNew(sizeof(BLAST_Score) * PRO_ALPHABET_SIZE); - for(i = 0; i < PRO_ALPHABET_SIZE; i++) { - posMatrix[all_length][i] = -INT2_MAX; - } offsets[0] = ftell(fd); start = seqlens[0]; /* 0 */ - if(BlastNewFindWordsEx(lookup, &posMatrix[start], 0, all_length, - all_words, (Int4) (myargs[3].floatvalue*scalingFactor), - myargs[4].intvalue, 0) < 0) { - ErrPostEx(SEV_ERROR, 0,0, "Failure to create llokup table"); - return FALSE; - } - - lookup_position_aux_destruct(lookup); - RPSDumpLookupTable(lookup, fd); i = 1; @@ -477,9 +574,9 @@ Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles, magicNumber = RPS_MAGIC_NUMBER; FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */ FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */ - FileWrite(&lookup->num_pos_added, sizeof(Int4), 1, fd); /* header[2] */ - FileWrite(&lookup->num_unique_pos_added, sizeof(Int4), 1, fd); /* header[3] */ - FileWrite(&lookup->mod_lookup_table_size, sizeof(Int4), 1, fd); /* header[4] */ + FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */ + FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */ + FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */ /* Now writing recorded offsets in the beginning of the file */ @@ -487,27 +584,14 @@ Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles, FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd); FileClose(fd); -/* comment out for now, why is this here? - if (scalingFactor != 1.0) { - for(j = 0; j < all_length; j++) { - for(i = 0; i < PRO_ALPHABET_SIZE; i++) { - combinedMatrix[j][i] /= scalingFactor; - } - } - } -*/ - /* Final memory cleenup */ - lookup = lookup_destruct(lookup); - MemFree(posMatrix[all_length]); MemFree(posMatrix); - BlastAllWordDestruct(all_words); - return TRUE; } + /* -- SSH -- Create file <database_name> (without extention), which is concatenation of all FASTA files used. Used by RPS Blast. diff --git a/demo/debruijn.c b/demo/debruijn.c index 732745b0..c287351c 100644 --- a/demo/debruijn.c +++ b/demo/debruijn.c @@ -1,72 +1,107 @@ -#include <stdio.h> -#include <util.h> +/* $Id: debruijn.c,v 1.3 2004/01/05 20:44:55 coulouri Exp $ + +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's offical duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== + +*/ + +static char const rcsid[] = "$Id: debruijn.c,v 1.3 2004/01/05 20:44:55 coulouri Exp $"; /* * example driver for de Bruijn sequences. * * this code generates all n-mers over a protein - * or dna alphabet. useful for creating fasta test sequences. + * or dna alphabet. useful for creating test sequences. */ -/* k = 22 */ -char proteinalphabet[] = "arndcqeghilkmfpstwyvbz"; +#include <ncbi.h> +#include <algo/blast/core/lookup_util.h> + +static Args myargs[] = { + { "word size", + NULL, NULL, NULL, FALSE, 'n', ARG_INT, 0.0, 0, NULL }, + { "alphabet\n" + "(supply 'ncbistdaa' or 'ncbi2na' for standard\n" + "alphabets, or supply your own alphabet)\n", + NULL, NULL, NULL, FALSE, 'a', ARG_STRING, 0.0, 0, NULL }, +}; -/* k = 4 */ -char dnaalphabet[] = "acgt"; +Uint1 ncbistdaa[] = "-abcdefghiklmnpqrstvwxyzu*"; +Uint1 ncbi2na[] = "acgt"; -int main(int argc, char *argv[]) +Int2 Main(void) { - int i; - int n, k; - char *output; - int outputsize; - char *alphabet; + Int4 i; + Int4 n, k; + Uint1 *output; + Int4 outputsize; + Uint1 *alphabet=NULL; + + if ( ! GetArgs("debruijn", sizeof(myargs)/sizeof(myargs[0]), myargs) ) + return 1; + + n = myargs[0].intvalue; - if (argc != 4) + if (n < 1) { - fprintf(stderr, "usage: %s n k prot-or-dna\n", argv[0]); - fprintf(stderr, "where prot-or-dna = 0 for protein, 1 for dna\n"); - fprintf(stderr, "example: %s 3 22 0\n",argv[0]); - fprintf(stderr, "example: %s 11 4 1\n",argv[0]); - exit(1); + fprintf(stderr,"n must be greater than one.\n"); + return 1; } - - n = atoi(argv[1]); - k = atoi(argv[2]); - - if (atoi(argv[3]) == 0) - alphabet = proteinalphabet; - else - alphabet = dnaalphabet; + alphabet = myargs[1].strvalue; + + if (strcmp("ncbistdaa", myargs[1].strvalue) == 0) + alphabet = ncbistdaa; + + if (strcmp("ncbi2na", myargs[1].strvalue) == 0) + alphabet = ncbi2na; + + k = strlen(alphabet); + /* output array needs: * k^n bytes - to store the de Bruijn sequence * n-1 bytes - to unwrap (see below) * 1 byte - for the terminating NUL */ - outputsize = iexp(k,n) + (n-1) + 1; - output = (char *) calloc( outputsize, sizeof(char)); + outputsize = iexp(k,n) + (n-1); + output = (char *) malloc(outputsize + 1); /* compute the (n,k) de Bruijn sequence */ debruijn(n,k,output,alphabet); - /* but, we don't want a true cyclical de Bruijn sequence, we want - * all words in a straight line. so we copy the first n-1 letters + /* We don't want a true cyclical de Bruijn sequence; we want + * all words in a straight line- copy the first n-1 letters * to the end. */ for(i=0;i<(n-1);i++) - output[outputsize-((n-1)+1)+i] = output[i]; - - /* don't forget to NUL-terminate it */ + output[outputsize-n+1+i] = output[i]; - output[outputsize-1] = '\0'; + /* Terminate the string. */ - fprintf(stderr,"n (word size) = %d\n",n); - fprintf(stderr,"k (alphabet size) = %d\n",k); - fprintf(stderr,"output size = k^n + (n-1) + 1 = %d + %d + 1 = %d\n",iexp(k,n),n-1,outputsize); - fprintf(stderr,"naive size would have been %d\n",n * iexp(k,n)); + output[outputsize+1] = '\0'; puts(output); diff --git a/demo/formatdb.c b/demo/formatdb.c index 37e7a8ed..9a2143fe 100644 --- a/demo/formatdb.c +++ b/demo/formatdb.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: formatdb.c,v 6.91 2003/10/01 18:59:56 camacho Exp $"; +static char const rcsid[] = "$Id: formatdb.c,v 6.92 2004/01/29 14:56:44 camacho Exp $"; /***************************************************************************** @@ -32,11 +32,14 @@ static char const rcsid[] = "$Id: formatdb.c,v 6.91 2003/10/01 18:59:56 camacho Version Creation Date: 10/01/96 - $Revision: 6.91 $ + $Revision: 6.92 $ File Description: formats FASTA databases for use by BLAST $Log: formatdb.c,v $ + Revision 6.92 2004/01/29 14:56:44 camacho + Removed -A option, FORMATDB_VER_TEXT no longer supported + Revision 6.91 2003/10/01 18:59:56 camacho Fix to creation of custom databases using a gi list and alias files when the source database spans multiple volumes. @@ -418,8 +421,6 @@ Args dump_args[] = { "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL}, { "Verbose: check for non-unique string ids in the database", "F", NULL, NULL, TRUE, 'V', ARG_BOOLEAN, 0.0, 0, NULL}, - { "Create ASN.1 structured deflines", - "T", NULL, NULL, TRUE, 'A', ARG_BOOLEAN, 0.0, 0, NULL}, { "Create an alias file with this name\n" " use the gifile arg (below) if set to calculate db size\n" " use the BLAST db specified with -i (above)", @@ -452,7 +453,6 @@ enum { dbsize_arg, sparse_arg, nonunique_arg, - asn1_deflines_arg, alias_fn_arg, gifile_arg, bin_gifile_arg, @@ -669,9 +669,7 @@ Int2 Main(void) dump_args[basename_arg].strvalue, dump_args[alias_fn_arg].strvalue, ((Int8)dump_args[dbsize_arg].intvalue)*1000000, 0, - dump_args[asn1_deflines_arg].intvalue ? - FORMATDB_VER : FORMATDB_VER_TEXT , - FALSE, 0); + FORMATDB_VER, FALSE, 0); if (options == NULL) return 1; diff --git a/demo/megablast.c b/demo/megablast.c index 2fa5beb6..7db38590 100644 --- a/demo/megablast.c +++ b/demo/megablast.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: megablast.c,v 6.107 2003/05/30 17:31:09 coulouri Exp $"; +static char const rcsid[] = "$Id: megablast.c,v 6.111 2004/01/27 20:47:18 dondosha Exp $"; -/* $Id: megablast.c,v 6.107 2003/05/30 17:31:09 coulouri Exp $ +/* $Id: megablast.c,v 6.111 2004/01/27 20:47:18 dondosha Exp $ ************************************************************************** * * * COPYRIGHT NOTICE * @@ -28,6 +28,18 @@ static char const rcsid[] = "$Id: megablast.c,v 6.107 2003/05/30 17:31:09 coulou ************************************************************************** * $Revision 6.13$ * * $Log: megablast.c,v $ + * Revision 6.111 2004/01/27 20:47:18 dondosha + * Set no_traceback option to 2 for -D4, 1 for -D0, 0 for others + * + * Revision 6.110 2003/11/05 22:28:06 dondosha + * No need to shift subsequence coordinates in tabular output, since they are already shifted in the seqalign + * + * Revision 6.109 2003/10/30 17:30:49 dondosha + * Fixed error in previous commit - output file pointer has to be set for all cases + * + * Revision 6.108 2003/10/29 17:48:15 dondosha + * Added -D4 option for 2-stage greedy gapped extension with format identical to -D2 + * * Revision 6.107 2003/05/30 17:31:09 coulouri * add rcsid * @@ -396,7 +408,8 @@ enum { MBLAST_ENDPOINTS = 0, MBLAST_SEGMENTS, MBLAST_ALIGNMENTS, - MBLAST_ALIGN_INFO + MBLAST_ALIGN_INFO, + MBLAST_DELAYED_TRACEBACK }; static int LIBCALLBACK @@ -912,6 +925,7 @@ Int2 Main (void) Boolean lcase_masking; BlastOutputPtr boutp = NULL; MBXmlPtr mbxp = NULL; + Boolean traditional_formatting; StringCpy(buf, "megablast "); StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf)-1); @@ -947,14 +961,17 @@ Int2 Main (void) } align_type = BlastGetTypes(blast_program, &query_is_na, &db_is_na); - + traditional_formatting = + (myargs[12].intvalue == MBLAST_ALIGNMENTS || + myargs[12].intvalue == MBLAST_DELAYED_TRACEBACK); + if (myargs[15].strvalue) { if (myargs[15].strvalue[0] == 'f' || myargs[15].strvalue[0] == 'F' || myargs[15].strvalue[0] == '0') believe_query = FALSE; else believe_query = TRUE; - } else if (myargs[12].intvalue == MBLAST_ALIGNMENTS && + } else if (traditional_formatting && !myargs[14].strvalue) believe_query = FALSE; else @@ -1058,9 +1075,11 @@ Int2 Main (void) options->gifile = StringSave(myargs[22].strvalue); if (myargs[12].intvalue == MBLAST_ENDPOINTS) - options->no_traceback = TRUE; + options->no_traceback = 1; + else if (myargs[12].intvalue == MBLAST_DELAYED_TRACEBACK) + options->no_traceback = 2; else - options->no_traceback = FALSE; + options->no_traceback = 0; options->megablast_full_deflines = (Boolean) myargs[27].intvalue; options->perc_identity = (FloatLo) myargs[30].floatvalue; @@ -1077,10 +1096,9 @@ Int2 Main (void) StrCpy(prefix, ""); global_fp = outfp; - if (myargs[12].intvalue != MBLAST_ALIGNMENTS) - options->output = outfp; + options->output = outfp; - if (myargs[12].intvalue==MBLAST_ALIGNMENTS) { + if (traditional_formatting) { if (align_view < 7) { if (html) { fprintf(outfp, "<HTML>\n<TITLE>MEGABLAST Search Results</TITLE>\n"); @@ -1233,7 +1251,7 @@ Int2 Main (void) } - if (myargs[12].intvalue==MBLAST_ALIGNMENTS) { + if (traditional_formatting) { dbinfo = NULL; ka_params = NULL; ka_params_gap = NULL; @@ -1333,7 +1351,7 @@ Int2 Main (void) BlastPrintTabulatedResults(seqalign, query_bsp_array[index], NULL, number_of_alignments, blast_program, !options->gapped_calculation, - believe_query, options->required_start, 0, + believe_query, 0, 0, global_fp, (align_view == 9)); SeqAlignSetFree(seqalign); diff --git a/demo/rtestval.c b/demo/rtestval.c index 6752bbe7..1a3bd194 100644 --- a/demo/rtestval.c +++ b/demo/rtestval.c @@ -5,7 +5,7 @@ * check for stop codons * Check for and fix non 3.0 asn spec things * -* $Id: rtestval.c,v 1.10 2003/05/13 16:02:42 coulouri Exp $ +* $Id: rtestval.c,v 1.11 2003/11/14 18:07:54 kans Exp $ * *****************************************************************************/ #include <accid1.h> @@ -158,6 +158,10 @@ Int2 Main(void) vsp->cutoff = (Int2)(myargs[10].intvalue); vsp->useSeqMgrIndexes = (Boolean)(myargs[12].intvalue); /* indexed validate */ vsp->validateAlignments = (Boolean)(myargs[13].intvalue); + if (vsp->validateAlignments) { + vsp->alignFindRemoteBsp = TRUE; + vsp->doSeqHistAssembly = TRUE; + } vsp->farIDsInAlignments = (Boolean)(myargs[13].intvalue); vsp->alwaysRequireIsoJTA = (Boolean)(myargs[14].intvalue); vsp->farFetchCDSproducts = (Boolean) (myargs[15].intvalue && myargs[16].intvalue); diff --git a/demo/tbl2asn.c b/demo/tbl2asn.c index 1b36b887..a969f40e 100644 --- a/demo/tbl2asn.c +++ b/demo/tbl2asn.c @@ -29,7 +29,7 @@ * * Version Creation Date: 5/5/00 * -* $Revision: 6.71 $ +* $Revision: 6.79 $ * * File Description: * @@ -61,7 +61,7 @@ #include <simple.h> #include <aliparse.h> -#define TBL2ASN_APP_VER "2.5" +#define TBL2ASN_APP_VER "2.8" CharPtr TBL2ASN_APPLICATION = TBL2ASN_APP_VER; @@ -162,6 +162,7 @@ static void ValidateOneFile ( if (vsp != NULL) { vsp->useSeqMgrIndexes = TRUE; vsp->suppressContext = TRUE; + vsp->seqSubmitParent = TRUE; oldErrSev = ErrSetMessageLevel (SEV_NONE); ValidateSeqEntry (sep, vsp); ValidStructFree (vsp); @@ -552,6 +553,16 @@ static OrgStuff commonOrgStuff [] = { "PLN", 1, 1, 39947 }, { + "Aspergillus nidulans FGSC A4", "", + "Eukaryota; Fungi; Ascomycota; Pezizomycotina; Eurotiomycetes; Eurotiales; Trichocomaceae; Emericella", + "PLN", 1, 4, 227321 + }, + { + "environmental sequence", "", + "unclassified; environmental samples", + "UNA", 1, 2, 256318 + }, + { NULL, NULL, NULL, NULL, 0, 0, 0 } }; @@ -778,42 +789,62 @@ static BioseqPtr AttachSeqAnnotEntity (Uint2 entityID, SeqAnnotPtr sap, Boolean return bsp; } -static CharPtr TrimBracketsFromString (CharPtr str) +static CharPtr TrimBracketsFromString (CharPtr str, SqnTagPtr stp) { Uchar ch; /* to use 8bit characters in multibyte languages */ + Int2 count; CharPtr dst; CharPtr ptr; - if (StringHasNoText (str)) return str; + if (StringHasNoText (str) || stp == NULL) return str; /* remove bracketed fields */ + count = 0; dst = str; ptr = str; ch = *ptr; while (ch != '\0') { if (ch == '[') { - ptr++; - ch = *ptr; - while (ch != '\0' && ch != ']' && ch != '"') { + if (count < stp->num_tags && (! stp->used [count])) { + *dst = ch; + dst++; ptr++; ch = *ptr; - } - if (ch == '"') { + while (ch != '\0' && ch != ']') { + *dst = ch; + dst++; + ptr++; + ch = *ptr; + } + *dst = ch; + dst++; + ptr++; + ch = *ptr; + } else { ptr++; ch = *ptr; - while (ch != '\0' && ch != '"') { + while (ch != '\0' && ch != ']' && ch != '"') { + ptr++; + ch = *ptr; + } + if (ch == '"') { + ptr++; + ch = *ptr; + while (ch != '\0' && ch != '"') { + ptr++; + ch = *ptr; + } + } + while (ch != '\0' && ch != ']') { ptr++; ch = *ptr; } - } - while (ch != '\0' && ch != ']') { ptr++; ch = *ptr; } - ptr++; - ch = *ptr; + count++; } else { *dst = ch; dst++; @@ -1037,16 +1068,16 @@ static void ProcessOneNuc ( } } - if (stp != NULL) { - SqnTagFree (stp); - } - - TrimBracketsFromString (ttl); + TrimBracketsFromString (ttl, stp); if (! StringHasNoText (ttl)) { str = StringSave (ttl); SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); } + if (stp != NULL) { + SqnTagFree (stp); + } + ValNodeFreeData (vnp); } @@ -1179,6 +1210,60 @@ static void ReplaceOnePeptide ( MemFree (str2); } +static void ReplaceOneRNA ( + SimpleSeqPtr ssp, + Boolean conflict +) + +{ + ByteStorePtr bs; + BioseqPtr bsp; + SeqIdPtr sip; + CharPtr str1, str2; + + if (ssp == NULL || ssp->numid < 1) return; + + sip = MakeSeqID (ssp->id [0]); + bsp = BioseqFind (sip); + SeqIdFree (sip); + if (bsp == NULL || bsp->repr != Seq_repr_raw) return; + + /* remove trailing X and * */ + + bs = ssp->seq; + ssp->seqlen = BSLen (bs); + + str1 = BSMerge (ssp->seq, NULL); + str2 = GetSequenceByBsp (bsp); + + if (StringCmp (str1, str2) != 0) { + + /* swap sequence byte stores */ + + bs = bsp->seq_data; + bsp->seq_data = ssp->seq; + ssp->seq = bs; + bsp->length = BSLen (bsp->seq_data); + bsp->seq_data_type = Seq_code_iupacna; + + /* + mrna = SeqMgrGetRNAgivenProduct (bsp, NULL); + if (mrna != NULL) { + + if (conflict) { + mrna->excpt = TRUE; + if (StringHasNoText (mrna->except_text)) { + mrna->except_text = StringSave ("RNA editing"); + } + } + } + */ + } + + MemFree (str1); + MemFree (str2); +} + static Uint2 ProcessOneAsn ( FILE* fp, BioSourcePtr src, @@ -2128,6 +2213,29 @@ static void ProcessOneRecord ( FileClose (fp); } + /* read one or more feature tables from .rna file */ + + fp = OpenOneFile (directory, base, ".rna"); + if (fp != NULL) { + + /* indexing needed to find mRNA from transcript product to set RNA editing exception */ + + SeqMgrIndexFeatures (entityID, NULL); + + while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) { + if (datatype == OBJ_FASTA) { + + ssp = (SimpleSeqPtr) dataptr; + ReplaceOneRNA (ssp, tbl->conflict); + SimpleSeqFree (ssp); + + } else { + ObjMgrFree (datatype, dataptr); + } + } + FileClose (fp); + } + /* read one or more quality score blocks from .qvl file */ fp = OpenOneFile (directory, base, ".qvl"); @@ -2222,7 +2330,11 @@ static void ProcessOneRecord ( if (! tbl->genprodset) { VisitFeaturesInSep (sep, NULL, ClearRnaProducts); } - InstantiateProteinTitles (entityID, NULL); + if (SeqMgrFeaturesAreIndexed (entityID)) { + InstantiateProteinTitles (entityID, NULL); + } else { + Message (MSG_POSTERR, "Unable to instantiate protein titles due to dropped index"); + } if (tbl->genprodset) { /* need to reindex before instantiating mRNA titles */ SeqMgrIndexFeatures (entityID, NULL); |