diff options
author | Aaron M. Ucko <ucko@debian.org> | 2007-09-05 15:33:43 +0000 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2007-09-05 15:33:43 +0000 |
commit | 7647e504b18f91edcedba85e7a6ef772b2a0f48b (patch) | |
tree | 6152f91efe8f30174ce9d51525a458b85335f12b /algo/blast/api/blast_input.c | |
parent | b0629a94e882461a9d6cab18807c5f96501cf38f (diff) |
[svn-upgrade] Integrating new upstream version, ncbi-tools6 (6.1.20070822)
Diffstat (limited to 'algo/blast/api/blast_input.c')
-rw-r--r-- | algo/blast/api/blast_input.c | 517 |
1 files changed, 516 insertions, 1 deletions
diff --git a/algo/blast/api/blast_input.c b/algo/blast/api/blast_input.c index 6c9e50f1..364d1dfc 100644 --- a/algo/blast/api/blast_input.c +++ b/algo/blast/api/blast_input.c @@ -1,5 +1,5 @@ #ifndef SKIP_DOXYGEN_PROCESSING -static char const rcsid[] = "$Id: blast_input.c,v 1.25 2006/05/24 21:17:50 camacho Exp $"; +static char const rcsid[] = "$Id: blast_input.c,v 1.30 2007/05/07 13:27:15 kans Exp $"; #endif /* SKIP_DOXYGEN_PROCESSING */ /* =========================================================================== * @@ -37,6 +37,9 @@ static char const rcsid[] = "$Id: blast_input.c,v 1.25 2006/05/24 21:17:50 camac #include <algo/blast/api/blast_seq.h> #include <algo/blast/core/blast_filter.h> #include <algo/blast/core/blast_setup.h> +#include <algo/blast/core/blast_posit.h> +#include <objscoremat.h> +#include <asn.h> /** @addtogroup CToolkitAlgoBlast * @@ -178,5 +181,517 @@ BLAST_GetQuerySeqLoc(FILE *infp, Boolean query_is_na, Uint1 strand, return total_length; } + + +/** + * Read the query data stored in a PSI-BLAST checkpoint file. + * + * @param *pold_query_length length of the old query, as stored in the + * checkpoint file + * @param old_query a buffer of length at least query_length + * to hold the query data from the checkpoint + * file + * @param query_length the expected query length, and the length + * of the old_query buffer + * @param file the checkpoint file + * @return the number of query characters read, + * <= the value of the query_length parameter. + */ +static int +s_GetOldQueryFromCheckpoint(Int4 * pold_query_length, Uint1 * old_query, + int query_length, FILE * file) +{ + int i; + int num_read; /* number of query characters read from disk */ + + ASSERT(query_length > 0); + if (1 != fread(pold_query_length, sizeof(Int4), 1, file)) { + *pold_query_length = 0; + return 0; + } + /* Read at most query_length bytes, no matter what the value of + * *pold_query_length is */ + num_read = (int)fread(old_query, sizeof(Uint1), query_length, file); + for (i = 0; i < num_read; i++) { + old_query[i] = AMINOACID_TO_NCBISTDAA[old_query[i]]; + } + return num_read; +} + + +/** + * Compare the query to another sequence, typically a query stored in + * a PSI-BLAST checkpoint file. Warn if they don't match. + * @param query one query to be compared + * @param query_length the length of query and old_query + * @param old_query the other query + * @return 0 if the sequences match, -1 otherwise */ +static Int4 +s_ValidateOldQuery(const Uint1 query[], int query_length, + const Uint1 old_query[], int old_query_length) +{ + int query_index; + /* Value for the X ambiguity character */ + enum { eXchar = 21 }; + + if (query_length != old_query_length) { + ErrPostEx(SEV_WARNING, 0, 0, "Invalid usage of checkpoint recovery; " + "old query has length %ld, new query has length %ld", + (long) old_query_length, (long) query_length); + return -1; + } + for (query_index = 0; query_index < query_length; query_index++) { + if (old_query[query_index] != query[query_index]) { + char old_char = NCBISTDAA_TO_AMINOACID[old_query[query_index]]; + char new_char = NCBISTDAA_TO_AMINOACID[query[query_index]]; + + if (old_query[query_index] != eXchar) { + if (query[query_index] == eXchar) { + ErrPostEx(SEV_WARNING, 0, 0, + "\nStored query has a %c at position " + "%d, while new query has a %c there.\n%c " + "appears in query sequence: The query " + "could be filtered. Run with \"-F F\" " + "option to turn the filter off.", + old_char, query_index, new_char, new_char); + } else { + ErrPostEx(SEV_WARNING, 0, 0, + "Stored query has a %c at position %d, " + "while new query has a %c there.", + old_char, query_index, new_char); + } + return -1; + } else { /* old_query[c] == eXchar */ + ErrPostEx(SEV_WARNING, 0, 0, + "Stored query has a %c at position %d, " + "while new query has a %c there\n%c appears " + "in the stored query: The stored query may be " + "filtered. Run blastpgp with \"-F F\" option " + "to turn the filter off.", + old_char, query_index, new_char, old_char); + /* This is only a warning */ + } + } + } + return 0; +} + + +/** + * Read frequency ratios from a PSI-BLAST checkpoint file; read the + * query data for the file before calling this routine. + * + * @param freq_ratios the frequency ratios [out] + * @param query_length the length of the query + * @param file file to read + * + * @return 0 on sucess, nonzero on error */ +static int +s_PosReadFreqRatios(double ** freq_ratios, + int qlength, + FILE * file) +{ + int query_index; /* Query position (column) in the PSSM */ + int stdaa_index; /* Index in the ncbi_stdaa alphabet */ + int trueaa_index; /* Index in the ARND... alphabet of true + amino acids */ + /* conversion from 28 letter NCBIstdaa alphabet to 20 letter order + * for true amino acids: ARNDCQEGHILKMFPSTWYV. */ + static int alphaConvert[BLASTAA_SIZE] = + {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, + 16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)}; + /* Buffer to hold frequency data in ARND... order */ + double trueaa_buffer[PRO_TRUE_ALPHABET_SIZE]; + int num_read; /* Number of items retrieved by a call to read */ + + for (query_index = 0; query_index < qlength; query_index++) { + num_read = (int)fread(trueaa_buffer, sizeof(double), + PRO_TRUE_ALPHABET_SIZE, file); + if (num_read < PRO_TRUE_ALPHABET_SIZE) + return -1; + for (stdaa_index = 0; stdaa_index < BLASTAA_SIZE; stdaa_index++) { + trueaa_index = alphaConvert[stdaa_index]; + if (trueaa_index < 0) { + freq_ratios[query_index][stdaa_index] = 0.0; + } else { + freq_ratios[query_index][stdaa_index] = + trueaa_buffer[trueaa_index]; + } + } + } + return 0; +} + + +/** + * Read frequency ratios from a standard format PSI-BLAST checkpoint file. + * + * @param freq_ratios the frequency ratios + * @param query_length the length of the query, and second dimension of + * freq_ratios + * @param query query sequence data + * @param file an open file to be read + * @param blast_msg a pointer to hold BLAST warnings. + * + * @return 0 on success, nonzero otherwise + */ +static int +s_PosReadStdCheckpointFile(double ** freq_ratios, + int query_length, + const Uint1 * query, + FILE * file, + Blast_Message* *blast_msg) +{ + int chkpt_query_length = 0; /* Length of the query saved in the + checkpoint file */ + int num_read = 0; /* number if query characters actually read from + the checkpoint file */ + int status = 0; /* error status */ + /* Buffer to hold the query from the checkpoint file */ + Uint1 * chkpt_query = NULL; + + chkpt_query = calloc(query_length, sizeof(Uint1)); + if (NULL != chkpt_query) { + num_read = s_GetOldQueryFromCheckpoint(&chkpt_query_length, + chkpt_query, query_length, + file); + } + if (NULL == chkpt_query || num_read < chkpt_query_length) { + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpoint: " + "Failed to reconstruct previous query\n"); + goto error_return; + } + status = s_ValidateOldQuery(query, query_length, + chkpt_query, chkpt_query_length); + free(chkpt_query); + + if (0 != status) + goto error_return; + + status = s_PosReadFreqRatios(freq_ratios, query_length, file); + if (0 != status) + goto error_return; + + return 0; +error_return: + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpoint: " + "Failed to recover data\n"); + return -1; +} + + +/** + * Read frequency ratios from a standard format PSI-BLAST checkpoint file + * of the given name. + * + * @param freq_ratios the frequency ratios + * @param query_length the length of the query, and second dimension of + * freq_ratios + * @param query query sequence data + * @param file the name of the file to be read + * @param blast_msg a pointer to hold BLAST warnings. + * + * @return 0 on success, nonzero otherwise + */ +static int +s_PosReadStdCheckpoint(Nlm_FloatHi ** freq_ratios, + int qlength, + const Uint1 * query, + const char * filename, + Blast_Message* *blast_msg) +{ + FILE * file = fopen(filename, "rb"); + int status, close_status; + + if (!file) { + Blast_MessageWrite(blast_msg, eBlastSevFatal, + kBlastMessageNoContext, + "Blast_PosReadCheckpointFile: " + "Could not open checkpoint file\n"); + return -1; + } + status = s_PosReadStdCheckpointFile(freq_ratios, qlength, query, + file, blast_msg); + close_status = fclose(file); + + return (0 == status) ? close_status : status; +} + + +static int +s_PosReadAsnCheckpoint(double ** freq_ratios, + int query_length, + const Uint1 query[], + char fileName[], + int is_ascii_scoremat) +{ + AsnIoPtr infile = NULL; + PssmWithParametersPtr scoremat = NULL; + PssmPtr pssm = NULL; + PssmIntermediateDataPtr freqs = NULL; + ValNodePtr freq_list; + Bioseq *bsp; + int i, j, c; + enum { eXchar = 21 }; + + if (is_ascii_scoremat) { + infile = AsnIoOpen(fileName, "r"); + } else { + infile = AsnIoOpen(fileName, "rb"); + } + if (infile == NULL) { + ErrPostEx(SEV_WARNING, 0, 0,"Could not open scoremat file\n"); + return -1; + } + + scoremat = PssmWithParametersAsnRead(infile, NULL); + AsnIoClose(infile); + if (scoremat == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Could not read scoremat from input file\n"); + return 1; + } + pssm = scoremat->pssm; + if (pssm == NULL) { + ErrPostEx(SEV_WARNING, 0, 0,"Scoremat is empty\n"); + PssmWithParametersFree(scoremat); + return -1; + } + freqs = pssm->intermediateData; + if (freqs == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Scoremat doesn't contain intermediate data\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (freqs->freqRatios == NULL) { + ErrPostEx(SEV_WARNING, 0, 0, + "Scoremat does not contain frequency ratios\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (pssm->numRows != BLASTAA_SIZE) { + ErrPostEx(SEV_WARNING, 0, 0, "Wrong alphabet size of %d in " + "input scoremat\n", pssm->numRows); + PssmWithParametersFree(scoremat); + return -1; + } + if (!pssm->query || !pssm->query->data.ptrvalue) { + ErrPostEx(SEV_WARNING, 0, 0, + "Missing sequence data in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + bsp = (Bioseq *)(pssm->query->data.ptrvalue); + if (pssm->numColumns != bsp->length) { + ErrPostEx(SEV_WARNING, 0, 0, "Different sequence lengths " + "(%d and %d) in input scoremat\n", pssm->numColumns, + bsp->length); + PssmWithParametersFree(scoremat); + return -1; + } + if (pssm->numColumns != query_length) { + ErrPostEx(SEV_WARNING, 0, 0, "Scoremat sequence length " + "(%d) does not match query length (%d)\n", + pssm->numColumns, query_length); + PssmWithParametersFree(scoremat); + return -1; + } + if (!bsp->seq_data || !ISA_aa(bsp->mol)) { + ErrPostEx(SEV_WARNING, 0, 0, + "Sequence within checkpoint file has no data or is " + "not protein\n"); + PssmWithParametersFree(scoremat); + return -1; + } + + if (bsp->seq_data_type == Seq_code_gap) { + ErrPostEx(SEV_WARNING, 0, 0,"Seq_code_gap passed to s_PosReadAsnCheckpoint\n"); + return -1; + } + + BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET); + + /* Convert sequence data into Seq_code_ncbistdaa */ + if (bsp->seq_data_type != Seq_code_ncbistdaa) { + + ByteStore* new_byte_store = BSConvertSeq((ByteStorePtr) bsp->seq_data, + Seq_code_ncbistdaa, + bsp->seq_data_type, + bsp->length); + if ( !new_byte_store ) { + ErrPostEx(SEV_FATAL, 1, 0, "Failed to convert Bioseq in ASN.1" + " PSSM to Seq_code_ncbistdaa"); + } + bsp->seq_data = (SeqDataPtr) new_byte_store; + bsp->seq_data_type = Seq_code_ncbistdaa; + BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET); + } + + /* verify the input query is the same as the sequence + within the checkpoint file */ + + for (i = 0; i < query_length; i++) { + c = BSGetByte((ByteStorePtr) bsp->seq_data); + if (c == EOF) { + ErrPostEx(SEV_WARNING, 0, 0, "Premature end of sequence data\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (c != query[i]) { + char old_char = NCBISTDAA_TO_AMINOACID[query[i]]; + char new_char = NCBISTDAA_TO_AMINOACID[c]; + if (query[i] == eXchar) { + ErrPostEx(SEV_WARNING, 0, 0, + "Query sequence contains '%c' at position %d; " + "if filtering was used, rerun the search with " + "filtering turned off ('-F F')\n", old_char, i); + } + else { + ErrPostEx(SEV_WARNING, 0, 0, + "Query sequence contains '%c' at position %d, " + "while sequence withing checkpoint file contains " + "'%c' at this position\n", + old_char, i, new_char); + } + PssmWithParametersFree(scoremat); + return -1; + } + } + + /* Read in the frequency ratios, verify they fall + in the correct range, and verify that the linked list + of residue frequencies is exactly as long as it should be */ + + freq_list = freqs->freqRatios; + /* This is bad */ + if (pssm->byRow == FALSE) { + j = 0; + for (i = 0; i < pssm->numColumns; i++) { + for (j = 0; j < pssm->numRows; j++) { + if (freq_list == NULL) + break; + freq_ratios[i][j] = freq_list->data.realvalue; + + if (freq_ratios[i][j] < 0.0) { + ErrPostEx(SEV_WARNING, 0, 0, "position frequency (%d,%d) " + "out of bounds\n", i, j); + PssmWithParametersFree(scoremat); + return -1; + } + freq_list = freq_list->next; + } + if (j < pssm->numRows) + break; + } + } else { + i = 0; + for (j = 0; j < pssm->numRows; j++) { + for (i = 0; i < pssm->numColumns; i++) { + if (freq_list == NULL) + break; + freq_ratios[i][j] = freq_list->data.realvalue; + + if (freq_ratios[i][j] < 0.0) { + ErrPostEx(SEV_WARNING, 0, 0, "position frequency (%d,%d) " + "out of bounds\n", i, j); + PssmWithParametersFree(scoremat); + return -1; + } + freq_list = freq_list->next; + } + if (i < pssm->numColumns) + break; + } + } + if (i < pssm->numColumns || j < pssm->numRows) { + ErrPostEx(SEV_WARNING, 0, 0, "Not enough frequency " + "ratios in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + if (freq_list != NULL) { + ErrPostEx(SEV_WARNING, 0, 0, "Too many frequency " + "ratios in input scoremat\n"); + PssmWithParametersFree(scoremat); + return -1; + } + PssmWithParametersFree(scoremat); + return 0; +} + + +Blast_PsiCheckpointLoc * +Blast_PsiCheckpointLocNew(EPsiCheckpointType checkpoint_type, + char * filename) +{ + Blast_PsiCheckpointLoc * psi_checkpoint = + malloc(sizeof(Blast_PsiCheckpointLoc)); + if (psi_checkpoint) { + size_t length_filename = strlen(filename); + psi_checkpoint->filename = calloc(length_filename + 1, sizeof(char)); + if (!psi_checkpoint->filename) { + free(psi_checkpoint); + psi_checkpoint = NULL; + } + memcpy(psi_checkpoint->filename, filename, length_filename + 1); + psi_checkpoint->checkpoint_type = checkpoint_type; + } + return psi_checkpoint; +} + + +void +Blast_PsiCheckpointLocFree(Blast_PsiCheckpointLoc ** ppsi_checkpoint) +{ + Blast_PsiCheckpointLoc * psi_checkpoint = *ppsi_checkpoint; + if (psi_checkpoint) { + if (psi_checkpoint->filename) { + free(psi_checkpoint->filename); + } + free(psi_checkpoint); + } + *ppsi_checkpoint = NULL; +} + + +int +Blast_PosReadCheckpoint(double ** freq_ratios, + int query_length, + const Uint1 * query, + Blast_PsiCheckpointLoc * psi_checkpoint, + Blast_Message* *blast_msg) +{ + int status; + + switch(psi_checkpoint->checkpoint_type) { + case eStandardCheckpoint: + status = s_PosReadStdCheckpoint(freq_ratios, query_length, + query, + psi_checkpoint->filename, + blast_msg); + break; + case eAsnTextCheckpoint: + case eAsnBinaryCheckpoint: + {{ + int is_ascii_checkpoint = + psi_checkpoint->checkpoint_type == eAsnTextCheckpoint; + status = s_PosReadAsnCheckpoint(freq_ratios, + query_length, query, + psi_checkpoint->filename, + is_ascii_checkpoint); + }} + break; + default: + ASSERT(0 && "Impossible type of checkpoint file"); + status = -1; + break; + } + return status; +} /* @} */ |