diff options
Diffstat (limited to 'demo')
-rw-r--r-- | demo/.BLAST_VERSION | 2 | ||||
-rw-r--r-- | demo/aceread_tst.c | 855 | ||||
-rw-r--r-- | demo/alint.c | 218 | ||||
-rw-r--r-- | demo/asn2all.c | 7 | ||||
-rw-r--r-- | demo/asn2fsa.c | 9 | ||||
-rw-r--r-- | demo/asn2gb.c | 212 | ||||
-rw-r--r-- | demo/asn2idx.c | 4 | ||||
-rw-r--r-- | demo/asnbarval.c | 10 | ||||
-rw-r--r-- | demo/asndisc.c | 216 | ||||
-rwxr-xr-x | demo/asnmacro.c | 4 | ||||
-rw-r--r-- | demo/asnval.c | 176 | ||||
-rw-r--r-- | demo/blastall.c | 9 | ||||
-rw-r--r-- | demo/blastpgp.c | 14 | ||||
-rw-r--r-- | demo/cleanasn.c | 740 | ||||
-rw-r--r-- | demo/copymat.c | 37 | ||||
-rw-r--r-- | demo/cspeedtest.c | 340 | ||||
-rw-r--r-- | demo/entrez2.c | 4 | ||||
-rw-r--r-- | demo/formatrpsdb.c | 53 | ||||
-rw-r--r-- | demo/nps2gps.c | 49 | ||||
-rw-r--r-- | demo/rpsblast.c | 40 | ||||
-rw-r--r-- | demo/scantest.c | 793 | ||||
-rwxr-xr-x | demo/src_chk.c | 398 | ||||
-rw-r--r-- | demo/subfuse.c | 229 | ||||
-rw-r--r-- | demo/sugint.c | 214 | ||||
-rw-r--r-- | demo/taxblast_main.c | 99 | ||||
-rw-r--r-- | demo/tbl2asn.c | 1609 |
26 files changed, 5464 insertions, 877 deletions
diff --git a/demo/.BLAST_VERSION b/demo/.BLAST_VERSION index 8c57128f..ef93bccb 100644 --- a/demo/.BLAST_VERSION +++ b/demo/.BLAST_VERSION @@ -1 +1 @@ -2.2.18 +2.2.19 diff --git a/demo/aceread_tst.c b/demo/aceread_tst.c new file mode 100644 index 00000000..0754f0f6 --- /dev/null +++ b/demo/aceread_tst.c @@ -0,0 +1,855 @@ +/* aceread_tst.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: aceread_tst.c +* +* Author: Colleen Bollin +* +* Version Creation Date: 7/22/08 +* +* $Revision: 1.11 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <objall.h> +#include <objsset.h> +#include <objsub.h> +#include <objfdef.h> +#include <seqport.h> +#include <sequtil.h> +#include <sqnutils.h> +#include <subutil.h> +#include <gather.h> +#include <explore.h> +#include <lsqfetch.h> +#include <valid.h> +#include <pmfapi.h> +#ifdef INTERNAL_NCBI_ASNDISC +#include <accpubseq.h> +#include <tax3api.h> +#endif + +#include "aceread.h" +#include "acerdapi.h" + +typedef enum { + i_argInputFile, + o_argOutputFile, + f_argFASTA, + S_argIDSubstitutionFile, + R_argSRRids, + L_argSuppressIdLookup, + Q_argMakeQualScores, + X_argXMLFile, + t_argTemplateFile, + T_argTSAFields, + C_argCenter, + F_argFormat, + G_argGapString, + V_argValidateAgainstAsn1File, + q_argReadQualScoresFile, + r_argReadFASTAFile, + N_argRecalculateConsensus, + l_argLimitNumContigs +} EArgNum; + +Args myargs [] = { + {"Single Input File", "stdin", NULL, NULL, + TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, + {"Single Output File", NULL, NULL, NULL, + TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, + {"FASTA Output", "F", NULL, NULL, + TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL}, + {"ID Substitution File", "", NULL, NULL, + TRUE, 'S', ARG_FILE_IN, 0.0, 0, NULL}, + {"Replacement IDs are SRR", "F", NULL, NULL, + TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Suppress ID Lookup", "F", NULL, NULL, + TRUE, 'L', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Make Qual Scores", "T", NULL, NULL, + TRUE, 'Q', ARG_BOOLEAN, 0.0, 0, NULL}, + {"XML Output File", "", NULL, NULL, + TRUE, 'X', ARG_FILE_OUT, 0.0, 0, NULL }, + {"Template File", "", NULL, NULL, + TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL }, + {"TSA fields", NULL, NULL, NULL, + TRUE, 'T', ARG_STRING, 0.0, 0, NULL }, + {"Genome Center Tag", NULL, NULL, NULL, + TRUE, 'C', ARG_STRING, 0.0, 0, NULL}, + {"Assembly Format\n\tM MAQ\n\tE Standalone Eland\n\tA ACE", "A", NULL, NULL, + TRUE, 'F', ARG_STRING, 0.0, 0, NULL}, + {"Gap String", NULL, NULL, NULL, + TRUE, 'G', ARG_STRING, 0.0, 0, NULL}, + {"ASN.1 File to validate against", NULL, NULL, NULL, + TRUE, 'V', ARG_FILE_IN, 0.0, 0, NULL}, + {"Quality score file for read sequences", NULL, NULL, NULL, + TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL}, + {"FASTA file for read sequences (to use when trimming read quality scores)", NULL, NULL, NULL, + TRUE, 'r', ARG_FILE_IN, 0.0, 0, NULL}, + {"Recalculate consensus sequence using read data\n\tW Whole Consensus\n\tN Ns Only", "", NULL, NULL, + TRUE, 'N', ARG_STRING, 0.0, 0, NULL}, + {"Limit number of contigs to read", NULL, NULL, NULL, + TRUE, 'l', ARG_INT, 0.0, 0, NULL}, +}; + + +static FILE *OpenAceFile (CharPtr infile) +{ + FILE *f; + Int4 len; +#ifdef OS_UNIX + Char cmmd [256]; + CharPtr gzcatprog; + int ret; + Boolean usedPopen = FALSE; +#endif + + len = StringLen (infile); + if (StringCmp (infile + len - 3, ".gz") == 0) { +#ifdef OS_UNIX + gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY"); + if (gzcatprog != NULL) { + sprintf (cmmd, "%s %s", gzcatprog, infile); + } else { + ret = system ("gzcat -h >/dev/null 2>&1"); + if (ret == 0) { + sprintf (cmmd, "gzcat %s", infile); + } else if (ret == -1) { + Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease"); + return NULL; + } else { + ret = system ("zcat -h >/dev/null 2>&1"); + if (ret == 0) { + sprintf (cmmd, "zcat %s", infile); + } else if (ret == -1) { + Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease"); + return NULL; + } else { + Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable"); + return NULL; + } + } + } + f = popen (cmmd, "r"); + usedPopen = TRUE; +#else + Message (MSG_POSTERR, "Unable to read gzipped files when not running in UNIX"); + return NULL; +#endif + } else { + f = FileOpen (infile, "r"); + } + return f; +} + + +static Boolean ValidateAgainstASNFile (TACEFilePtr ace_file, CharPtr filename, char *has_errors) +{ + Pointer dataptr; + Uint2 datatype; + SeqEntryPtr sep = NULL; + SeqSubmitPtr ssp = NULL; + Boolean chars_stripped = FALSE; + FILE *fp; + Boolean rval = FALSE; + + + fp = FileOpen (filename, "r"); + if (fp == NULL) { + printf ("Unable to open %s\n", filename); + return FALSE; + } + + /* Read in one sequence from the file */ + dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, FALSE, + TRUE, FALSE, &chars_stripped); + FileClose (fp); + if (NULL == dataptr) + { + printf ("Unable to read SeqEntry from %s\n", filename); + return FALSE; + } + + /* Convert the file data to a SeqEntry */ + + if (datatype == OBJ_SEQENTRY) + sep = (SeqEntryPtr) dataptr; + else if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) + sep = SeqMgrGetSeqEntryForData (dataptr); + else if (datatype == OBJ_SEQSUB) + { + ssp = (SeqSubmitPtr) dataptr; + if (ssp != NULL && ssp->datatype == 1) + { + sep = (SeqEntryPtr) ssp->data; + } + } + + rval = ValidateACEFileAgainstSeqEntry (ace_file, sep, has_errors); + + if (ssp != NULL) { + ssp = SeqSubmitFree (ssp); + } else { + sep = SeqEntryFree (sep); + } + return rval; + +} + + +static Boolean StringNHasNoText (CharPtr str, Int4 n) +{ + CharPtr cp; + Int4 i; + if (str == NULL) return TRUE; + cp = str; + i = 0; + while (i < n) { + if (*cp == 0) return TRUE; + if (!isspace (*cp)) return FALSE; + cp++; + i++; + } + return TRUE; +} + + +static Boolean BracketMatchesLabel (CharPtr cp, CharPtr cp_equal, CharPtr label) +{ + Int4 len; + + if (cp == NULL || cp_equal == NULL || label == NULL) return FALSE; + + len = StringLen (label); + if (StringNCmp (cp, label, len) == 0 + && StringNHasNoText (cp + len, cp_equal - cp - len)) { + return TRUE; + } else { + return FALSE; + } +} + + +static CharPtr GetBracketValue (CharPtr cp, CharPtr cp_end) +{ + Int4 len; + CharPtr val = NULL; + + if (cp == NULL || cp_end == NULL || cp_end <= cp) return NULL; + + cp += StringSpn (cp, " \t"); + len = (cp_end - cp) + 1; + val = (CharPtr) MemNew (sizeof (Char) * len); + StringNCpy (val, cp, len - 1); + val [len] = 0; + while (len > 1 && isspace (val [len-1])) { + len--; + val[len] = 0; + } + return val; +} + + +static Boolean +GetTSAFieldsFromString +(CharPtr str, + CharPtr PNTR p_submitter_reference, + CharPtr PNTR p_archive_id, + CharPtr PNTR p_description) +{ + CharPtr cp, cp_next, cp_equal, cp_end; + CharPtr subref = NULL, arch_id = NULL, desc = NULL; + Boolean is_bad = FALSE; + + if (p_submitter_reference != NULL) { + *p_submitter_reference = NULL; + } + if (p_archive_id != NULL) { + *p_archive_id = NULL; + } + if (p_submitter_reference != NULL) { + *p_description = NULL; + } + if (StringHasNoText (str)) { + return TRUE; + } + + cp = StringChr (str, '['); + while (cp != NULL && !is_bad) { + cp++; + cp_next = StringChr (cp + 1, '['); + cp_equal = StringChr (cp, '='); + cp_end = StringChr (cp, ']'); + if (cp_equal == NULL || cp_end == NULL) { + is_bad = TRUE; + } else if (cp_equal > cp_end) { + is_bad = TRUE; + } else if (cp_next != NULL && (cp_equal > cp_next || cp_end > cp_next)) { + is_bad = TRUE; + } else { + cp += StringSpn (cp, " \t"); + if (BracketMatchesLabel (cp, cp_equal, "subref")) { + if (subref == NULL) { + subref = GetBracketValue (cp_equal + 1, cp_end); + } else { + is_bad = TRUE; + } + } else if (BracketMatchesLabel (cp, cp_equal, "archive_id")) { + if (arch_id == NULL) { + arch_id = GetBracketValue (cp_equal + 1, cp_end); + } else { + is_bad = TRUE; + } + } else if (BracketMatchesLabel (cp, cp_equal, "desc")) { + if (desc == NULL) { + desc = GetBracketValue (cp_equal + 1, cp_end); + } else { + is_bad = TRUE; + } + } else { + is_bad = TRUE; + } + } + cp = cp_next; + } + if (p_submitter_reference == NULL) { + subref = MemFree (subref); + } else { + *p_submitter_reference = subref; + } + if (p_archive_id == NULL) { + arch_id = MemFree (arch_id); + } else { + *p_archive_id = arch_id; + } + if (p_description == NULL) { + desc = MemFree (desc); + } else { + *p_description = desc; + } + return TRUE; +} + + +static void PrintTraceGapsXML (TGapInfoPtr gap_info) +{ + Int4 i; + + if (gap_info != NULL) { + printf (" <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps); + if (gap_info->num_gaps > 0) { + printf (" <tracegaps source=\"INLINE\">"); + for (i = 0; i < gap_info->num_gaps - 1; i++) { + printf ("%d,", gap_info->gap_offsets[i]); + } + printf ("%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]); + } + } +} + + +static void TestPosConversions (TGapInfoPtr gap_info) +{ + Int4 i, t_pos, s_pos = 0, r_pos; + Int4 test_len = 0; + + if (gap_info != NULL && gap_info->num_gaps > 0) { + for (i = 0; i < gap_info->num_gaps; i++) { + test_len += gap_info->gap_offsets[i] + 1; + } + for (i = 0; i < test_len; i++) { + s_pos = SeqPosFromTilingPos (i, gap_info); + t_pos = TilingPosFromSeqPos (s_pos, gap_info); + if (t_pos != i) { + printf ("Failed! %d -> SeqPosFromTilingPos -> %d -> TilingPosFromSeqPos -> %d\n", + i, s_pos, t_pos); + } + r_pos = SeqPosFromTilingPos (t_pos, gap_info); + if (r_pos != s_pos) { + printf ("Failed! %d -> TilingPosFromSeqPos -> %d -> SeqPosFromTilingPos -> %d\n", + s_pos, t_pos, r_pos); + } + /* printf ("%d:%d:%d:%d\n", i, s_pos, t_pos, r_pos); */ + } + } +} + + +static void PrintTraceReadXML (TContigReadPtr read) +{ + if (read == NULL) { + printf ("Bad read\n"); + } else { + printf ("<trace>\n"); + printf (" <trace_name>%s</trace_name>\n", read->read_id == NULL ? "" : read->read_id); + PrintTraceGapsXML (read->gaps); + printf (" <nbasecalls>%d</nbasecalls>\n", StringLen (read->read_seq)); + printf (" <valid>\n"); + printf (" <start>%d</start>\n", read->read_assem_start + 1); + printf (" <stop>%d</stop>\n", read->read_assem_stop + 1); + printf (" </valid>\n"); + printf (" <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD"); + printf (" <start>%d</start>\n", read->cons_start + 1); + printf (" <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1); + printf (" </tiling>\n"); + printf (" <consensus>\n"); + printf (" <start>%d</start>\n", read->cons_start + 1); + printf (" <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1); + printf (" </consensus>\n"); + printf ("<trace>\n"); + } +} + + + +static void TestGapInfoReading (CharPtr gap_string) +{ + TGapInfoPtr gap_info; + ValNodePtr list, vnp; + + if (!StringHasNoText (gap_string)) { + gap_info = GapInfoFromSequenceString(gap_string, "*"); + if (gap_info == NULL) { + printf ("error reading"); + } else { + PrintTraceGapsXML (gap_info); + TestPosConversions (gap_info); + list = GetTransitionsFromGapInfo (gap_info, 0, 0, 40); + for (vnp = list; vnp != NULL; vnp = vnp->next) { + printf ("%d\n", vnp->data.intvalue); + } + } + GapInfoFree (gap_info); + } +} + + +static void AddAlignmentToSeqEntry (DenseSegPtr dsp, SeqEntryPtr sep) +{ + SeqAnnotPtr sap; + SeqAlignPtr salp; + BioseqPtr bsp; + BioseqSetPtr bssp; + + if (dsp == NULL || sep == NULL) return; + + sap = SeqAnnotNew (); + sap->type = 2; + + salp = SeqAlignNew (); + salp->type = 3; + salp->segtype = 2; + salp->segs = (Pointer) dsp; + salp->dim = dsp->dim; + sap->data = (Pointer) salp; + + if (IS_Bioseq (sep)) { + bsp = (BioseqPtr) sep->data.ptrvalue; + sap->next = bsp->annot; + bsp->annot = sap; + } else if (IS_Bioseq_set (sep)) { + bssp = (BioseqSetPtr) sep->data.ptrvalue; + sap->next = bssp->annot; + bssp->annot = sap; + } +} + + +static void AddDescrToNucBioseqCallback (BioseqPtr bsp, Pointer data) +{ + SeqDescrPtr sdp, sdp_copy; + + if (bsp == NULL || !ISA_na (bsp->mol) || data == NULL) { + return; + } + sdp = (SeqDescrPtr) data; + sdp_copy = (SeqDescrPtr) AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescrAsnRead, (AsnWriteFunc) SeqDescrAsnWrite); + sdp_copy->next = bsp->descr; + bsp->descr = sdp_copy; +} + + +static SeqSubmitPtr AddSeqSubmitFromTemplate (SeqEntryPtr sep, CharPtr filename) +{ + SeqSubmitPtr ssp = NULL; + SubmitBlockPtr sbp; + CitSubPtr csp; + FILE *fp = NULL; + Pointer dataptr; + Uint2 datatype; + + if (StringHasNoText (filename)) { + return NULL; + } + + fp = FileOpen (filename, "r"); + if (fp == NULL) { + printf ("Unable to read template file %s\n", filename); + return NULL; + } + + while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) { + if (datatype == OBJ_SEQSUB) { + ssp = (SeqSubmitPtr) dataptr; + ssp->datatype = 1; + ssp->data = sep; + } else if (datatype == OBJ_SUBMIT_BLOCK) { + sbp = (SubmitBlockPtr) dataptr; + ssp = SeqSubmitNew (); + ssp->datatype = 1; + ssp->data = sep; + ssp->sub = sbp; + } else if (datatype == OBJ_SEQDESC) { + VisitBioseqsInSep (sep, dataptr, AddDescrToNucBioseqCallback); + ObjMgrFree (datatype, dataptr); + } else { + ObjMgrFree (datatype, dataptr); + } + } + FileClose (fp); + if (ssp == NULL) { + ssp = SeqSubmitNew (); + ssp->datatype = 1; + ssp->data = sep; + } + + if (ssp->sub == NULL) { + ssp->sub = SubmitBlockNew (); + } + + ssp->sub->tool = MemFree (ssp->sub->tool); + ssp->sub->tool = StringSave ("aceread"); + ssp->sub->hup = FALSE; + ssp->sub->reldate = DateFree (ssp->sub->reldate); + csp = ssp->sub->cit; + if (csp != NULL) { + csp->date = DateFree (csp->date); + csp->date = DateCurr (); + } + return ssp; +} + + +static Boolean AddReadQualityScores (TACEFilePtr afp, CharPtr qs_filename, CharPtr rd_filename) +{ + ReadBufferData q, r; + Boolean use_fasta = FALSE; + Boolean rval = FALSE; + + if (afp == NULL || StringHasNoText (qs_filename)) { + return TRUE; + } + + q.current_data = NULL; + r.current_data = NULL; + + q.fp = FileOpen (qs_filename, "r"); + if (q.fp == NULL) { + printf ("Unable to read quality score file\n"); + return FALSE; + } + + if (!StringHasNoText (rd_filename)) { + r.fp = FileOpen (rd_filename, "r"); + if (r.fp == NULL) { + printf ("Unable to open read FASTA file\n"); + FileClose (q.fp); + return FALSE; + } + use_fasta = TRUE; + } + + if (AddReadQualScores (afp, AbstractReadFunction, &q, use_fasta ? AbstractReadFunction : NULL, &r) > 0) { + rval = TRUE; + } + + FileClose (q.fp); + if (use_fasta) { + FileClose (r.fp); + } + return rval; +} + + +Int2 Main (void) + +{ + CharPtr infile, outfile, xmlfile; + + ReadBufferData rbd; + TACEFilePtr afp; + Int4 i, len; + SeqEntryPtr sep; + AsnIoPtr aip; + FILE *f = NULL; + FILE *f2; + CharPtr app = "aceread_tst"; + BioseqSetPtr bssp; + SeqEntryPtr last_sep = NULL; + Uint2 entityID; + Boolean make_qual_scores, suppress_lookup, srr_ids, fasta_out; + CharPtr submitter_ref = NULL, archive_id = NULL, description = NULL; + CharPtr center_name = NULL; + CharPtr format = NULL; + CharPtr gap_string; + CharPtr asn_file = NULL; + Int4 limit = 0; + char has_errors = 0; + Boolean recalculate_consensus = FALSE, recalculate_only_Ns = FALSE; + CharPtr recalculate_options; + SeqSubmitPtr ssp; + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrSetMessageLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + ErrSetLogfile ("stderr", ELOG_APPEND); + ErrSetOpts (ERR_IGNORE, ERR_LOG_ON); + + UseLocalAsnloadDataAndErrMsg (); + ErrPathReset (); + + if (! AllObjLoad ()) { + Message (MSG_FATAL, "AllObjLoad failed"); + return 1; + } + if (! SubmitAsnLoad ()) { + Message (MSG_FATAL, "SubmitAsnLoad failed"); + return 1; + } + if (! FeatDefSetLoad ()) { + Message (MSG_FATAL, "FeatDefSetLoad failed"); + return 1; + } + PubSeqFetchEnable (); + + if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + recalculate_options = (CharPtr) myargs[N_argRecalculateConsensus].strvalue; + if (!StringHasNoText (recalculate_options)) { + if (StringCmp (recalculate_options, "W") == 0) { + recalculate_consensus = TRUE; + recalculate_only_Ns = FALSE; + } else if (StringCmp (recalculate_options, "N") == 0) { + recalculate_consensus = TRUE; + recalculate_only_Ns = TRUE; + } else { + Message (MSG_FATAL, "Invalid consensus sequence recalculation option"); + return 1; + } + } + + + /* test gap info reading if provided */ + gap_string = (CharPtr) myargs[G_argGapString].strvalue; + TestGapInfoReading (gap_string); + + /* limit number of contigs? for debugging purposes */ + limit = myargs[l_argLimitNumContigs].intvalue; + + /* select format of input file */ + format = (CharPtr) myargs[F_argFormat].strvalue; + if (StringHasNoText (format)) { + format = "A"; + } + + infile = (CharPtr) myargs [i_argInputFile].strvalue; + if (StringHasNoText (infile)) { + Message (MSG_FATAL, "Must supply input file!"); + return 1; + } + outfile = (CharPtr) myargs [o_argOutputFile].strvalue; + xmlfile = (CharPtr) myargs[X_argXMLFile].strvalue; + make_qual_scores = (Boolean) myargs [Q_argMakeQualScores].intvalue; + center_name = (CharPtr) myargs[C_argCenter].strvalue; + suppress_lookup = (Boolean) myargs [L_argSuppressIdLookup].intvalue; + srr_ids = (Boolean) myargs[R_argSRRids].intvalue; + fasta_out = (Boolean) myargs[f_argFASTA].intvalue; + + /* ASN.1 file to validate against */ + asn_file = (CharPtr) myargs [V_argValidateAgainstAsn1File].strvalue; + + if (!GetTSAFieldsFromString ((CharPtr) myargs [T_argTSAFields].strvalue, + &submitter_ref, + &archive_id, + &description)) { + Message (MSG_FATAL, "Error reading TSA fields"); + return 1; + } + + len = StringLen (infile); + if (StringHasNoText (outfile)) { + if (len > 3 && StringCmp (infile + len - 4, ".ace") == 0) { + outfile = StringSave (infile); + StringCpy (outfile + len - 3, "sqn"); + } else if (len > 6 && StringCmp (infile + len - 7, ".ace.gz") == 0) { + outfile = StringSave (infile); + StringCpy (outfile + len - 6, "sqn"); + } else { + outfile = (CharPtr) MemNew (sizeof (Char) * (len + 5)); + sprintf (outfile, "%s.sqn", infile); + } + } + + if (!StringHasNoText ((CharPtr) myargs [S_argIDSubstitutionFile].strvalue)) { + f = FileOpen (myargs [S_argIDSubstitutionFile].strvalue, "r"); + if (f == NULL) { + Message (MSG_FATAL, "Unable to open %s", myargs [S_argIDSubstitutionFile].strvalue); + return 1; + } + } + + if (StringChr (format, 'M') != NULL) { + rbd.fp = FileOpen (infile, "r"); + if (rbd.fp == NULL) { + Message (MSG_FATAL, "Unable to open %s", infile); + return 1; + } + + rbd.current_data = NULL; + afp = ReadMAQFile (AbstractReadFunction, &rbd); + } else if (StringChr (format, 'E') != NULL) { + rbd.fp = FileOpen (infile, "r"); + if (rbd.fp == NULL) { + Message (MSG_FATAL, "Unable to open %s", infile); + return 1; + } + + rbd.current_data = NULL; + afp = ReadElandStandaloneFile (AbstractReadFunction, &rbd); + } else if (StringChr (format, 'A') != NULL) { + rbd.fp = OpenAceFile (infile); + if (rbd.fp == NULL) { + Message (MSG_FATAL, "Unable to open %s", infile); + return 1; + } + rbd.current_data = NULL; + afp = ReadACEFile ( AbstractReadFunction, &rbd, make_qual_scores, &has_errors); + } else { + Message (MSG_FATAL, "Unrecognized format: %s\n", format); + return 1; + } + FileClose (rbd.fp); + if (afp == NULL) { + printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Unable to read file</message>\n"); + } else { + if (recalculate_consensus) { + if (!AddReadQualityScores (afp, (CharPtr) myargs [q_argReadQualScoresFile].strvalue, (CharPtr) myargs [r_argReadFASTAFile].strvalue)) { + printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Failed to add read quality scores</message>\n"); + } else { + RecalculateConsensusSequences (afp, recalculate_only_Ns); + } + } + + if (limit > 0) { + for (i = limit; i < afp->num_contigs; i++) { + ContigFree (afp->contigs[i]); + afp->contigs[i] = NULL; + } + afp->num_contigs = limit; + } + + if (f != NULL) { + UpdateAceFileIds (afp, f, suppress_lookup, srr_ids, &has_errors); + FileClose (f); + f = NULL; + } + ValidateAceFileIds (afp, &has_errors); + + if (asn_file != NULL) { + if (ValidateAgainstASNFile (afp, asn_file, &has_errors)) { + printf ("Validation against %s succeeded\n", asn_file); + } + } + + if (!StringHasNoText (xmlfile)) { + f2 = FileOpen (xmlfile, "w"); + WriteTraceAssemblyFromAceFile (afp, submitter_ref, center_name, 0, description, f2); + FileClose (f2); + } + + if (fasta_out) { + f2 = FileOpen (outfile, "w"); + WriteFASTAFromAceFile (afp, f2); + FileClose (f2); + } else { + aip = AsnIoOpen (outfile, "w"); + if (aip == NULL) { + printf ("Unable to open %s\n", outfile); + } else { + bssp = BioseqSetNew (); + bssp->_class = BioseqseqSet_class_genbank; + + for (i = 0; i < afp->num_contigs; i++) { + sep = MakeSeqEntryFromContig (afp->contigs[i]); + if (last_sep == NULL) { + bssp->seq_set = sep; + } else { + last_sep->next = sep; + } + last_sep = sep; + } + sep = ValNodeNew (NULL); + sep->choice = 2; + sep->data.ptrvalue = bssp; + bssp->seqentry = sep; + SeqMgrLinkSeqEntry (sep, 0, NULL); + entityID = ObjMgrGetEntityIDForChoice (sep); + AssignIDsInEntityEx (entityID, 0, NULL, NULL); + SeqMgrIndexFeatures (entityID, sep); + ssp = AddSeqSubmitFromTemplate (sep, (CharPtr) myargs[t_argTemplateFile].strvalue); + if (ssp == NULL) { + SeqEntryAsnWrite (sep, aip, NULL); + sep = SeqEntryFree (sep); + } else { + SeqSubmitAsnWrite (ssp, aip, NULL); + ssp = SeqSubmitFree (ssp); + } + AsnIoClose (aip); + } + } + } + + if (has_errors) { + printf ("</aceread>\n"); + } + + return 0; + +} + diff --git a/demo/alint.c b/demo/alint.c new file mode 100644 index 00000000..24d752e3 --- /dev/null +++ b/demo/alint.c @@ -0,0 +1,218 @@ +/* alint.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: alint.c +* +* Author: Jonathan Kans +* +* Version Creation Date: 11/10/08 +* +* $Revision: 1.1 $ +* +* File Description: +* +* Lint for Alignments in FASTA format - upper cases points of exact match +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <sqnutils.h> + +static CharPtr GetSequence ( + CharPtr str, + Boolean skiptoken +) + +{ + Char ch; + + if (str == NULL) return NULL; + + if (! skiptoken) return str; + + ch = *str; + while (ch != '\0' && ch != ' ') { + str++; + ch = *str; + } + if (ch == ' ') { + str++; + } + + return str; +} + +static void ProcessAlignedFASTA ( + FILE *ifp, + FILE *ofp, + Boolean skiptoken +) + +{ + CharPtr PNTR array; + Char ch, ch0; + FileCache fc; + ValNodePtr head = NULL, last = NULL, vnp; + Int2 i, j, num = 0, len, minlen = INT2_MAX, matches = 0, mismatches = 0; + Char line [4096]; + Boolean match; + CharPtr ptr, str; + + FileCacheSetup (&fc, ifp); + + str = FileCacheReadLine (&fc, line, sizeof (line), NULL); + if (str == NULL) return; + + while (str != NULL) { + TrimSpacesAroundString (str); + if (StringDoesHaveText (str)) { + vnp = ValNodeCopyStr (&last, 0, str); + if (head == NULL) { + head = vnp; + } + last = vnp; + num++; + str = GetSequence (str, skiptoken); + len = (Int2) StringLen (str); + if (minlen > len) { + minlen = len; + } + } + str = FileCacheReadLine (&fc, line, sizeof (line), NULL); + } + + if (num < 1 || minlen < 1) return; + + array = (CharPtr PNTR) MemNew (sizeof (CharPtr) * (num + 1)); + if (array == NULL) return; + + for (vnp = head, i = 0; vnp != NULL; vnp = vnp->next, i++) { + str = (CharPtr) vnp->data.ptrvalue; + array [i] = str; + } + + for (j = 0; j < minlen; j++) { + ptr = GetSequence (array [0], skiptoken); + ch0 = ptr [j]; + match = TRUE; + + for (i = 1; i < num; i++) { + ptr = GetSequence (array [i], skiptoken); + ch = ptr [j]; + if (ch != ch0) { + match = FALSE; + } + } + + if (match) { + matches++; + } else { + mismatches++; + } + + for (i = 0; i < num; i++) { + ptr = GetSequence (array [i], skiptoken); + ch = ptr [j]; + if (match) { + ptr [j] = TO_UPPER (ch); + } else { + ptr [j] = TO_LOWER (ch); + } + } + } + + for (vnp = head, i = 0; vnp != NULL; vnp = vnp->next, i++) { + str = (CharPtr) vnp->data.ptrvalue; + fprintf (ofp, "%s\n", str); + } + + fprintf (ofp, "\n%d matches, %d mismatches, length %d, %d percent matching\n", + (int) matches, (int) mismatches, (int) minlen, + (int) (matches * 100 / minlen)); + + MemFree (array); + ValNodeFreeData (head); +} + +#define i_argInputFile 0 +#define o_argOutputFile 1 +#define s_argSkipToken 2 + +Args myargs [] = { + {"Input File", "stdin", NULL, NULL, + FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, + {"Output File", "stdout", NULL, NULL, + FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, + {"Skip First Token", "F", NULL, NULL, + TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL}, +}; + +Int2 Main (void) + +{ + FILE *ifp, *ofp; + CharPtr infile, outfile; + Boolean skiptoken; + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + ErrPathReset (); + + if (! GetArgs ("alint", sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + infile = (CharPtr) myargs [i_argInputFile].strvalue; + outfile = (CharPtr) myargs [o_argOutputFile].strvalue; + skiptoken = (Boolean) myargs [s_argSkipToken].intvalue; + + ifp = FileOpen (infile, "r"); + if (ifp == NULL) { + Message (MSG_FATAL, "Unable to open input file"); + return 1; + } + + ofp = FileOpen (outfile, "w"); + if (ofp == NULL) { + Message (MSG_FATAL, "Unable to open output file"); + return 1; + } + + ProcessAlignedFASTA (ifp, ofp, skiptoken); + + FileClose (ofp); + FileClose (ifp); + + return 0; +} + diff --git a/demo/asn2all.c b/demo/asn2all.c index 9af408e9..5e8c092c 100644 --- a/demo/asn2all.c +++ b/demo/asn2all.c @@ -29,7 +29,7 @@ * * Version Creation Date: 7/26/04 * -* $Revision: 1.37 $ +* $Revision: 1.45 $ * * File Description: * @@ -53,7 +53,7 @@ #include <pmfapi.h> #include <lsqfetch.h> -#define ASN2ALL_APP_VER "2.4" +#define ASN2ALL_APP_VER "3.2" CharPtr ASN2ALL_APPLICATION = ASN2ALL_APP_VER; @@ -342,7 +342,6 @@ static void ProcessSingleRecord ( { AsnIoPtr aip; - AsnTypePtr atp = NULL; BioseqPtr bsp; ValNodePtr bsplist; BioseqSetPtr bssp; @@ -363,7 +362,7 @@ static void ProcessSingleRecord ( return; } - dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE); + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE); FileClose (fp); diff --git a/demo/asn2fsa.c b/demo/asn2fsa.c index e48aa210..ffb60c20 100644 --- a/demo/asn2fsa.c +++ b/demo/asn2fsa.c @@ -29,7 +29,7 @@ * * Version Creation Date: 3/4/04 * -* $Revision: 1.40 $ +* $Revision: 1.46 $ * * File Description: * @@ -61,7 +61,7 @@ #include <accpubseq.h> #endif -#define ASN2FSA_APP_VER "2.2" +#define ASN2FSA_APP_VER "2.7" CharPtr ASN2FSA_APPLICATION = ASN2FSA_APP_VER; @@ -578,7 +578,7 @@ static void ProcessSingleRecord ( return; } - dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE); + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE); FileClose (fp); @@ -715,7 +715,7 @@ static void ProcessMultipleRecord ( AsnTypePtr atp, atp_bss, atp_desc, atp_se; BioseqPtr bsp; ValNodePtr bsplist; - Char buf [64], cmmd [256], file [FILENAME_MAX], path [PATH_MAX], longest [64]; + Char buf [64], file [FILENAME_MAX], path [PATH_MAX], longest [64]; StreamFlgType flags = STREAM_CORRECT_INVAL; FILE *fp; Int4 numrecords = 0; @@ -723,6 +723,7 @@ static void ProcessMultipleRecord ( ObjMgrPtr omp; time_t starttime, stoptime, worsttime; #ifdef OS_UNIX + Char cmmd [256]; CharPtr gzcatprog; int ret; Boolean usedPopen = FALSE; diff --git a/demo/asn2gb.c b/demo/asn2gb.c index ffb209b8..fd5cd000 100644 --- a/demo/asn2gb.c +++ b/demo/asn2gb.c @@ -29,7 +29,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 6.103 $ +* $Revision: 6.117 $ * * File Description: New GenBank flatfile generator application * @@ -48,9 +48,13 @@ #include <sequtil.h> #include <sqnutils.h> #include <explore.h> +#include <toasn3.h> #include <asn2gnbp.h> -#define ASN2GB_APP_VER "4.4" +/* asn2gnbi.h needed to test PUBSEQGetAccnVer in accpubseq.c */ +#include <asn2gnbi.h> + +#define ASN2GB_APP_VER "5.5" CharPtr ASN2GB_APPLICATION = ASN2GB_APP_VER; @@ -190,7 +194,7 @@ static Int2 HandleSingleRecord ( return 1; } - dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE); + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE); FileClose (fp); @@ -529,6 +533,52 @@ static CharPtr ffvew [] = { NULL }; +static void ReportDiffs ( + CharPtr path1, + CharPtr path2, + CharPtr path3, + FILE* fp, + CharPtr ffdiff, + Boolean useFfdiff +) + +{ + Char buf [256]; + Char cmmd [256]; + size_t ct; + FILE *fpo; + + if (useFfdiff) { + sprintf (cmmd, "%s -o %s -n %s -d reports", ffdiff, path1, path2); + system (cmmd); + + sprintf (cmmd, "rm %s; rm %s", path1, path2); + system (cmmd); + } else { + sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path1, path1, path1); + system (cmmd); + + sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path2, path2, path2); + system (cmmd); + + sprintf (cmmd, "diff %s.suc %s.suc > %s", path1, path2, path3); + system (cmmd); + + sprintf (cmmd, "cat %s", path3); + fpo = popen (cmmd, "r"); + if (fpo != NULL) { + while ((ct = fread (buf, 1, sizeof (buf), fpo)) > 0) { + fwrite (buf, 1, ct, fp); + fflush (fp); + } + pclose (fpo); + } + + sprintf (cmmd, "rm %s.suc; rm %s.suc", path1, path2); + system (cmmd); + } +} + static void CompareFlatFiles ( CharPtr path1, CharPtr path2, @@ -580,39 +630,21 @@ static void CompareFlatFiles ( SaveAsn2gnbk (sep, path1, format, SEQUIN_MODE, style, flags, locks, custom); SaveAsn2gnbk (sep, path2, format, RELEASE_MODE, style, flags, locks, custom); - if (useFfdiff) { - sprintf (cmmd, "%s -o %s -n %s -d reports", ffdiff, path1, path2); - system (cmmd); - - sprintf (cmmd, "rm %s; rm %s", path1, path2); - system (cmmd); - } else { - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path1, path1, path1); - system (cmmd); - - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path2, path2, path2); - system (cmmd); + ReportDiffs (path1, path2, path3, fp, ffdiff, useFfdiff); - sprintf (cmmd, "diff %s.suc %s.suc > %s", path1, path2, path3); - system (cmmd); + } else if (batch == 3) { - sprintf (cmmd, "cat %s", path3); - fpo = popen (cmmd, "r"); - if (fpo != NULL) { - while ((ct = fread (buf, 1, sizeof (buf), fpo)) > 0) { - fwrite (buf, 1, ct, fp); - fflush (fp); - } - pclose (fpo); - } +#ifdef ASN2GNBK_SUPPRESS_UNPUB_AFFIL + VisitPubdescsInSep (sep, NULL, FreeUnpubAffil); +#endif - sprintf (cmmd, "rm %s.suc; rm %s.suc", path1, path2); - system (cmmd); - } + SaveAsn2gnbk (sep, path1, format, mode, style, flags, locks, custom); + SeriousSeqEntryCleanupBulk (sep); + SaveAsn2gnbk (sep, path2, format, mode, style, flags, locks, custom); - } else if (batch == 3) { + ReportDiffs (path1, path2, path3, fp, ffdiff, useFfdiff); - SaveAsn2gnbk (sep, path1, format, mode, style, flags, locks, custom); + } else if (batch == 4) { aip = AsnIoOpen (path3, "w"); if (aip == NULL) return; @@ -627,43 +659,24 @@ static void CompareFlatFiles ( SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf)); arguments [0] = '\0'; - sprintf (arguments, "-format %s -mode %s -style %s -view %s", + sprintf (arguments, "-format %s -mode %s -style %s -view %s -nocleanup", fffmt [(int) format], ffmod [(int) mode], ffstl [(int) style], ffvew [(int) format]); - sprintf (cmmd, "%s %s -i %s -o %s", asn2flat, arguments, path3, path2); + sprintf (cmmd, "%s %s -i %s -o %s", asn2flat, arguments, path3, path1); system (cmmd); - if (useFfdiff) { - sprintf (cmmd, "%s -o %s -n %s -d reports", ffdiff, path1, path2); - system (cmmd); - - sprintf (cmmd, "rm %s; rm %s", path1, path2); - system (cmmd); - } else { - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path1, path1, path1); - system (cmmd); - - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path2, path2, path2); - system (cmmd); + arguments [0] = '\0'; + sprintf (arguments, "-format %s -mode %s -style %s -view %s", + fffmt [(int) format], ffmod [(int) mode], ffstl [(int) style], ffvew [(int) format]); - sprintf (cmmd, "diff %s.suc %s.suc > %s", path1, path2, path3); - system (cmmd); + sprintf (cmmd, "%s %s -i %s -o %s", asn2flat, arguments, path3, path2); + system (cmmd); - sprintf (cmmd, "cat %s", path3); - fpo = popen (cmmd, "r"); - if (fpo != NULL) { - while ((ct = fread (buf, 1, sizeof (buf), fpo)) > 0) { - fwrite (buf, 1, ct, fp); - fflush (fp); - } - pclose (fpo); - } + ReportDiffs (path1, path2, path3, fp, ffdiff, useFfdiff); - sprintf (cmmd, "rm %s.suc; rm %s.suc", path1, path2); - system (cmmd); - } + } else if (batch == 5) { - } else if (batch == 4) { + SaveAsn2gnbk (sep, path1, format, mode, style, flags, locks, custom); aip = AsnIoOpen (path3, "w"); if (aip == NULL) return; @@ -678,50 +691,15 @@ static void CompareFlatFiles ( SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf)); arguments [0] = '\0'; - sprintf (arguments, "-format %s -mode %s -style %s -view %s -nocleanup", - fffmt [(int) format], ffmod [(int) mode], ffstl [(int) style], ffvew [(int) format]); - - sprintf (cmmd, "%s %s -i %s -o %s", asn2flat, arguments, path3, path1); - system (cmmd); - - arguments [0] = '\0'; sprintf (arguments, "-format %s -mode %s -style %s -view %s", fffmt [(int) format], ffmod [(int) mode], ffstl [(int) style], ffvew [(int) format]); sprintf (cmmd, "%s %s -i %s -o %s", asn2flat, arguments, path3, path2); system (cmmd); - if (useFfdiff) { - sprintf (cmmd, "%s -o %s -n %s -d reports", ffdiff, path1, path2); - system (cmmd); - - sprintf (cmmd, "rm %s; rm %s", path1, path2); - system (cmmd); - } else { - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path1, path1, path1); - system (cmmd); - - sprintf (cmmd, "sort %s | uniq -c > %s.suc; rm %s", path2, path2, path2); - system (cmmd); + ReportDiffs (path1, path2, path3, fp, ffdiff, useFfdiff); - sprintf (cmmd, "diff %s.suc %s.suc > %s", path1, path2, path3); - system (cmmd); - - sprintf (cmmd, "cat %s", path3); - fpo = popen (cmmd, "r"); - if (fpo != NULL) { - while ((ct = fread (buf, 1, sizeof (buf), fpo)) > 0) { - fwrite (buf, 1, ct, fp); - fflush (fp); - } - pclose (fpo); - } - - sprintf (cmmd, "rm %s.suc; rm %s.suc", path1, path2); - system (cmmd); - } - - } else if (batch == 5) { + } else if (batch == 6) { aip = AsnIoOpen (path3, "w"); if (aip == NULL) return; @@ -974,7 +952,7 @@ static Int2 HandleMultipleRecords ( return 1; } - if ((batch == 1 || batch == 3 || batch == 4 || batch == 5 || format != GENBANK_FMT) && extra == NULL) { + if ((batch == 1 || batch == 4 || batch == 5 || format != GENBANK_FMT) && extra == NULL) { ofp = FileOpen (outputFile, "w"); if (ofp == NULL) { AsnIoClose (aip); @@ -1064,7 +1042,7 @@ static Int2 HandleMultipleRecords ( if (batch != 1) { printf ("%s\n", buf); fflush (stdout); - if (batch != 3 && batch != 4 && batch != 5) { + if (batch != 4 && batch != 5) { if (ofp != NULL) { fprintf (ofp, "%s\n", buf); fflush (ofp); @@ -1282,6 +1260,24 @@ static SeqEntryPtr SeqEntryFromAccnOrGi ( TrimSpacesAroundString (accn); +#ifdef INTERNAL_NCBI_ASN2GB + /* temporary code to test PUBSEQGetAccnVer in accpubseq.c */ + + if (*accn == '*') { + Char buf [64]; + accn++; + if (sscanf (accn, "%ld", &val) == 1) { + uid = (Int4) val; + if (GetAccnVerFromServer (uid, buf)) { + Message (MSG_POST, "GetAccnVerFromServer returned %s", buf); + } else { + Message (MSG_POST, "GetAccnVerFromServer failed"); + } + } + return NULL; + } +#endif + alldigits = TRUE; ptr = accn; ch = *ptr; @@ -1368,9 +1364,17 @@ Args myargs [] = { FALSE, 'h', ARG_INT, 0.0, 0, NULL}, {"Custom Flags (4 HideFeats, 1792 HideRefs, 8192 HideSources, 262144 HideTranslation)", "0", NULL, NULL, FALSE, 'u', ARG_INT, 0.0, 0, NULL}, - {"ASN.1 Type (a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit)", "a", NULL, NULL, + {"ASN.1 Type\n" + " Single Record: a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit\n" + " Release File: t Batch Bioseq-set, u Batch Seq-submit\n", "a", NULL, NULL, TRUE, 'a', ARG_STRING, 0.0, 0, NULL}, - {"Batch (1 Report, 2 Sequin/Release, 3 asn2gb/asn2flat, 4 asn2flat BSEC/nocleanup, 5 oldasn2gb/newasn2gb)", "0", "0", "5", + {"Batch\n" + " 1 Report\n" + " 2 Sequin/Release\n" + " 3 asn2gb SSEC/nocleanup\n" + " 4 asn2flat BSEC/nocleanup\n" + " 5 asn2gb/asn2flat\n" + " 6 oldasn2gb/newasn2gb)", "0", "0", "5", FALSE, 't', ARG_INT, 0.0, 0, NULL}, {"Input File is Binary", "F", NULL, NULL, TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL}, @@ -1534,6 +1538,12 @@ Int2 Main ( do_gbseq = TRUE; format = GENPEPT_FMT; + } else if (StringICmp (str, "xz") == 0 || StringICmp (str, "zx") == 0) { + do_gbseq = TRUE; + do_insdseq = TRUE; + format = GENBANK_FMT; + altformat = GENPEPT_FMT; + } else if (StringICmp (str, "x") == 0) { do_gbseq = TRUE; do_insdseq = TRUE; diff --git a/demo/asn2idx.c b/demo/asn2idx.c index cee0b4db..e47842c9 100644 --- a/demo/asn2idx.c +++ b/demo/asn2idx.c @@ -29,7 +29,7 @@ * * Version Creation Date: 8/2/04 * -* $Revision: 1.4 $ +* $Revision: 1.5 $ * * File Description: * @@ -181,7 +181,7 @@ Args myargs [] = { TRUE, 'd', ARG_STRING, 0.0, 0, NULL}, {"File Selection Substring", ".aso", NULL, NULL, TRUE, 'x', ARG_STRING, 0.0, 0, NULL}, - {"Filter", "gbcon,gbest,gbgss,gbsts", NULL, NULL, + {"Filter", "gbcon,gbest,gbgss,gbhtg,gbsts", NULL, NULL, FALSE, 'f', ARG_STRING, 0.0, 0, NULL}, {"Bioseq-sets are Binary", "F", NULL, NULL, TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL}, diff --git a/demo/asnbarval.c b/demo/asnbarval.c index 7e925556..8c940404 100644 --- a/demo/asnbarval.c +++ b/demo/asnbarval.c @@ -29,7 +29,7 @@ * * Version Creation Date: 1/23/07 * -* $Revision: 1.3 $ +* $Revision: 1.5 $ * * File Description: * @@ -81,7 +81,6 @@ typedef struct brflags { Int4 numrecords; ValNodePtr sep_list; ValNodePtr bsplist; - ValNodePtr filename_list; BarcodeTestConfigData bcd; } BRFlagData, PNTR BRFlagPtr; @@ -568,8 +567,6 @@ static void ProcessSeqEntryList (BRFlagPtr drfp, CharPtr filename) SeqEntrySetScope (NULL); drfp->sep_list = ValNodeFree (drfp->sep_list); - drfp->filename_list = FreeFilenameList (drfp->filename_list); - drfp->bsplist = UnlockFarComponents (drfp->bsplist); if (ofp != NULL && need_ofp_close) { @@ -682,8 +679,6 @@ static void ProcessSingleRecord ( if (sep != NULL) { ValNodeAddPointer (&(drfp->sep_list), 0, sep); - ValNodeAddInt (&(drfp->filename_list), FILENAME_LIST_ENTITY_ID_ITEM, (Int4) entityID); - ValNodeAddPointer (&(drfp->filename_list), FILENAME_LIST_FILENAME_ITEM, StringSave (filename)); } } else { @@ -1115,6 +1110,9 @@ Int2 Main (void) /* minimum length */ dfd.bcd.min_length = myargs[l_argMinLength].intvalue; + /* require keyword to be present */ + dfd.bcd.require_keyword = TRUE; + /* set up Barcode Report Configuration */ enabled_list = (CharPtr) myargs [e_argEnableTests].strvalue; disabled_list = (CharPtr) myargs [d_argDisableTests].strvalue; diff --git a/demo/asndisc.c b/demo/asndisc.c index a5da7aef..87bf938d 100644 --- a/demo/asndisc.c +++ b/demo/asndisc.c @@ -29,7 +29,7 @@ * * Version Creation Date: 1/23/07 * -* $Revision: 1.20 $ +* $Revision: 1.24 $ * * File Description: * @@ -80,10 +80,10 @@ typedef struct drflags { CharPtr output_dir; FILE *outfp; Int4 numrecords; - DiscReportOutputConfigData ocd; - DiscrepancyConfigData dcd; ValNodePtr sep_list; ValNodePtr bsplist; + + GlobalDiscrepReportPtr global_report; } DRFlagData, PNTR DRFlagPtr; #ifdef INTERNAL_NCBI_ASNDISC @@ -493,67 +493,17 @@ static ValNodePtr DoLockFarComponents ( return rsult; } -static void ProcessSeqEntryList (DRFlagPtr drfp, CharPtr filename) + +static void ReleaseDiscrepancyReportSeqEntries (DRFlagPtr drfp) { - ValNodePtr discrepancy_list, vnp; - ObjMgrPtr omp; + ValNodePtr vnp; SeqEntryPtr sep; - FILE *ofp = NULL; - Boolean need_ofp_close = FALSE; - Char path [PATH_MAX]; - CharPtr ptr; - - if (drfp == NULL || drfp->sep_list == NULL) return; + ObjMgrPtr omp; - if (drfp->outfp == NULL) { - if (StringDoesHaveText (drfp->output_dir)) { - if (StringLen (drfp->output_dir) > PATH_MAX) { - Message (MSG_ERROR, "Unable to generate output file - path name is too long"); - return; - } - StringCpy (path, drfp->output_dir); -#ifdef OS_WINNT - ptr = StringRChr (filename, '\\'); - if (path[StringLen(path) - 1] != '\\') { - StringCat (path, "\\"); - } -#else - ptr = StringRChr (filename, '/'); - if (path[StringLen(path) - 1] != '/') { - StringCat (path, "/"); - } -#endif; - if (ptr == NULL) { - StringNCat (path, filename, PATH_MAX - StringLen(path) - 1); - } else { - StringNCat (path, ptr + 1, PATH_MAX - StringLen(path) - 1); - } - } else { - StringNCpy_0 (path, filename, sizeof (path)); - } - ptr = StringRChr (path, '.'); - if (ptr != NULL) { - *ptr = '\0'; - } - if (StringDoesHaveText (drfp->output_suffix)) { - StringNCat (path, drfp->output_suffix, PATH_MAX - StringLen(path) - 1); - path[PATH_MAX - 1] = 0; - } else { - StringCat (path, ".dr"); - } - if (drfp->outfp == NULL) { - ofp = FileOpen (path, "w"); - need_ofp_close = TRUE; - } else { - ofp = drfp->outfp; - } - } else { - ofp = drfp->outfp; + if (drfp == NULL) { + return; } - discrepancy_list = CollectDiscrepancies (&(drfp->dcd), drfp->sep_list, taxlookup); - WriteAsnDiscReport (discrepancy_list, ofp, &(drfp->ocd), TRUE); - discrepancy_list = FreeClickableList (discrepancy_list); for (vnp = drfp->sep_list; vnp != NULL; vnp = vnp->next) { sep = vnp->data.ptrvalue; SeqEntryFree (sep); @@ -566,13 +516,61 @@ static void ProcessSeqEntryList (DRFlagPtr drfp, CharPtr filename) SeqEntrySetScope (NULL); drfp->sep_list = ValNodeFree (drfp->sep_list); - drfp->ocd.filename_list = FreeFilenameList (drfp->ocd.filename_list); - drfp->bsplist = UnlockFarComponents (drfp->bsplist); +} + + +static void ProcessSeqEntryList (DRFlagPtr drfp, CharPtr filename) +{ + ValNodePtr discrepancy_list; + FILE *ofp = NULL; + Char path [PATH_MAX]; + CharPtr ptr; - if (ofp != NULL && need_ofp_close) { - FileClose (ofp); + if (drfp == NULL || drfp->sep_list == NULL) return; + + if (StringDoesHaveText (drfp->output_dir)) { + if (StringLen (drfp->output_dir) > PATH_MAX) { + Message (MSG_ERROR, "Unable to generate output file - path name is too long"); + return; + } + StringCpy (path, drfp->output_dir); +#ifdef OS_WINNT + ptr = StringRChr (filename, '\\'); + if (path[StringLen(path) - 1] != '\\') { + StringCat (path, "\\"); + } +#else + ptr = StringRChr (filename, '/'); + if (path[StringLen(path) - 1] != '/') { + StringCat (path, "/"); + } +#endif; + if (ptr == NULL) { + StringNCat (path, filename, PATH_MAX - StringLen(path) - 1); + } else { + StringNCat (path, ptr + 1, PATH_MAX - StringLen(path) - 1); + } + } else { + StringNCpy_0 (path, filename, sizeof (path)); } + ptr = StringRChr (path, '.'); + if (ptr != NULL) { + *ptr = '\0'; + } + if (StringDoesHaveText (drfp->output_suffix)) { + StringNCat (path, drfp->output_suffix, PATH_MAX - StringLen(path) - 1); + path[PATH_MAX - 1] = 0; + } else { + StringCat (path, ".dr"); + } + ofp = FileOpen (path, "w"); + + discrepancy_list = CollectDiscrepancies (drfp->global_report->test_config, drfp->sep_list, taxlookup); + WriteAsnDiscReport (discrepancy_list, ofp, drfp->global_report->output_config, TRUE); + discrepancy_list = FreeClickableList (discrepancy_list); + + FileClose (ofp); } @@ -681,8 +679,6 @@ static void ProcessSingleRecord ( if (sep != NULL) { ValNodeAddPointer (&(drfp->sep_list), 0, sep); - ValNodeAddInt (&(drfp->ocd.filename_list), FILENAME_LIST_ENTITY_ID_ITEM, (Int4) entityID); - ValNodeAddPointer (&(drfp->ocd.filename_list), FILENAME_LIST_FILENAME_ITEM, StringSave (filename)); if (drfp->lock) { bsplist_next = DoLockFarComponents (sep, drfp); @@ -883,11 +879,23 @@ static void ProcessMultipleRecord ( } -static void ProcessOneRecord ( - CharPtr filename, - Pointer userdata -) +static void ProcessSeqEntryListWithCollation (GlobalDiscrepReportPtr g, ValNodePtr sep_list, CharPtr filename) +{ + ValNodePtr vnp; + SeqEntryPtr sep; + + if (g == NULL || sep_list == NULL) return; + + for (vnp = sep_list; vnp != NULL; vnp = vnp->next) { + sep = vnp->data.ptrvalue; + AddSeqEntryToGlobalDiscrepReport (sep, g, filename); + } + +} + + +static void ProcessOneRecord (CharPtr filename, Pointer userdata) { DRFlagPtr drfp; @@ -899,11 +907,16 @@ static void ProcessOneRecord ( } else { ProcessSingleRecord (filename, drfp); } + if (drfp->outfp == NULL) { ProcessSeqEntryList (drfp, filename); + } else { + ProcessSeqEntryListWithCollation (drfp->global_report, drfp->sep_list, filename); } + ReleaseDiscrepancyReportSeqEntries (drfp); } + /* Args structure contains command-line arguments */ typedef enum { @@ -928,6 +941,7 @@ typedef enum { T_argThreads, X_argExpandCategories, S_argSummaryReport, + B_argBigSequenceReport, C_argMaxCount } DRFlagNum; @@ -989,6 +1003,8 @@ Args myargs [] = { TRUE, 'X', ARG_STRING, 0.0, 0, NULL}, {"Summary Report", "F", NULL, NULL, TRUE, 'S', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Big Sequence Report", "F", NULL, NULL, + TRUE, 'B', ARG_BOOLEAN, 0.0, 0, NULL}, {"Max Count", "0", NULL, NULL, TRUE, 'C', ARG_INT, 0.0, 0, NULL}, }; @@ -1016,6 +1032,7 @@ static CharPtr GetTestNameList (CharPtr intro) return text; } + Int2 Main (void) { @@ -1024,8 +1041,9 @@ Int2 Main (void) CharPtr enabled_list, disabled_list, err_msg; Boolean batch, binary, compressed, dorecurse, indexed, local, lock, remote, usethreads; - Int2 type = 0, k; + Int2 type = 0; DRFlagData dfd; + Boolean big_sequence_report; /* standard setup */ @@ -1099,36 +1117,50 @@ Int2 Main (void) lock = (Boolean) myargs [l_argLockFar].intvalue; usethreads = (Boolean) myargs [T_argThreads].intvalue; dfd.farFetchCDSproducts = (Boolean) myargs [Z_argRemoteCDS].intvalue; - ExpandDiscrepancyReportTestsFromString ((CharPtr) myargs [X_argExpandCategories].strvalue, TRUE, &dfd.ocd); - dfd.ocd.summary_report = (Boolean) myargs [S_argSummaryReport].intvalue; /* set up Discrepancy Report Configuration */ + dfd.global_report = GlobalDiscrepReportNew (); + dfd.global_report->test_config = DiscrepancyConfigNew(); + + ExpandDiscrepancyReportTestsFromString ((CharPtr) myargs [X_argExpandCategories].strvalue, TRUE, dfd.global_report->output_config); + dfd.global_report->output_config->summary_report = (Boolean) myargs [S_argSummaryReport].intvalue; + + big_sequence_report = (Boolean) myargs [B_argBigSequenceReport].intvalue; + enabled_list = (CharPtr) myargs [e_argEnableTests].strvalue; disabled_list = (CharPtr) myargs [d_argDisableTests].strvalue; + +#ifdef INTERNAL_NCBI_ASNDISC + dfd.global_report->taxlookup = CheckTaxNamesAgainstTaxDatabase; +#endif + err_msg = NULL; if (StringDoesHaveText (enabled_list) && StringDoesHaveText (disabled_list)) { err_msg = StringSave ("Cannot specify both -e and -d. Choose -e to enable only a few tests and disable the rest, choose -d to disable only a few tests and enable the rest."); } else if (StringDoesHaveText (disabled_list)) { - for (k = 0; k < MAX_DISC_TYPE; k++) { - dfd.dcd.conf_list[k] = TRUE; + if (big_sequence_report) { + ConfigureForBigSequence (dfd.global_report->test_config); + } else { + ConfigureForGenomes (dfd.global_report->test_config); } - DisableTRNATests (&(dfd.dcd)); /* now disable tests from string */ - err_msg = SetDiscrepancyReportTestsFromString (disabled_list, FALSE, &(dfd.dcd)); + err_msg = SetDiscrepancyReportTestsFromString (disabled_list, FALSE, dfd.global_report->test_config); } else if (StringDoesHaveText (enabled_list)) { - for (k = 0; k < MAX_DISC_TYPE; k++) { - dfd.dcd.conf_list[k] = FALSE; + if (big_sequence_report) { + ConfigureForBigSequence (dfd.global_report->test_config); + } else { + ConfigureForGenomes (dfd.global_report->test_config); } /* now enable tests from string */ - err_msg = SetDiscrepancyReportTestsFromString (enabled_list, TRUE, &(dfd.dcd)); + err_msg = SetDiscrepancyReportTestsFromString (enabled_list, TRUE, dfd.global_report->test_config); } else { - /* enable all tests by default */ - for (k = 0; k < MAX_DISC_TYPE; k++) { - dfd.dcd.conf_list[k] = TRUE; + if (big_sequence_report) { + ConfigureForBigSequence (dfd.global_report->test_config); + } else { + ConfigureForGenomes (dfd.global_report->test_config); } - DisableTRNATests (&(dfd.dcd)); } if (err_msg != NULL) { Message (MSG_FATAL, err_msg); @@ -1137,8 +1169,8 @@ Int2 Main (void) } if ((Boolean) myargs[f_argUseFT].intvalue) { - dfd.dcd.use_feature_table_format = TRUE; - dfd.ocd.use_feature_table_format = TRUE; + dfd.global_report->test_config->use_feature_table_format = TRUE; + dfd.global_report->output_config->use_feature_table_format = TRUE; } dfd.maxcount = (Int4) myargs [C_argMaxCount].intvalue; @@ -1226,27 +1258,21 @@ Int2 Main (void) AsnIndexedLibFetchEnable (asnidx, TRUE); } - /* recurse through all files within source directory or subdirectories */ - if (StringDoesHaveText (directory)) { - DirExplore (directory, NULL, suffix, dorecurse, ProcessOneRecord, (Pointer) &dfd); - if (dfd.outfp != NULL) { - ProcessSeqEntryList (&dfd, NULL); - } } else if (StringDoesHaveText (infile)) { ProcessOneRecord (infile, (Pointer) &dfd); - if (dfd.outfp != NULL) { - ProcessSeqEntryList (&dfd, NULL); - } } - if (dfd.outfp != NULL) { + WriteGlobalDiscrepancyReport (dfd.global_report, dfd.outfp); FileClose (dfd.outfp); + dfd.outfp = NULL; } + dfd.global_report = GlobalDiscrepReportFree (dfd.global_report); + /* close fetch functions */ if (indexed) { diff --git a/demo/asnmacro.c b/demo/asnmacro.c index 7e980c28..54fbd82c 100755 --- a/demo/asnmacro.c +++ b/demo/asnmacro.c @@ -29,7 +29,7 @@ * * Version Creation Date: 4/12/07 * -* $Revision: 1.3 $ +* $Revision: 1.4 $ * * File Description: * @@ -146,7 +146,7 @@ static SeqAlignPtr LIBCALLBACK GetSeqAlignPiece (SeqLocPtr slp1, SeqLocPtr slp2) static SeqAlignPtr GlobalAlign2Seq (BioseqPtr bsp1, BioseqPtr bsp2, BoolPtr revcomp) { - return Sqn_GlobalAlign2SeqEx (bsp1, bsp2, revcomp, GetSeqAlign, GetSeqAlignPiece); + return Sqn_GlobalAlign2SeqEx (bsp1, bsp2, revcomp, GetSeqAlign, GetSeqAlignPiece, TRUE); } diff --git a/demo/asnval.c b/demo/asnval.c index 4a676193..7fbfcf1c 100644 --- a/demo/asnval.c +++ b/demo/asnval.c @@ -29,7 +29,7 @@ * * Version Creation Date: 11/3/04 * -* $Revision: 1.56 $ +* $Revision: 1.76 $ * * File Description: * @@ -60,7 +60,7 @@ #include <accpubseq.h> #endif -#define ASNVAL_APP_VER "4.8" +#define ASNVAL_APP_VER "5.9" CharPtr ASNVAL_APPLICATION = ASNVAL_APP_VER; @@ -78,17 +78,20 @@ typedef struct valflags { Boolean farFetchMRNAproducts; Boolean locusTagGeneralMatch; Boolean validateIDSet; + Boolean seqSubmitParent; Boolean ignoreExceptions; Boolean validateExons; Boolean inferenceAccnCheck; Boolean testLatLonSubregion; Boolean strictLatLonCountry; + Boolean indexerVersion; Boolean batch; Boolean binary; Boolean compressed; Boolean lock; Boolean useThreads; Boolean usePUBSEQ; + Boolean validateBarcode; Int2 verbosity; Int2 type; Int4 skipcount; @@ -98,6 +101,7 @@ typedef struct valflags { FILE *logfp; Int4 num_errors; Int4 fatal_errors; + Boolean has_errors; Boolean io_failure; Char longest [64]; time_t worsttime; @@ -514,7 +518,7 @@ static ValNodePtr DoLockFarComponents ( } static CharPtr severityLabel [] = { - "NONE", "INFO", "WARN", "ERROR", "REJECT", "FATAL", "MAX", NULL + "NONE", "INFO", "WARNING", "ERROR", "REJECT", "FATAL", "MAX", NULL }; static CharPtr compatSeverityLabel [] = { @@ -522,13 +526,64 @@ static CharPtr compatSeverityLabel [] = { }; typedef struct vcdaa { - FILE *ofp; - Int2 verbosity; - Int2 lowCutoff; - Int2 highCutoff; - CharPtr errcode; + FILE *ofp; + Int2 verbosity; + Int2 lowCutoff; + Int2 highCutoff; + CharPtr errcode; + ValFlagPtr vfp; } VCData, PNTR VCPtr; +static void XmlEncode (CharPtr dst, CharPtr src) + +{ + Char ch; + + if (dst == NULL || src == NULL) return; + + ch = *src; + while (ch != '\0') { + if (ch == '<') { + *dst = '&'; + dst++; + *dst = 'l'; + dst++; + *dst = 't'; + dst++; + *dst = ';'; + dst++; + } else if (ch == '>') { + *dst = '&'; + dst++; + *dst = 'g'; + dst++; + *dst = 't'; + dst++; + *dst = ';'; + dst++; + } else { + *dst = ch; + dst++; + } + src++; + ch = *src; + } + *dst = '\0'; +} + + +static CharPtr GetXmlHeaderText (ErrSev cutoff) +{ + CharPtr xml_header = NULL; + CharPtr xml_4_fmt = "asnval version=\"%s\" severity_cutoff=\"%s\""; + + xml_header = (CharPtr) MemNew (sizeof (Char) * (10 + StringLen (xml_4_fmt) + + StringLen (ASNVAL_APPLICATION) + StringLen (severityLabel[cutoff]))); + sprintf (xml_header, xml_4_fmt, ASNVAL_APPLICATION, severityLabel[cutoff]); + return xml_header; +} + + static void LIBCALLBACK ValidCallback ( ErrSev severity, int errcode, @@ -547,15 +602,21 @@ static void LIBCALLBACK ValidCallback ( ) { - Char buf [256]; - CharPtr catname, errname; - FILE *fp; - VCPtr vcp; + Char buf [256]; + CharPtr catname, errname, urlmssg = NULL; + ErrSev cutoff; + FILE *fp; + size_t len; + VCPtr vcp; + ValFlagPtr vfp; + CharPtr xml_header; vcp = (VCPtr) userdata; if (vcp == NULL) return; fp = vcp->ofp; if (fp == NULL) return; + vfp = vcp->vfp; + if (vfp == NULL) return; if (severity < SEV_NONE || severity > SEV_MAX) { severity = SEV_MAX; @@ -628,7 +689,32 @@ static void LIBCALLBACK ValidCallback ( accession, severityLabel [severity], catname, errname); + } else if (vcp->verbosity == 4) { + + if (! vfp->has_errors) { + cutoff = (ErrSev) vcp->lowCutoff; + if (cutoff < SEV_NONE || cutoff > SEV_MAX) { + cutoff = SEV_MAX; + } + + xml_header = GetXmlHeaderText (cutoff); + fprintf (fp, "<%s>\n", xml_header); + xml_header = MemFree (xml_header); + } + + len = StringLen (message); + if (len > 0) { + urlmssg = MemNew (len * 3 + 2); + if (urlmssg != NULL) { + XmlEncode (urlmssg, message); + fprintf (fp, " <message severity=\"%s\" seq-id=\"%s\" code=\"%s_%s\">%s</message>\n", + severityLabel [severity], accession, catname, errname, urlmssg); + MemFree (urlmssg); + } + } } + + vfp->has_errors = TRUE; } static void DoValidation ( @@ -641,6 +727,8 @@ static void DoValidation ( Int2 i; VCData vcd; ValidStructPtr vsp; + ErrSev cutoff; + CharPtr xml_header = NULL; if (vfp == NULL) return; @@ -653,6 +741,7 @@ static void DoValidation ( vsp->cutoff = vfp->lowCutoff; vsp->validateAlignments = vfp->validateAlignments; + vsp->alignFindRemoteBsp = vfp->alignFindRemoteBsp; vsp->doSeqHistAssembly = vfp->doSeqHistAssembly; vsp->farIDsInAlignments = vfp->farIDsInAlignments; vsp->alwaysRequireIsoJTA = vfp->alwaysRequireIsoJTA; @@ -660,11 +749,13 @@ static void DoValidation ( vsp->farFetchMRNAproducts = vfp->farFetchMRNAproducts; vsp->locusTagGeneralMatch = vfp->locusTagGeneralMatch; vsp->validateIDSet = vfp->validateIDSet; + vsp->seqSubmitParent = vfp->seqSubmitParent; vsp->ignoreExceptions = vfp->ignoreExceptions; vsp->validateExons = vfp->validateExons; vsp->inferenceAccnCheck = vfp->inferenceAccnCheck; vsp->testLatLonSubregion = vfp->testLatLonSubregion; vsp->strictLatLonCountry = vfp->strictLatLonCountry; + vsp->indexerVersion = vfp->indexerVersion; if (ofp == NULL && vfp->outfp != NULL) { ofp = vfp->outfp; @@ -675,6 +766,7 @@ static void DoValidation ( vcd.lowCutoff = vfp->lowCutoff; vcd.highCutoff = vfp->highCutoff; vcd.errcode = vfp->errcode; + vcd.vfp = vfp; vsp->errfunc = ValidCallback; vsp->userdata = (Pointer) &vcd; vsp->convertGiToAccn = FALSE; @@ -690,6 +782,22 @@ static void DoValidation ( } ValidStructFree (vsp); + if (vfp->validateBarcode) { + if (vfp->verbosity == 4 && !vfp->has_errors) { + cutoff = (ErrSev) vfp->lowCutoff; + if (cutoff < SEV_NONE || cutoff > SEV_MAX) { + cutoff = SEV_MAX; + } + xml_header = GetXmlHeaderText(cutoff); + } + if (!BarcodeValidateOneSeqEntry (ofp, sep, FALSE, + vfp->verbosity == 4, + !vfp->has_errors, + xml_header)) { + vfp->has_errors = TRUE; + } + xml_header = MemFree (xml_header); + } } static void ProcessSingleRecord ( @@ -721,7 +829,7 @@ static void ProcessSingleRecord ( return; } - dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE); + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE); FileClose (fp); @@ -840,6 +948,12 @@ static void ProcessSingleRecord ( bsplist = UnlockFarComponents (bsplist); if (ofp != NULL) { + if (vfp->has_errors) { + if (vfp->verbosity == 4) { + fprintf (ofp, "</asnval>\n"); + } + vfp->has_errors = FALSE; + } FileClose (ofp); } @@ -1184,6 +1298,12 @@ static void ProcessMultipleRecord ( } if (ofp != NULL) { + if (vfp->has_errors) { + if (vfp->verbosity == 4) { + fprintf (ofp, "</asnval>\n"); + } + vfp->has_errors = FALSE; + } FileClose (ofp); } @@ -1261,9 +1381,12 @@ static void ProcessOneRecord ( #define T_argThreads 26 #define L_argLogFile 27 #define S_argSkipCount 28 -#define C_argMaxCount 29 +#define B_argBarcodeVal 29 +#define C_argMaxCount 30 #ifdef INTERNAL_NCBI_ASN2VAL -#define H_argAccessHUP 30 +#define w_argSeqSubParent 31 +#define H_argAccessHUP 32 +#define y_argAIndexer 33 #endif #define LAT_LON_STATE 1 @@ -1306,7 +1429,7 @@ Args myargs [] = { TRUE, 'Y', ARG_BOOLEAN, 0.0, 0, NULL}, {"Ignore Transcription/Translation Exceptions", "F", NULL, NULL, TRUE, 'e', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Verbosity", "0", "0", "3", + {"Verbosity", "0", "0", "4", FALSE, 'v', ARG_INT, 0.0, 0, NULL}, {"ASN.1 Type (a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit)", "a", NULL, NULL, TRUE, 'a', ARG_STRING, 0.0, 0, NULL}, @@ -1328,11 +1451,17 @@ Args myargs [] = { TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL}, {"Skip Count", "0", NULL, NULL, TRUE, 'S', ARG_INT, 0.0, 0, NULL}, + {"Barcode Validate", "F", NULL, NULL, + TRUE, 'B', ARG_BOOLEAN, 0.0, 0, NULL}, {"Max Count", "0", NULL, NULL, TRUE, 'C', ARG_INT, 0.0, 0, NULL}, #ifdef INTERNAL_NCBI_ASN2VAL + {"SeqSubmitParent Flag", "F", NULL, NULL, + TRUE, 'w', ARG_BOOLEAN, 0.0, 0, NULL}, {"Internal Access to HUP", "F", NULL, NULL, TRUE, 'H', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Special Indexer Tests", "F", NULL, NULL, + TRUE, 'y', ARG_BOOLEAN, 0.0, 0, NULL}, #endif }; @@ -1413,6 +1542,7 @@ Int2 Main (void) vfd.highCutoff = (Int2) myargs [P_argHighCutoff].intvalue; vfd.errcode = (CharPtr) myargs [E_argOnlyThisErr].strvalue; vfd.validateAlignments = (Boolean) myargs [A_argAlignments].intvalue; + vfd.alignFindRemoteBsp = (Boolean) (vfd.validateAlignments && remote); vfd.doSeqHistAssembly = (Boolean) myargs [A_argAlignments].intvalue; vfd.farIDsInAlignments = (Boolean) myargs [A_argAlignments].intvalue; vfd.alwaysRequireIsoJTA = (Boolean) myargs [J_argIsoJta].intvalue; @@ -1423,6 +1553,8 @@ Int2 Main (void) vfd.ignoreExceptions = (Boolean) myargs [e_argIgnoreExcept].intvalue; vfd.validateExons = (Boolean) myargs [X_argExonSplice].intvalue; vfd.inferenceAccnCheck = (Boolean) myargs [G_argInfAccns].intvalue; + vfd.validateBarcode = (Boolean) myargs[B_argBarcodeVal].intvalue; + val = (Int2) myargs [N_argLatLonStrict].intvalue; vfd.testLatLonSubregion = (Boolean) ((val & LAT_LON_STATE) != 0); @@ -1436,6 +1568,11 @@ Int2 Main (void) vfd.maxcount = INT4_MAX; } +#ifdef INTERNAL_NCBI_ASN2VAL + vfd.seqSubmitParent = (Boolean) myargs [w_argSeqSubParent].intvalue; + vfd.indexerVersion = (Boolean) myargs [y_argAIndexer].intvalue; +#endif + batch = FALSE; binary = (Boolean) myargs [b_argBinary].intvalue; compressed = (Boolean) myargs [c_argCompressed].intvalue; @@ -1488,6 +1625,7 @@ Int2 Main (void) vfd.logfp = NULL; vfd.num_errors = 0; vfd.fatal_errors = 0; + vfd.has_errors = FALSE; vfd.io_failure = FALSE; vfd.longest [0] = '\0'; vfd.worsttime = 0; @@ -1563,6 +1701,12 @@ Int2 Main (void) run_time = stop_time - start_time; if (vfd.outfp != NULL) { + if (vfd.has_errors) { + if (vfd.verbosity == 4) { + fprintf (vfd.outfp, "</asnval>\n"); + } + vfd.has_errors = FALSE; + } FileClose (vfd.outfp); } diff --git a/demo/blastall.c b/demo/blastall.c index 18f6dd4d..2cdfed86 100644 --- a/demo/blastall.c +++ b/demo/blastall.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: blastall.c,v 6.201 2008/01/02 14:02:06 madden Exp $"; +static char const rcsid[] = "$Id: blastall.c,v 6.202 2008/07/01 18:38:14 madden Exp $"; -/* $Id: blastall.c,v 6.201 2008/01/02 14:02:06 madden Exp $ +/* $Id: blastall.c,v 6.202 2008/07/01 18:38:14 madden Exp $ ************************************************************************** * * * COPYRIGHT NOTICE * @@ -28,6 +28,9 @@ static char const rcsid[] = "$Id: blastall.c,v 6.201 2008/01/02 14:02:06 madden ************************************************************************** * * $Log: blastall.c,v $ + * Revision 6.202 2008/07/01 18:38:14 madden + * Correct X3 value for blastn/megablast + * * Revision 6.201 2008/01/02 14:02:06 madden * Make composition-based score adjustments the default for blastp and tblastn * @@ -1058,7 +1061,7 @@ static Args myargs[] = { "0.0", NULL, NULL, FALSE, 'y', ARG_FLOAT, 0.0, 0, NULL}, /* ARG_XDROP_UNGAPPED */ { "X dropoff value for final gapped alignment in bits " "(0.0 invokes default behavior)\n" - " blastn/megablast 50, tblastx 0, all others 25", + " blastn/megablast 100, tblastx 0, all others 25", "0", NULL, NULL, FALSE, 'Z', ARG_INT, 0.0, 0, NULL}, /* ARG_XDROP_FINAL */ #ifdef BLAST_CS_API { "RPS Blast search", /* 34 */ diff --git a/demo/blastpgp.c b/demo/blastpgp.c index 31ac2210..71ac8afa 100644 --- a/demo/blastpgp.c +++ b/demo/blastpgp.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: blastpgp.c,v 6.139 2008/01/02 20:16:11 madden Exp $"; +static char const rcsid[] = "$Id: blastpgp.c,v 6.140 2008/03/31 13:35:18 madden Exp $"; -/* $Id: blastpgp.c,v 6.139 2008/01/02 20:16:11 madden Exp $ */ +/* $Id: blastpgp.c,v 6.140 2008/03/31 13:35:18 madden Exp $ */ /************************************************************************** * * * COPYRIGHT NOTICE * @@ -26,8 +26,12 @@ static char const rcsid[] = "$Id: blastpgp.c,v 6.139 2008/01/02 20:16:11 madden * appreciated. * * * ************************************************************************** - * $Revision: 6.139 $ + * $Revision: 6.140 $ * $Log: blastpgp.c,v $ + * Revision 6.140 2008/03/31 13:35:18 madden + * Change semantics of -c option, so that a new method for effective observations is used always and a new entropy-based method for column-specific PSI-BLAST pseudocounts is used by default. If default is used (-c 0), then all constants are defined in posit.c; if only the new method of effective observations is used, then the value of -c should be set by the user at approximately 30. (Changes + * submitted by Alejandro Schaffer). + * * Revision 6.139 2008/01/02 20:16:11 madden * XML output respects -v and -b option, JIRA SB-30 * @@ -674,8 +678,8 @@ static Args myargs[] = { "F", NULL, NULL, FALSE, 'I', ARG_BOOLEAN, 0.0, 0, NULL}, { "e-value threshold for inclusion in multipass model", /* ARG_EVALUE_INCLUSION_THRESHOLD */ "0.002", NULL, NULL, FALSE, 'h', ARG_FLOAT, 0.0, 0, NULL}, - { "Constant in pseudocounts for multipass version", /* ARG_PSEUDOCOUNT_CONSTANT */ - "9", NULL, NULL, FALSE, 'c', ARG_INT, 0.0, 0, NULL}, + { "Constant in pseudocounts for multipass version; 0 uses entropy method; otherwise a value near 30 is recommended", /* ARG_PSEUDOCOUNT_CONSTANT */ + "0", NULL, NULL, FALSE, 'c', ARG_INT, 0.0, 0, NULL}, { "Maximum number of passes to use in multipass version", /* ARG_MAX_PASSES */ "1", NULL, NULL, FALSE, 'j', ARG_INT, 0.0, 0, NULL}, { "Believe the query defline", /* ARG_BELIEVEQUERY */ diff --git a/demo/cleanasn.c b/demo/cleanasn.c index 297ad432..2bf45c84 100644 --- a/demo/cleanasn.c +++ b/demo/cleanasn.c @@ -29,7 +29,7 @@ * * Version Creation Date: 10/19/99 * -* $Revision: 6.15 $ +* $Revision: 6.29 $ * * File Description: * @@ -48,33 +48,50 @@ #include <objfdef.h> #include <objsub.h> #include <sequtil.h> +#include <gather.h> #include <sqnutils.h> #include <explore.h> #include <tofasta.h> #include <toasn3.h> +#include <subutil.h> +#include <asn2gnbk.h> #include <pmfapi.h> #include <tax3api.h> #ifdef INTERNAL_NCBI_CLEANASN #include <accpubseq.h> #endif -#define CLEANASN_APP_VER "1.6" +#define CLEANASN_APP_VER "2.2" CharPtr CLEANASN_APPLICATION = CLEANASN_APP_VER; typedef struct cleanflags { + Char buf [64]; Boolean batch; Boolean binary; Boolean compressed; Int2 type; CharPtr results; CharPtr outfile; + CharPtr report; + CharPtr ffdiff; + ModType ffmode; CharPtr clean; + CharPtr modernize; CharPtr link; CharPtr feat; + CharPtr desc; CharPtr mods; Boolean taxon; Boolean pub; + Int4 okay; + Int4 bsec; + Int4 ssec; + Int4 norm; + Int4 cumokay; + Int4 cumbsec; + Int4 cumssec; + Int4 cumnorm; AsnModulePtr amp; AsnTypePtr atp_bss; AsnTypePtr atp_bsss; @@ -169,17 +186,15 @@ static void RemoveUnnecGeneXref ( grpx = (GeneRefPtr) sfpx->data.value.ptrvalue; if (grpx == NULL) return; - if ((StringDoesHaveText (grp->locus)) && - (StringDoesHaveText (grpx->locus))) { - if ((StringICmp (grp->locus, grpx->locus) != 0)) return; - } else if (StringDoesHaveText (grp->locus_tag) && - StringDoesHaveText (grpx->locus_tag)) { - if ((StringICmp (grp->locus_tag, grpx->locus_tag) != 0)) return; + if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) { + if (StringICmp (grp->locus_tag, grpx->locus_tag) != 0) return; + } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) { + if (StringICmp (grp->locus, grpx->locus) != 0) return; } else if (grp->syn != NULL && grpx->syn != NULL) { syn1 = (CharPtr) grp->syn->data.ptrvalue; syn2 = (CharPtr) grpx->syn->data.ptrvalue; - if ((StringDoesHaveText (syn1)) && (StringDoesHaveText (syn2))) { - if ((StringICmp (syn1, syn2) != 0)) return; + if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) { + if (StringICmp (syn1, syn2) != 0) return; } } @@ -207,7 +222,24 @@ static void RemoveUnnecGeneXref ( } } -static void AddSpTaxnameToList (SeqDescrPtr sdp, Pointer userdata) +static void MarkTitles ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + ObjValNodePtr ovn; + + if (sdp == NULL || sdp->choice != Seq_descr_title) return; + if (sdp->extended == 0) return; + ovn = (ObjValNodePtr) sdp; + ovn->idx.deleteme = TRUE; +} + +static void AddSpTaxnameToList ( + SeqDescrPtr sdp, + Pointer userdata +) { BioSourcePtr biop; @@ -220,7 +252,10 @@ static void AddSpTaxnameToList (SeqDescrPtr sdp, Pointer userdata) } -static Boolean ShouldExcludeSp (SeqEntryPtr sep) +static Boolean ShouldExcludeSp ( + SeqEntryPtr sep +) + { ValNodePtr name_list = NULL, vnp1, vnp2; Boolean all_diff = TRUE; @@ -356,6 +391,506 @@ static void LookupPubdesc ( PubmedEntryFree (pep); } +static void ModGenes (SeqFeatPtr sfp, Pointer userdata) + +{ + ModernizeGeneFields (sfp); +} + +static void ModRNAs (SeqFeatPtr sfp, Pointer userdata) + +{ + ModernizeRNAFields (sfp); +} + +static void ModPCRs (BioSourcePtr biop, Pointer userdata) + +{ + ModernizePCRPrimers (biop); +} + +static CharPtr Se2Str ( + SeqEntryPtr sep +) + +{ + AsnIoBSPtr aibp; + ByteStorePtr bs; + CharPtr str; + + if (sep == NULL) return NULL; + + bs = BSNew (1000); + if (bs == NULL) return NULL; + aibp = AsnIoBSOpen ("w", bs); + if (aibp == NULL) return NULL; + + SeqEntryAsnWrite (sep, aibp->aip, NULL); + + AsnIoFlush (aibp->aip); + AsnIoBSClose (aibp); + + str = BSMerge (bs, NULL); + BSFree (bs); + + return str; +} + +typedef struct chgdata { + Boolean rubisco; + Boolean rbc; + Boolean its; + Boolean rnaother; + Boolean trnanote; + Boolean oldbiomol; + Int4 protdesc; + Int4 sfpnote; + Int4 gbsource; + Int4 cdsconf; +} ChangeData, PNTR ChangeDataPtr; + +static Boolean IsRubisco ( + CharPtr name +) + +{ + return (StringICmp (name, "rubisco large subunit") == 0 || + StringICmp (name, "rubisco small subunit") == 0); +} + +static Boolean IsRbc ( + CharPtr name +) + +{ + return (StringICmp (name, "RbcL") == 0 || + StringICmp (name, "RbcS") == 0); +} + +static Boolean IsITS ( + CharPtr name +) + +{ + return (StringICmp (name, "its1") == 0 || + StringICmp (name, "its 1") == 0 || + StringICmp (name, "its2") == 0 || + StringICmp (name, "its 2") == 0 || + StringICmp (name, "its3") == 0 || + StringICmp (name, "its 3") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 1") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 2") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 3") == 0 || + StringICmp (name, "internal transcribed spacer 1 (ITS1)") == 0 || + StringICmp (name, "internal transcribed spacer 2 (ITS2)") == 0 || + StringICmp (name, "internal transcribed spacer 3 (ITS3)") == 0); +} + +static void ScoreFeature ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + ChangeDataPtr cdp; + CharPtr comment; + CdRegionPtr crp; + CharPtr desc; + GBQualPtr gbq; + CharPtr name; + ProtRefPtr prp; + Uint1 residue; + RnaRefPtr rrp; + CharPtr str; + ValNodePtr vnp; + + if (sfp == NULL) return; + cdp = (ChangeDataPtr) userdata; + if (cdp == NULL) return; + + comment = sfp->comment; + if (StringDoesHaveText (comment)) { + (cdp->sfpnote)++; + } + + /* skip feature types that do not use data.value.ptrvalue */ + switch (sfp->data.choice) { + case SEQFEAT_COMMENT: + case SEQFEAT_BOND: + case SEQFEAT_SITE: + case SEQFEAT_PSEC_STR: + return; + default: + break; + } + + if (sfp->data.value.ptrvalue == NULL) return; + + switch (sfp->data.choice) { + case SEQFEAT_CDREGION: + crp = (CdRegionPtr) sfp->data.value.ptrvalue; + if (crp->conflict) { + (cdp->cdsconf)++; + } + break; + case SEQFEAT_PROT: + prp = (ProtRefPtr) sfp->data.value.ptrvalue; + desc = prp->desc; + if (StringDoesHaveText (desc)) { + (cdp->protdesc)++; + } + for (vnp = prp->name; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + if (IsRubisco (str)) { + cdp->rubisco = TRUE; + } + if (IsRbc (str)) { + cdp->rbc = TRUE; + } + } + break; + case SEQFEAT_RNA : + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + if (rrp->type == 255 && rrp->ext.choice == 1) { + name = (CharPtr) rrp->ext.value.ptrvalue; + if (StringCmp (name, "misc_RNA") == 0) { + for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { + if (StringCmp (gbq->qual, "product") != 0) continue; + name = gbq->val; + if (StringHasNoText (name)) continue; + if (IsITS (name)) { + cdp->its = TRUE; + } + } + } else if (StringCmp (name, "ncRNA") == 0 || StringCmp (name, "tmRNA") == 0) { + } else { + cdp->rnaother = TRUE; + if (IsITS (name)) { + cdp->its = TRUE; + } + } + } else if (rrp->type == 3 && rrp->ext.choice == 2) { + if (StringDoesHaveText (comment)) { + if (StringNCmp (comment, "aa: ", 4) == 0) { + comment += 4; + } + residue = FindTrnaAA3 (comment); + if (residue > 0 && residue != 255) { + cdp->trnanote = TRUE; + } + residue = FindTrnaAA (comment); + if (residue > 0 && residue != 255) { + cdp->trnanote = TRUE; + } + } + } + break; + default: + break; + } +} + +static void ScoreDescriptor ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + ChangeDataPtr cdp; + GBBlockPtr gbp; + MolInfoPtr mip; + + if (sdp == NULL) return; + cdp = (ChangeDataPtr) userdata; + if (cdp == NULL) return; + + switch (sdp->choice) { + case Seq_descr_genbank : + gbp = (GBBlockPtr) sdp->data.ptrvalue; + if (gbp != NULL) { + if (StringDoesHaveText (gbp->source)) { + (cdp->gbsource)++; + } + } + break; + case Seq_descr_molinfo : + mip = (MolInfoPtr) sdp->data.ptrvalue; + if (mip != NULL) { + switch (mip->biomol) { + case MOLECULE_TYPE_SNRNA: + case MOLECULE_TYPE_SCRNA: + case MOLECULE_TYPE_SNORNA: + cdp->oldbiomol = TRUE; + break; + default : + break; + } + } + break; + default : + break; + } +} + +static void CheckForChanges ( + SeqEntryPtr sep, + ChangeDataPtr cdp +) + +{ + if (sep == NULL || cdp == NULL) return; + + VisitFeaturesInSep (sep, (Pointer) cdp, ScoreFeature); + VisitDescriptorsInSep (sep, (Pointer) cdp, ScoreDescriptor); +} + +static void DoASNReport ( + SeqEntryPtr sep, + CleanFlagPtr cfp +) + +{ + Boolean bsec = FALSE, ssec = FALSE, norm = FALSE; + ChangeData cdbefore, cdafter; + CharPtr str1, str2, str3, str4; + + if (sep == NULL || cfp == NULL) return; + + MemSet ((Pointer) &cdbefore, 0, sizeof (ChangeData)); + MemSet ((Pointer) &cdafter, 0, sizeof (ChangeData)); + + CheckForChanges (sep, &cdbefore); + + str1 = Se2Str (sep); + NormalizeDescriptorOrder (sep); + str2 = Se2Str (sep); + if (StringCmp (str1, str2) != 0) { + norm = TRUE; + } + BasicSeqEntryCleanup (sep); + str3 = Se2Str (sep); + if (StringCmp (str2, str3) != 0) { + bsec = TRUE; + } + SeriousSeqEntryCleanup (sep, NULL, NULL); + NormalizeDescriptorOrder (sep); + str4 = Se2Str (sep); + if (StringCmp (str3, str4) != 0) { + ssec = TRUE; + } + + CheckForChanges (sep, &cdafter); + + if (ssec) { + (cfp->ssec)++; + (cfp->cumssec)++; + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "SSEC %s\n", cfp->buf); + fflush (cfp->logfp); + } + } else if (bsec) { + (cfp->bsec)++; + (cfp->cumbsec)++; + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "BSEC %s\n", cfp->buf); + fflush (cfp->logfp); + } + } else if (norm) { + (cfp->norm)++; + (cfp->cumnorm)++; + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "NORM %s\n", cfp->buf); + fflush (cfp->logfp); + } + } else { + (cfp->okay)++; + (cfp->cumokay)++; + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "OKAY %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + + if (cdbefore.rubisco) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "RUB %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.rbc) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "RBC %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.its) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "ITS %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.rnaother) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "RNA %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.trnanote) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "TRN %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.oldbiomol) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "MOL %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + + if (cdbefore.protdesc != cdafter.protdesc) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "PRT %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.sfpnote != cdafter.sfpnote) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "COM %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.gbsource != cdafter.gbsource) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "SRC %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + if (cdbefore.cdsconf != cdafter.cdsconf) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "CNF %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + + MemFree (str1); + MemFree (str2); + MemFree (str3); + MemFree (str4); +} + +static void DoGBFFReport ( + SeqEntryPtr sep, + CleanFlagPtr cfp +) + +{ +#ifdef OS_UNIX + BioseqPtr bsp; + Char cmmd [256]; + FILE *fp; + SeqEntryPtr fsep; + Char path1 [PATH_MAX]; + Char path2 [PATH_MAX]; + CharPtr rep = "reports"; + SeqIdPtr sip; + + if (sep == NULL || cfp == NULL) return; + + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "%s\n", cfp->buf); + fflush (cfp->logfp); + } + + fsep = FindNthBioseq (sep, 1); + if (fsep != NULL && fsep->choice == 1) { + bsp = (BioseqPtr) fsep->data.ptrvalue; + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + switch (sip->choice) { + case SEQID_GENBANK : + rep = "gbreports"; + break; + case SEQID_EMBL : + rep = "ebreports"; + break; + case SEQID_DDBJ : + rep = "djreports"; + break; + case SEQID_OTHER : + rep = "rfreports"; + break; + default : + break; + } + } + } + } + + TmpNam (path1); + TmpNam (path2); + + fp = FileOpen (path1, "w"); + if (fp != NULL) { + SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp); + } + FileClose (fp); + SeriousSeqEntryCleanupBulk (sep); + fp = FileOpen (path2, "w"); + if (fp != NULL) { + SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp); + } + FileClose (fp); + + sprintf (cmmd, "%s -o %s -n %s -d %s", cfp->ffdiff, path1, path2, rep); + system (cmmd); + + sprintf (cmmd, "rm %s; rm %s", path1, path2); + system (cmmd); +#endif +} + +static void DoModernizeReport ( + SeqEntryPtr sep, + CleanFlagPtr cfp +) + +{ + CharPtr str1, str2, str3, str4; + + str1 = Se2Str (sep); + VisitFeaturesInSep (sep, NULL, ModGenes); + str2 = Se2Str (sep); + if (StringCmp (str1, str2) != 0) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "GEN %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + VisitFeaturesInSep (sep, NULL, ModRNAs); + str3 = Se2Str (sep); + if (StringCmp (str2, str3) != 0) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "NCR %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + VisitBioSourcesInSep (sep, NULL, ModPCRs); + str4 = Se2Str (sep); + if (StringCmp (str3, str4) != 0) { + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "PCR %s\n", cfp->buf); + fflush (cfp->logfp); + } + } + + MemFree (str1); + MemFree (str2); + MemFree (str3); + MemFree (str4); +} + static void DoCleanup ( SeqEntryPtr sep, Uint2 entityID, @@ -363,14 +898,63 @@ static void DoCleanup ( ) { + BioseqPtr bsp; + SeqEntryPtr fsep; + SeqIdPtr sip, siphead; + if (sep == NULL || cfp == NULL) return; + StringCpy (cfp->buf, ""); + fsep = FindNthBioseq (sep, 1); + if (fsep != NULL && fsep->choice == 1) { + bsp = (BioseqPtr) fsep->data.ptrvalue; + if (bsp != NULL) { + siphead = SeqIdSetDup (bsp->id); + for (sip = siphead; sip != NULL; sip = sip->next) { + SeqIdStripLocus (sip); + } + SeqIdWrite (siphead, cfp->buf, PRINTID_FASTA_LONG, sizeof (cfp->buf)); + SeqIdSetFree (siphead); + } + } + + if (StringChr (cfp->report, 'r') != NULL) { + DoASNReport (sep, cfp); + return; + } + if (StringChr (cfp->report, 'g') != NULL) { + DoGBFFReport (sep, cfp); + return; + } + if (StringChr (cfp->report, 'm') != NULL) { + DoModernizeReport (sep, cfp); + return; + } + + if (cfp->logfp != NULL) { + fprintf (cfp->logfp, "%s\n", cfp->buf); + fflush (cfp->logfp); + } + if (StringChr (cfp->clean, 'b') != NULL) { BasicSeqEntryCleanup (sep); } if (StringChr (cfp->clean, 's') != NULL) { SeriousSeqEntryCleanup (sep, NULL, NULL); } + if (StringChr (cfp->clean, 'n') != NULL) { + NormalizeDescriptorOrder (sep); + } + + if (StringChr (cfp->modernize, 'g') != NULL) { + VisitFeaturesInSep (sep, NULL, ModGenes); + } + if (StringChr (cfp->modernize, 'r') != NULL) { + VisitFeaturesInSep (sep, NULL, ModRNAs); + } + if (StringChr (cfp->modernize, 'p') != NULL) { + VisitBioSourcesInSep (sep, NULL, ModPCRs); + } if (cfp->taxon) { Taxon3ReplaceOrgInSeqEntry (sep, FALSE); @@ -407,6 +991,11 @@ static void DoCleanup ( VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref); } + if (StringChr (cfp->desc, 't') != NULL) { + VisitDescriptorsInSep (sep, NULL, MarkTitles); + DeleteMarkedObjects (entityID, 0, NULL); + } + if (StringChr (cfp->mods, 'd') != NULL) { SeqMgrIndexFeatures (entityID, 0); DoAutoDef (sep, entityID); @@ -420,7 +1009,6 @@ static void CleanupSingleRecord ( { AsnIoPtr aip, aop; - AsnTypePtr atp = NULL; BioseqPtr bsp; BioseqSetPtr bssp; Pointer dataptr = NULL; @@ -566,13 +1154,10 @@ static void CleanupMultipleRecord ( AsnIoPtr aip, aop; AsnTypePtr atp; DataVal av; - BioseqPtr bsp; - Char buf [41]; Uint2 entityID; FILE *fp; - SeqEntryPtr fsep; size_t len; - Char longest [41]; + Char longest [64]; Int4 numrecords; Char path [PATH_MAX]; CharPtr ptr; @@ -689,25 +1274,13 @@ static void CleanupMultipleRecord ( entityID = ObjMgrGetEntityIDForChoice (sep); - fsep = FindNthBioseq (sep, 1); - if (fsep != NULL && fsep->choice == 1) { - bsp = (BioseqPtr) fsep->data.ptrvalue; - if (bsp != NULL) { - SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf)); - if (cfp->logfp != NULL) { - fprintf (cfp->logfp, "%s\n", buf); - fflush (cfp->logfp); - } - } - } - starttime = GetSecs (); DoCleanup (sep, entityID, cfp); stoptime = GetSecs (); if (stoptime - starttime > worsttime) { worsttime = stoptime - starttime; - StringCpy (longest, buf); + StringCpy (longest, cfp->buf); } numrecords++; @@ -738,10 +1311,16 @@ static void CleanupMultipleRecord ( #else FileClose (fp); #endif - if (cfp->logfp != NULL && (! StringHasNoText (longest))) { - fprintf (cfp->logfp, "Longest processing time %ld seconds on %s\n", - (long) worsttime, longest); + if (cfp->logfp != NULL) { fprintf (cfp->logfp, "Total number of records %ld\n", (long) numrecords); + if (StringDoesHaveText (longest)) { + fprintf (cfp->logfp, "Longest processing time %ld seconds on %s\n", + (long) worsttime, longest); + } + if (cfp->okay > 0 || cfp->norm > 0 || cfp->bsec > 0 || cfp->ssec > 0) { + fprintf (cfp->logfp, "%ld OKAY, %ld NORM, %ld BSEC, %ld SSEC\n", + (long) cfp->okay, (long) cfp->norm, (long) cfp->bsec, (long) cfp->ssec); + } fflush (cfp->logfp); } } @@ -758,6 +1337,11 @@ static void CleanupOneRecord ( cfp = (CleanFlagPtr) userdata; if (cfp == NULL) return; + cfp->okay = 0; + cfp->bsec = 0; + cfp->ssec = 0; + cfp->norm = 0; + if (cfp->batch) { CleanupMultipleRecord (filename, cfp); } else { @@ -778,12 +1362,17 @@ static void CleanupOneRecord ( #define c_argCompressed 8 #define L_argLogFile 9 #define R_argRemote 10 -#define K_argClean 11 -#define N_argLink 12 -#define F_argFeat 13 -#define M_argMods 14 -#define T_argTaxonLookup 15 -#define P_argPubLookup 16 +#define Q_argReport 11 +#define q_argFfDiff 12 +#define m_argFfMode 13 +#define K_argClean 14 +#define U_argModernize 15 +#define N_argLink 16 +#define F_argFeat 17 +#define D_argDesc 18 +#define M_argMods 19 +#define T_argTaxonLookup 20 +#define P_argPubLookup 21 Args myargs [] = { {"Path to Files", NULL, NULL, NULL, @@ -814,10 +1403,29 @@ Args myargs [] = { TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL}, {"Remote Fetching from ID", "F", NULL, NULL, TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Report\n" + " r ASN.1 BSEC/SSEC Report\n" + " g GenBank SSEC Diff\n" + " m Modernize Gene/RNA/PCR", NULL, NULL, NULL, + TRUE, 'Q', ARG_STRING, 0.0, 0, NULL}, + {"Ffdiff Executable", "/netopt/genbank/subtool/bin/ffdiff", NULL, NULL, + TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL}, + {"Flatfile Mode\n" + " r Release\n" + " e Entrez\n" + " s Sequin\n" + " d Dump\n", NULL, NULL, NULL, + TRUE, 'm', ARG_STRING, 0.0, 0, NULL}, {"Cleanup\n" " b BasicSeqEntryCleanup\n" - " s SeriousSeqEntryCleanup", NULL, NULL, NULL, + " s SeriousSeqEntryCleanup\n" + " n Normalize Descriptor Order", NULL, NULL, NULL, TRUE, 'K', ARG_STRING, 0.0, 0, NULL}, + {"Modernize\n" + " g Gene\n" + " r RNA\n" + " p PCR Primers", NULL, NULL, NULL, + TRUE, 'U', ARG_STRING, 0.0, 0, NULL}, {"Link\n" " o LinkCDSmRNAbyOverlap\n" " p LinkCDSmRNAbyProduct\n" @@ -829,6 +1437,9 @@ Args myargs [] = { " d Remove db_xref\n" " r Remove Redundant Gene xref", NULL, NULL, NULL, TRUE, 'F', ARG_STRING, 0.0, 0, NULL}, + {"Descriptor\n" + " t Remove Title", NULL, NULL, NULL, + TRUE, 'D', ARG_STRING, 0.0, 0, NULL}, {"Miscellaneous\n" " d Automatic Definition Line", NULL, NULL, NULL, TRUE, 'M', ARG_STRING, 0.0, 0, NULL}, @@ -841,7 +1452,7 @@ Args myargs [] = { Int2 Main (void) { - Char app [64], type; + Char app [64], mode, type; CleanFlagData cfd; CharPtr directory, filter, infile, logfile, outfile, results, str, suffix; Boolean remote; @@ -937,9 +1548,41 @@ Int2 Main (void) remote = (Boolean) myargs [R_argRemote].intvalue; + cfd.report = myargs [Q_argReport].strvalue; + cfd.ffdiff = myargs [q_argFfDiff].strvalue; + + str = myargs [m_argFfMode].strvalue; + TrimSpacesAroundString (str); + if (StringDoesHaveText (str)) { + mode = str [0]; + } else { + mode = 'e'; + } + + mode = TO_LOWER (mode); + switch (mode) { + case 'r' : + cfd.ffmode = RELEASE_MODE; + break; + case 'e' : + cfd.ffmode = ENTREZ_MODE; + break; + case 's' : + cfd.ffmode = SEQUIN_MODE; + break; + case 'd' : + cfd.ffmode = DUMP_MODE; + break; + default : + cfd.ffmode = ENTREZ_MODE; + break; + } + cfd.clean = myargs [K_argClean].strvalue; + cfd.modernize = myargs [U_argModernize].strvalue; cfd.link = myargs [N_argLink].strvalue; cfd.feat = myargs [F_argFeat].strvalue; + cfd.desc = myargs [D_argDesc].strvalue; cfd.mods = myargs [M_argMods].strvalue; cfd.taxon = (Boolean) myargs [T_argTaxonLookup].intvalue; cfd.pub = (Boolean) myargs [P_argPubLookup].intvalue; @@ -974,10 +1617,17 @@ Int2 Main (void) starttime = GetSecs (); if (StringDoesHaveText (directory)) { + if (StringCmp (directory, results) == 0) { + Message (MSG_POSTERR, "-r results path must be different than -p data path"); + if (cfd.logfp != NULL) { + fprintf (cfd.logfp, "-r results path must be different than -p data path\n"); + } + } else { - cfd.results = results; + cfd.results = results; - DirExplore (directory, NULL, suffix, FALSE, CleanupOneRecord, (Pointer) &cfd); + DirExplore (directory, NULL, suffix, FALSE, CleanupOneRecord, (Pointer) &cfd); + } } else if (StringDoesHaveText (infile) && StringDoesHaveText (outfile)) { @@ -990,6 +1640,10 @@ Int2 Main (void) runtime = stoptime - starttime; if (cfd.logfp != NULL) { fprintf (cfd.logfp, "Finished in %ld seconds\n", (long) runtime); + if (cfd.cumokay > 0 || cfd.cumnorm > 0 || cfd.cumbsec > 0 || cfd.cumssec > 0) { + fprintf (cfd.logfp, "Cumulative counts - %ld OKAY, %ld NORM, %ld BSEC, %ld SSEC\n", + (long) cfd.cumokay, (long) cfd.cumnorm, (long) cfd.cumbsec, (long) cfd.cumssec); + } FileClose (cfd.logfp); } diff --git a/demo/copymat.c b/demo/copymat.c index cc09a1f7..761503f7 100644 --- a/demo/copymat.c +++ b/demo/copymat.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: copymat.c,v 6.48 2008/02/01 14:04:25 madden Exp $"; +static char const rcsid[] = "$Id: copymat.c,v 6.49 2008/11/04 16:44:38 maning Exp $"; /* * =========================================================================== @@ -36,6 +36,9 @@ Contents: main routines for copymatrices program to convert score matrices output by makematrices into a single byte-encoded file. $Log: copymat.c,v $ +Revision 6.49 2008/11/04 16:44:38 maning +add type cast to fix compilation error + Revision 6.48 2008/02/01 14:04:25 madden LookupTableWrapInit prototype change @@ -448,19 +451,26 @@ static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup) Int4 index; Int4 num_used; Int4 offset_diff; + AaLookupBackboneCell *bbc; + Int4 *ovf; len = lookup->backbone_size; offset_diff = lookup->word_length - 1; + // database assumes backbone type of lookup table + ASSERT(lookup->bone_type == eBackbone); + bbc = (AaLookupBackboneCell *)(lookup->thick_backbone); + ovf = (Int4 *)(lookup->overflow); + /* Walk through table, copying info into mod_lt[] */ for(index = 0; index < len; index++) { - if((num_used=lookup->thick_backbone[index].num_used) <= 3) + if((num_used=bbc[index].num_used) <= 3) { while (num_used > 0) { num_used--; - lookup->thick_backbone[index].payload.entries[num_used] += offset_diff; + bbc[index].payload.entries[num_used] += offset_diff; } } else @@ -468,7 +478,7 @@ static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup) while (num_used > 0) { num_used--; - lookup->overflow [ lookup->thick_backbone[index].payload.overflow_cursor + num_used] += offset_diff; + ovf[ bbc[index].payload.overflow_cursor + num_used] += offset_diff; } } } @@ -493,6 +503,13 @@ Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 Uint4 *new_overflow_cursor; Int4 *src; Int4 first_hit; + AaLookupBackboneCell *bbc; + Int4 *ovf; + + // database assumes backbone type of lookup table + ASSERT(lookup->bone_type == eBackbone); + bbc = (AaLookupBackboneCell *)(lookup->thick_backbone); + ovf = (Int4 *)(lookup->overflow); len = lookup->backbone_size; @@ -501,22 +518,22 @@ Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 /* Walk through table, copying info into mod_lt[] */ for(index = 0; index < len; index++) { - if(lookup->thick_backbone[index].num_used <= 3) + if(bbc[index].num_used <= 3) continue; - src = &(lookup->overflow[lookup->thick_backbone[index].payload.overflow_cursor]); - MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(lookup->thick_backbone[index].num_used-1)); + src = &(ovf[bbc[index].payload.overflow_cursor]); + MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(bbc[index].num_used-1)); mlpp_address = (long) new_overflow_cursor; - new_overflow_cursor += lookup->thick_backbone[index].num_used-1; + new_overflow_cursor += bbc[index].num_used-1; first_hit = src[0]; mlpp_address -= (long) start_address; /* Now this is new relative address - usually small */ - lookup->thick_backbone[index].payload.entries[1] = (Int4) mlpp_address; - lookup->thick_backbone[index].payload.entries[0] = first_hit; + bbc[index].payload.entries[1] = (Int4) mlpp_address; + bbc[index].payload.entries[0] = first_hit; } diff --git a/demo/cspeedtest.c b/demo/cspeedtest.c index 6157d0c3..5b682ac0 100644 --- a/demo/cspeedtest.c +++ b/demo/cspeedtest.c @@ -29,7 +29,7 @@ * * Version Creation Date: 12/17/07 * -* $Revision: 1.10 $ +* $Revision: 1.20 $ * * File Description: * @@ -55,8 +55,16 @@ #include <tofasta.h> #include <asn2gnbk.h> #include <valid.h> +#include <suggslp.h> -#define CSPEEDTEST_APP_VER "1.3" +NLM_EXTERN CharPtr NewCreateDefLine ( + ItemInfoPtr iip, + BioseqPtr bsp, + Boolean ignoreTitle, + Boolean extProtTitle +); + +#define CSPEEDTEST_APP_VER "1.9" CharPtr CSPEEDTEST_APPLICATION = CSPEEDTEST_APP_VER; @@ -69,11 +77,14 @@ typedef struct cspeedflags { Int4 maxcount; CharPtr io; CharPtr clean; + CharPtr skip; CharPtr index; CharPtr seq; CharPtr feat; CharPtr desc; CharPtr verify; + BioseqPtr nucbsp; + Int2 genCode; AsnModulePtr amp; AsnTypePtr atp_bss; AsnTypePtr atp_bsss; @@ -94,6 +105,111 @@ static void DoVisitFeaturesTest ( /* empty visit callback */ } +static void DoVisitCodingRegions ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + BioseqPtr bsp; + CharPtr caret5, caret3; + CSpeedFlagPtr cfp; + Char id [64]; + SeqLocPtr loc, slp; + Boolean partial5, partial3; + SeqIdPtr sip; + Int4 start, stop; + + if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return; + cfp = (CSpeedFlagPtr) userdata; + if (cfp == NULL || cfp->ofp == NULL) return; + + loc = sfp->location; + bsp = BioseqFindFromSeqLoc (loc); + if (bsp == NULL) return; + + StringCpy (id, "?"); + if (sfp->product != NULL) { + sip = SeqLocId (sfp->product); + if (sip != NULL) { + SeqIdWrite (sip, id, PRINTID_FASTA_SHORT, sizeof (id) - 1); + } + } + + fprintf (cfp->ofp, "%s\n", id); + slp = SeqLocFindNext (loc, NULL); + while (slp != NULL) { + start = GetOffsetInBioseq (slp, bsp, SEQLOC_START) + 1; + stop = GetOffsetInBioseq (slp, bsp, SEQLOC_STOP) + 1; + caret5 = ""; + caret3 = ""; + CheckSeqLocForPartial (slp, &partial5, &partial3); + if (partial5) { + caret5 = "<"; + } + if (partial3) { + caret3 = ">"; + } + fprintf (cfp->ofp, "%s%ld\t%s%ld\n", caret5, (long) start, caret3, (long) stop); + slp = SeqLocFindNext (loc, slp); + } +} + +static void DoSuggestIntervals ( + BioseqPtr bsp, + Pointer userdata +) + +{ + CharPtr caret5, caret3; + CSpeedFlagPtr cfp; + Char id [64]; + SeqLocPtr loc, slp; + Boolean partial5, partial3; + SeqAnnotPtr sap; + SeqFeatPtr sfp; + SeqIdPtr sip; + Int4 start, stop; + + if (bsp == NULL) return; + if (! ISA_aa (bsp->mol)) return; + cfp = (CSpeedFlagPtr) userdata; + if (cfp == NULL || cfp->ofp == NULL || cfp->nucbsp == NULL) return; + + sip = SeqIdFindBest (bsp->id, 0); + if (sip == NULL) return; + SeqIdWrite (sip, id, PRINTID_FASTA_SHORT, sizeof (id) - 1); + + sap = SuggestCodingRegion (cfp->nucbsp, bsp, cfp->genCode); + if (sap == NULL) return; + if (sap->type == 1) { + sfp = (SeqFeatPtr) sap->data; + if (sfp != NULL && sfp->data.choice == SEQFEAT_CDREGION) { + loc = sfp->location; + if (loc != NULL) { + fprintf (cfp->ofp, "%s\n", id); + slp = SeqLocFindNext (loc, NULL); + while (slp != NULL) { + start = GetOffsetInBioseq (slp, cfp->nucbsp, SEQLOC_START) + 1; + stop = GetOffsetInBioseq (slp, cfp->nucbsp, SEQLOC_STOP) + 1; + caret5 = ""; + caret3 = ""; + CheckSeqLocForPartial (slp, &partial5, &partial3); + if (partial5) { + caret5 = "<"; + } + if (partial3) { + caret3 = ">"; + } + fprintf (cfp->ofp, "%s%ld\t%s%ld\n", caret5, (long) start, caret3, (long) stop); + slp = SeqLocFindNext (loc, slp); + } + } + } + } + SeqAnnotFree (sap); +} + static void DoGeneOverlapPrintTest ( SeqFeatPtr sfp, Pointer userdata @@ -215,6 +331,105 @@ static void DoFastaDefline ( } } +static void DoNewFastaDefline ( + BioseqPtr bsp, + Pointer userdata +) + +{ + BioseqSetPtr bssp; + CSpeedFlagPtr cfp; + Char id [128]; + CharPtr title; + + if (bsp == NULL) return; + cfp = (CSpeedFlagPtr) userdata; + if (cfp == NULL) return; + + if (StringChr (cfp->skip, 's') != NULL) { + if (bsp->idx.parenttype == OBJ_BIOSEQSET) { + bssp = (BioseqSetPtr) bsp->idx.parentptr; + if (bssp != NULL) { + if (bssp->_class == BioseqseqSet_class_segset || + bssp->_class == BioseqseqSet_class_parts) return; + } + } + } + if (StringChr (cfp->skip, 'v') != NULL) { + if (bsp->repr == Seq_repr_virtual) return; + } + + id [0] = '\0'; + SeqIdWrite (bsp->id, id, PRINTID_FASTA_LONG, sizeof (id) - 1); + title = NewCreateDefLine (NULL, bsp, FALSE, FALSE); + if (StringHasNoText (title)) { + title = StringSave ("?"); + } + + if (cfp->ofp != NULL) { + fprintf (cfp->ofp, ">%s %s\n", id, title); + } + + MemFree (title); +} + +static void DoFastaComp ( + BioseqPtr bsp, + Pointer userdata, + Boolean ignoreExisting +) + +{ + Char buf [4096]; + CSpeedFlagPtr cfp; + Char id [128]; + CharPtr title; + + if (bsp == NULL) return; + cfp = (CSpeedFlagPtr) userdata; + if (cfp == NULL) return; + + id [0] = '\0'; + SeqIdWrite (bsp->id, id, PRINTID_FASTA_LONG, sizeof (id) - 1); + buf [0] = '\0'; + CreateDefLineExEx (NULL, bsp, buf, sizeof (buf) - 1, 0, + NULL, NULL, ignoreExisting, FALSE); + title = NewCreateDefLine (NULL, bsp, ignoreExisting, FALSE); + if (StringHasNoText (title)) { + title = StringSave ("?"); + } + + if (StringCmp (buf, title) != 0) { + if (cfp->ofp != NULL) { + fprintf (cfp->ofp, "< %s %s\n", id, buf); + fprintf (cfp->ofp, "> %s %s\n", id, title); + } + printf ("< %s %s\n", id, buf); + printf ("> %s %s\n", id, title); + fflush (stdout); + } + + MemFree (title); +} + +static void DoFastaExist ( + BioseqPtr bsp, + Pointer userdata +) + +{ + DoFastaComp (bsp, userdata, FALSE); +} + +static void DoFastaRegen ( + BioseqPtr bsp, + Pointer userdata +) + +{ + DoFastaComp (bsp, userdata, TRUE); +} + static void DoFastaFeat ( SeqFeatPtr sfp, Pointer userdata @@ -344,6 +559,20 @@ static void LIBCALLBACK ValidCallback ( fprintf (fp, "\n"); } +static void MarkTitles ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + ObjValNodePtr ovn; + + if (sdp == NULL || sdp->choice != Seq_descr_title) return; + if (sdp->extended == 0) return; + ovn = (ObjValNodePtr) sdp; + ovn->idx.deleteme = TRUE; +} + static void DoProcess ( SeqEntryPtr sep, Uint2 entityID, @@ -351,11 +580,19 @@ static void DoProcess ( ) { + Char id [64]; ErrSev oldErrSev; ValidStructPtr vsp; if (sep == NULL || cfp == NULL) return; + if (StringChr (cfp->clean, 't') != NULL) { + VisitDescriptorsInSep (sep, NULL, MarkTitles); + DeleteMarkedObjects (entityID, 0, NULL); + } + if (StringChr (cfp->clean, 'a') != NULL) { + AssignIDsInEntity (entityID, 0, NULL); + } if (StringChr (cfp->clean, 'b') != NULL) { BasicSeqEntryCleanup (sep); } @@ -367,15 +604,49 @@ static void DoProcess ( SeqMgrIndexFeatures (entityID, 0); } + if (StringChr (cfp->seq, 'c') != NULL) { + VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaExist); + } + if (StringChr (cfp->seq, 'C') != NULL) { + VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaRegen); + } if (StringChr (cfp->seq, 's') != NULL) { VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaSeq); } + if (StringChr (cfp->seq, 'S') != NULL) { + if (SeqMgrFeaturesAreIndexed (entityID) == 0) { + SeqMgrIndexFeatures (entityID, 0); + } + VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaSeq); + } if (StringChr (cfp->seq, 'r') != NULL) { VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaRaw); } if (StringChr (cfp->seq, 'd') != NULL) { VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaDefline); } + if (StringChr (cfp->seq, 'D') != NULL) { + if (SeqMgrFeaturesAreIndexed (entityID) == 0) { + SeqMgrIndexFeatures (entityID, 0); + } + VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaDefline); + } + if (StringChr (cfp->seq, 'T') != NULL) { + VisitDescriptorsInSep (sep, NULL, MarkTitles); + DeleteMarkedObjects (entityID, 0, NULL); + SeqMgrIndexFeatures (entityID, 0); + VisitBioseqsInSep (sep, (Pointer) cfp, DoFastaDefline); + } + if (StringChr (cfp->seq, 'x') != NULL) { + VisitBioseqsInSep (sep, (Pointer) cfp, DoNewFastaDefline); + } + if (StringChr (cfp->seq, 'X') != NULL) { + VisitDescriptorsInSep (sep, NULL, MarkTitles); + DeleteMarkedObjects (entityID, 0, NULL); + SeqMgrIndexFeatures (entityID, 0); + VisitBioseqsInSep (sep, (Pointer) cfp, DoNewFastaDefline); + } + if (StringChr (cfp->seq, 'f') != NULL) { VisitFeaturesInSep (sep, (Pointer) cfp, DoFastaFeat); } @@ -408,6 +679,39 @@ static void DoProcess ( SeqEntryToGnbk (sep, NULL, FTABLE_FMT, SEQUIN_MODE, NORMAL_STYLE, 0, 0, SHOW_PROT_FTABLE, NULL, cfp->ofp); } + if (StringChr (cfp->feat, 's') != NULL) { + if (SeqMgrFeaturesAreIndexed (entityID) == 0) { + SeqMgrIndexFeatures (entityID, 0); + } + cfp->nucbsp = FindNucBioseq (sep); + if (cfp->nucbsp != NULL) { + BioseqToGeneticCode (cfp->nucbsp, &(cfp->genCode), NULL, NULL, NULL, 0, NULL); + SeqIdWrite (cfp->nucbsp->id, id, PRINTID_FASTA_LONG, sizeof (id) - 1); + fprintf (cfp->ofp, "%s\n", id); + VisitBioseqsInSep (sep, (Pointer) cfp, DoSuggestIntervals); + cfp->nucbsp = NULL; + cfp->genCode = 0; + } + } + if (StringChr (cfp->feat, 'S') != NULL) { + if (SeqMgrFeaturesAreIndexed (entityID) == 0) { + SeqMgrIndexFeatures (entityID, 0); + } + cfp->nucbsp = FindNucBioseq (sep); + if (cfp->nucbsp != NULL) { + BioseqToGeneticCode (cfp->nucbsp, &(cfp->genCode), NULL, NULL, NULL, 0, NULL); + SetBatchSuggestNucleotide (cfp->nucbsp, cfp->genCode); + SeqIdWrite (cfp->nucbsp->id, id, PRINTID_FASTA_LONG, sizeof (id) - 1); + fprintf (cfp->ofp, "%s\n", id); + VisitBioseqsInSep (sep, (Pointer) cfp, DoSuggestIntervals); + ClearBatchSuggestNucleotide (); + cfp->nucbsp = NULL; + cfp->genCode = 0; + } + } + if (StringChr (cfp->feat, 'c') != NULL) { + VisitFeaturesInSep (sep, (Pointer) cfp, DoVisitCodingRegions); + } if (StringChr (cfp->desc, 'b') != NULL) { } @@ -782,6 +1086,7 @@ static void ProcessMultipleRecord ( } starttime = GetSecs (); + for (x = 0; x < cfp->maxcount; x++) { DoProcess (sep, entityID, cfp); } @@ -856,11 +1161,12 @@ static void ProcessOneRecord ( #define X_argMaxCount 11 #define O_argInOut 12 #define K_argClean 13 -#define I_argIndex 14 -#define S_argSeq 15 -#define F_argFeat 16 -#define D_argDesc 17 -#define V_argVerify 18 +#define P_argSkip 14 +#define I_argIndex 15 +#define S_argSeq 16 +#define F_argFeat 17 +#define D_argDesc 18 +#define V_argVerify 19 Args myargs [] = { {"Path to Files", NULL, NULL, NULL, @@ -901,16 +1207,29 @@ Args myargs [] = { " wb Write Binary ASN.1", NULL, NULL, NULL, TRUE, 'O', ARG_STRING, 0.0, 0, NULL}, {"Cleanup\n" + " t Remove Titles\n" + " a AssignIDsInEntity\n" " b BasicSeqEntryCleanup\n" " s SeriousSeqEntryCleanup", NULL, NULL, NULL, TRUE, 'K', ARG_STRING, 0.0, 0, NULL}, + {"Skip\n" + " s Segmented Set Components\n" + " v Virtual Bioseqs", NULL, NULL, NULL, + TRUE, 'P', ARG_STRING, 0.0, 0, NULL}, {"Index\n" " f Feature Indexing", NULL, NULL, NULL, TRUE, 'I', ARG_STRING, 0.0, 0, NULL}, {"Sequence\n" + " c Compare FASTA Deflines\n" + " C Compare Regenerated FASTA Deflines\n" " s FASTA of Sequence\n" + " S Indexed FASTA\n" " r Raw FASTA no Defline\n" " d Just FASTA Defline\n" + " D Indexed FASTA Defline\n" + " T Regenerate FASTA Titles\n" + " x New FASTA Titles\n" + " X Regenerate new FASTA Titles\n" " f FASTA by Feature\n" " t FASTA of Translation", NULL, NULL, NULL, TRUE, 'S', ARG_STRING, 0.0, 0, NULL}, @@ -921,7 +1240,10 @@ Args myargs [] = { " x Gene by Xref\n" " o Operon by Overlap\n" " d Feature by ID\n" - " t Feature Table", NULL, NULL, NULL, + " t Feature Table\n" + " s Slow Suggest Intervals\n" + " S Indexed Suggest Intervals\n" + " c Coding Region Intervals", NULL, NULL, NULL, TRUE, 'F', ARG_STRING, 0.0, 0, NULL}, {"Descriptor\n" " b BioSource\n" @@ -1042,6 +1364,7 @@ Int2 Main (void) cfd.io = myargs [O_argInOut].strvalue; cfd.clean = myargs [K_argClean].strvalue; + cfd.skip = myargs [P_argSkip].strvalue; cfd.index = myargs [I_argIndex].strvalue; cfd.seq = myargs [S_argSeq].strvalue; cfd.feat = myargs [F_argFeat].strvalue; @@ -1089,6 +1412,7 @@ Int2 Main (void) fprintf (cfd.logfp, "Finished in %ld seconds\n", (long) runtime); FileClose (cfd.logfp); } + printf ("Finished in %ld seconds\n", (long) runtime); if (remote) { PubSeqFetchDisable (); diff --git a/demo/entrez2.c b/demo/entrez2.c index 3977579d..b5e4633e 100644 --- a/demo/entrez2.c +++ b/demo/entrez2.c @@ -29,7 +29,7 @@ * * Version Creation Date: 06/16/00 * -* $Revision: 6.28 $ +* $Revision: 6.29 $ * * File Description: * @@ -61,7 +61,7 @@ #include <entrez2.h> -#define ENTREZ_APP_VERSION "9.5" +#define ENTREZ_APP_VERSION "9.6" #define MAX_QUERY_FORMS 256 diff --git a/demo/formatrpsdb.c b/demo/formatrpsdb.c index f6420407..35276ce0 100644 --- a/demo/formatrpsdb.c +++ b/demo/formatrpsdb.c @@ -1,4 +1,4 @@ -static char const rcsid[] = "$Id: formatrpsdb.c,v 1.25 2007/05/07 13:29:11 kans Exp $"; +static char const rcsid[] = "$Id: formatrpsdb.c,v 1.28 2008/11/04 16:26:59 boratyng Exp $"; /***************************************************************************** @@ -38,6 +38,17 @@ static char const rcsid[] = "$Id: formatrpsdb.c,v 1.25 2007/05/07 13:29:11 kans *************************************************************************** $Log: formatrpsdb.c,v $ + Revision 1.28 2008/11/04 16:26:59 boratyng + Synchronized with the new BlastAaLookupTable implementation + + Revision 1.27 2008/08/13 13:33:21 ucko + Correct previous revision to build even with strict compilers (MSVC, MIPSpro): + - In FileWriteInChunks, cast ptr to char* to allow adding to it. + - In RPS_DbClose, drop unused mid-block declaration of chunk. + + Revision 1.26 2008/08/12 16:53:12 boratyng + Added function that calls FileWrite for chunks of data in order to aviod FileWrite warning: size > SIZE_MAX + Revision 1.25 2007/05/07 13:29:11 kans added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr) @@ -93,12 +104,12 @@ static char const rcsid[] = "$Id: formatrpsdb.c,v 1.25 2007/05/07 13:29:11 kans 1. Modify scoremat IO to comply with new scoremat spec 2. Remove check that residue frequencies read from scoremat are <= 1.0 3. Add input argument to specify the underlying score matrix, or to - use the score matrix specified in the scoremat if present + use the score matrix specified in the scoremat if present Revision 1.8 2004/09/15 18:06:13 papadopo 1. Verify that the scale factor is the same for all input scoremats 2. Do not use the scoremat 'identifier' field to determine the underlying - score matrix; hardwire to BLOSUM62 temporarily + score matrix; hardwire to BLOSUM62 temporarily 3. Use BlastSeqLoc's instead of ListNodes Revision 1.7 2004/08/25 14:47:50 camacho @@ -897,6 +908,24 @@ Int2 RPSAddSequence(RPS_DbInfo *info, return 0; } +/* Writes data to file in chunks in order to avoid allocations larger than + SIZE_MAX in FileWrite */ + +size_t FileWriteInChunks(const void* ptr, size_t size, size_t n, FILE* stream) +{ + size_t chunk = SIZE_MAX / size; + size_t i; + size_t count = 0; + + for (i=0;i < n;i+=chunk) { + count += FileWrite((char*)ptr + i*size, size, + (n-i < chunk ? n-i : chunk), stream); + } + + return count; +} + + /* Once all sequences have been processed, perform final setup on the BLAST lookup table and finish up the RPS files */ @@ -913,7 +942,7 @@ void RPS_DbClose(RPS_DbInfo *info) /* Pack the lookup table into its compressed form */ - if (BlastAaLookupFinalize(info->lookup) != 0) { + if (BlastAaLookupFinalize(info->lookup, eBackbone) != 0) { ErrPostEx(SEV_WARNING, 0, 0, "Failed to compress lookup table"); } else { @@ -933,7 +962,8 @@ void RPS_DbClose(RPS_DbInfo *info) /* for each lookup table cell */ for (index = cursor = 0; index < lut->backbone_size; index++) { - cell = &lut->thick_backbone[index]; + cell = (AaLookupBackboneCell*)lut->thick_backbone + index; + if (cell->num_used == 0) continue; @@ -956,11 +986,12 @@ void RPS_DbClose(RPS_DbInfo *info) offsets as well */ old_cursor = cell->payload.overflow_cursor; - cell->payload.entries[0] = lut->overflow[old_cursor] + + cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] + BLAST_WORDSIZE_PROT - 1; cell->payload.entries[1] = cursor * sizeof(Int4); for (i = 1; i < cell->num_used; i++, cursor++) { - lut->overflow[cursor] = lut->overflow[old_cursor + i] + + ((Int4*)lut->overflow)[cursor] + = ((Int4*)lut->overflow)[old_cursor + i] + BLAST_WORDSIZE_PROT - 1; } } @@ -976,9 +1007,11 @@ void RPS_DbClose(RPS_DbInfo *info) FileWrite(&header, sizeof(header), 1, info->lookup_fd); /* write the thick backbone */ - - FileWrite(lut->thick_backbone, sizeof(RPSBackboneCell), + + + FileWriteInChunks(lut->thick_backbone, sizeof(RPSBackboneCell), lut->backbone_size, info->lookup_fd); + /* write extra backbone cells */ @@ -989,7 +1022,7 @@ void RPS_DbClose(RPS_DbInfo *info) /* write the new overflow array */ - FileWrite(lut->overflow, sizeof(Int4), cursor, info->lookup_fd); + FileWriteInChunks(lut->overflow, sizeof(Int4), cursor, info->lookup_fd); } /* Free data, close files */ diff --git a/demo/nps2gps.c b/demo/nps2gps.c index d926a85b..6656fac4 100644 --- a/demo/nps2gps.c +++ b/demo/nps2gps.c @@ -29,7 +29,7 @@ * * Version Creation Date: 5/12/05 * -* $Revision: 1.13 $ +* $Revision: 1.14 $ * * File Description: * @@ -50,7 +50,7 @@ #include <toasn3.h> #include <pmfapi.h> -#define NPS2GPSAPP_VER "2.1" +#define NPS2GPSAPP_VER "2.2" CharPtr NPS2GPSAPPLICATION = NPS2GPSAPP_VER; @@ -61,6 +61,7 @@ typedef struct n2gdata { Boolean lock; Boolean byFeatID; Boolean useProtID; + Boolean refSeqTitles; } N2GData, PNTR N2GPtr; typedef struct npsseqs { @@ -279,7 +280,8 @@ static void LclCopyGene ( static void LclAddMrnaTitles ( SeqLocPtr slp, - Pointer userdata + CharPtr organism, + Boolean refSeqTitles ) { @@ -289,7 +291,6 @@ static void LclAddMrnaTitles ( SeqMgrFeatContext gcontext; CharPtr genelabel = NULL; size_t len; - CharPtr organism; SeqFeatPtr sfp; CharPtr str; @@ -297,7 +298,6 @@ static void LclAddMrnaTitles ( bsp = BioseqFindFromSeqLoc (slp); if (bsp == NULL) return; if (! ISA_na (bsp->mol)) return; - organism = (CharPtr) userdata; if (BioseqGetTitle (bsp) != NULL) return; sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gcontext); if (sfp != NULL) { @@ -333,9 +333,18 @@ static void LclAddMrnaTitles ( } if (cdslabel != NULL && genelabel != NULL) { if (ccontext.partialL || ccontext.partialR) { - StringCat (str, " mRNA, partial cds."); + if (refSeqTitles) { + StringCat (str, " partial mRNA."); + } else { + StringCat (str, " mRNA, partial cds."); + } } else { - StringCat (str, " mRNA, complete cds."); + if (refSeqTitles) { + /* requested to make all mRNAs partial in defline */ + StringCat (str, " partial mRNA."); + } else { + StringCat (str, " mRNA, complete cds."); + } } } else if (genelabel != NULL) { StringCat (str, " mRNA."); @@ -808,7 +817,7 @@ static void NPStoGPS ( sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &mcontext); while (sfp != NULL) { - LclAddMrnaTitles (sfp->product, organism); + LclAddMrnaTitles (sfp->product, organism, ngp->refSeqTitles); sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_mRNA, &mcontext); } @@ -976,16 +985,17 @@ static void ProcessOneRecord ( /* Args structure contains command-line arguments */ -#define p_argInputPath 0 -#define r_argOutputPath 1 -#define i_argInputFile 2 -#define o_argOutputFile 3 -#define f_argFilter 4 -#define x_argSuffix 5 -#define R_argRemote 6 -#define L_argLockFar 7 -#define F_argUseFeatID 8 -#define P_argUseProtID 9 +#define p_argInputPath 0 +#define r_argOutputPath 1 +#define i_argInputFile 2 +#define o_argOutputFile 3 +#define f_argFilter 4 +#define x_argSuffix 5 +#define R_argRemote 6 +#define L_argLockFar 7 +#define F_argUseFeatID 8 +#define P_argUseProtID 9 +#define D_argRefSeqTitles 10 Args myargs [] = { @@ -1009,6 +1019,8 @@ Args myargs [] = { TRUE, 'F', ARG_BOOLEAN, 0.0, 0, NULL}, {"mRNA ID from Protein", "F", NULL, NULL, TRUE, 'P', ARG_BOOLEAN, 0.0, 0, NULL}, + {"RefSeq mRNA Titles", "F", NULL, NULL, + TRUE, 'D', ARG_BOOLEAN, 0.0, 0, NULL}, }; Int2 Main (void) @@ -1062,6 +1074,7 @@ Int2 Main (void) ngd.lock = (Boolean) myargs [L_argLockFar].intvalue; ngd.byFeatID = (Boolean) myargs [F_argUseFeatID].intvalue; ngd.useProtID = (Boolean) myargs [P_argUseProtID].intvalue; + ngd.refSeqTitles = (Boolean) myargs [D_argRefSeqTitles].intvalue; directory = (CharPtr) myargs [p_argInputPath].strvalue; results = (CharPtr) myargs [r_argOutputPath].strvalue; diff --git a/demo/rpsblast.c b/demo/rpsblast.c index f476d692..fd85ee1a 100644 --- a/demo/rpsblast.c +++ b/demo/rpsblast.c @@ -1,6 +1,6 @@ -static char const rcsid[] = "$Id: rpsblast.c,v 6.92 2007/08/21 20:07:01 kans Exp $"; +static char const rcsid[] = "$Id: rpsblast.c,v 6.93 2008/07/23 14:06:57 madden Exp $"; -/* $Id: rpsblast.c,v 6.92 2007/08/21 20:07:01 kans Exp $ +/* $Id: rpsblast.c,v 6.93 2008/07/23 14:06:57 madden Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE @@ -31,12 +31,15 @@ static char const rcsid[] = "$Id: rpsblast.c,v 6.92 2007/08/21 20:07:01 kans Exp * * Initial Version Creation Date: 12/14/1999 * -* $Revision: 6.92 $ +* $Revision: 6.93 $ * * File Description: * Main file for RPS BLAST program * * $Log: rpsblast.c,v $ +* Revision 6.93 2008/07/23 14:06:57 madden +* Fix ASN.1 output (JIRA SB-89) +* * Revision 6.92 2007/08/21 20:07:01 kans * include gencode_singleton.h, cast first argument to BlastFormattingInfoNew to fix CodeWarrior complaint * @@ -575,6 +578,7 @@ Int2 Main(void) Blast_SummaryReturn* full_sum_returns = NULL; Boolean believe_query = (Boolean) myargs[OPT_BELIEVE_QUERY].intvalue; Char buf[256] = { '\0' }; + BlastFormattingInfo* asn_format_info = NULL; GeneticCodeSingletonInit(); StringCpy(buf, "rpsblast "); @@ -654,6 +658,20 @@ Int2 Main(void) believe_query); BLAST_PrintOutputHeader(format_info); + if (myargs[OPT_ASNOUT].strvalue) { + /* This just prints out the ASN.1 to a secondary file. */ + BlastFormattingInfoNew(eAlignViewAsnText, options, + blast_program, dbname, + myargs[OPT_ASNOUT].strvalue, &asn_format_info); + + BlastFormattingInfoSetUpOptions(asn_format_info, + myargs[OPT_NUM_DESC].intvalue, + myargs[OPT_NUM_DESC].intvalue, + FALSE, + FALSE, + FALSE, + TRUE); + } /* Loop over sets of queries. */ while (1) { @@ -719,23 +737,9 @@ Int2 Main(void) /* format results */ if (myargs[OPT_ASNOUT].strvalue) { - /* This just prints out the ASN.1 to a secondary file. */ - BlastFormattingInfo* asn_format_info = NULL; - BlastFormattingInfoNew(eAlignViewAsnText, options, - blast_program, dbname, - myargs[OPT_ASNOUT].strvalue, &asn_format_info); - - BlastFormattingInfoSetUpOptions(asn_format_info, - myargs[OPT_NUM_DESC].intvalue, - myargs[OPT_NUM_DESC].intvalue, - FALSE, - FALSE, - FALSE, - TRUE); status = BLAST_FormatResults(seqalign_arr, num_queries, query_slp, NULL, asn_format_info, sum_returns); - asn_format_info = BlastFormattingInfoFree(asn_format_info); } status = @@ -756,6 +760,8 @@ Int2 Main(void) if (infp) FileClose(infp); + if (asn_format_info) + asn_format_info = BlastFormattingInfoFree(asn_format_info); /* Print the footer with summary information. */ Blast_PrintOutputFooter(format_info, full_sum_returns); diff --git a/demo/scantest.c b/demo/scantest.c index e3fff721..f7052b48 100644 --- a/demo/scantest.c +++ b/demo/scantest.c @@ -29,7 +29,7 @@ * * Version Creation Date: 1/20/95 * -* $Revision: 6.4 $ +* $Revision: 6.13 $ * * File Description: * template for custom scans of ASN.1 release files @@ -52,6 +52,7 @@ #include <sequtil.h> #include <sqnutils.h> #include <explore.h> +#include <toasn3.h> typedef struct appflags { Boolean binary; @@ -61,79 +62,765 @@ typedef struct appflags { Char id [64]; } AppFlagData, PNTR AppFlagPtr; -static void DoOneUser (UserObjectPtr uop, Pointer userdata) +static CharPtr Se2Str ( + SeqEntryPtr sep +) { - AppFlagPtr afp; - Char buf [128]; - ObjectIdPtr oip; + AsnIoBSPtr aibp; + ByteStorePtr bs; + CharPtr str; - if (uop == NULL) return; - afp = (AppFlagPtr) userdata; - if (afp == NULL) return; + if (sep == NULL) return NULL; + + bs = BSNew (1000); + if (bs == NULL) return NULL; + aibp = AsnIoBSOpen ("w", bs); + if (aibp == NULL) return NULL; + + SeqEntryAsnWrite (sep, aibp->aip, NULL); + + AsnIoFlush (aibp->aip); + AsnIoBSClose (aibp); + + str = BSMerge (bs, NULL); + BSFree (bs); + + return str; +} + +typedef struct chgdata { + Boolean rubisco; + Boolean rbc; + Boolean its; + Boolean sgml; + Boolean rnaother; + Boolean trnanote; + Boolean oldbiomol; + Boolean badname; + Int4 protdesc; + Int4 sfpnote; + Int4 gbsource; + Int4 cdsconf; + AppFlagPtr afp; +} ChangeData, PNTR ChangeDataPtr; + +static Boolean IsRubisco ( + CharPtr name +) + +{ + return (StringICmp (name, "rubisco large subunit") == 0 || + StringICmp (name, "rubisco small subunit") == 0); +} + +static Boolean IsRbc ( + CharPtr name +) + +{ + return (StringICmp (name, "RbcL") == 0 || + StringICmp (name, "RbcS") == 0); +} - buf [0] = '\0'; - if (StringDoesHaveText (uop->_class)) { - StringCat (buf, uop->_class); - } - StringCat (buf, " "); - buf [30] = '\0'; - fprintf (afp->fp, "%s", buf); - - buf [0] = '\0'; - oip = uop->type; - if (oip != NULL) { - if (StringDoesHaveText (oip->str)) { - StringCat (buf, oip->str); - } else if (oip->id > 0) { - sprintf (buf, "%ld", (long) oip->id); +static Boolean IsITS ( + CharPtr name +) + +{ + return (StringICmp (name, "its1") == 0 || + StringICmp (name, "its 1") == 0 || + StringICmp (name, "its2") == 0 || + StringICmp (name, "its 2") == 0 || + StringICmp (name, "its3") == 0 || + StringICmp (name, "its 3") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 1") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 2") == 0 || + StringICmp (name, "Ribosomal DNA internal transcribed spacer 3") == 0 || + StringICmp (name, "internal transcribed spacer 1 (ITS1)") == 0 || + StringICmp (name, "internal transcribed spacer 2 (ITS2)") == 0 || + StringICmp (name, "internal transcribed spacer 3 (ITS3)") == 0); +} + +static Boolean HasSgml ( + CharPtr str +) + +{ + Int2 ascii_len; + Char buf [1024]; + + if (StringHasNoText (str)) return FALSE; + + ascii_len = Sgml2AsciiLen (str); + if (ascii_len + 2 > sizeof (buf)) return FALSE; + + Sgml2Ascii (str, buf, ascii_len + 1); + if (StringCmp (str, buf) != 0) { + return TRUE; + } + + return FALSE; +} + +static void ScoreFeature ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + ChangeDataPtr cdp; + CharPtr comment; + CdRegionPtr crp; + CharPtr desc; + GBQualPtr gbq; + GeneRefPtr grp; + CharPtr name; + ProtRefPtr prp; + Uint1 residue; + RnaRefPtr rrp; + CharPtr str; + ValNodePtr vnp; + + if (sfp == NULL) return; + cdp = (ChangeDataPtr) userdata; + if (cdp == NULL) return; + + comment = sfp->comment; + if (StringDoesHaveText (comment)) { + (cdp->sfpnote)++; + } + + /* skip feature types that do not use data.value.ptrvalue */ + switch (sfp->data.choice) { + case SEQFEAT_COMMENT: + case SEQFEAT_BOND: + case SEQFEAT_SITE: + case SEQFEAT_PSEC_STR: + return; + default: + break; + } + + if (sfp->data.value.ptrvalue == NULL) return; + + switch (sfp->data.choice) { + case SEQFEAT_GENE: + grp = (GeneRefPtr) sfp->data.value.ptrvalue; + if (HasSgml (grp->locus)) { + cdp->sgml = TRUE; + } + if (HasSgml (grp->desc)) { + cdp->sgml = TRUE; + } + for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + if (HasSgml (str)) { + cdp->sgml = TRUE; + } + } + break; + case SEQFEAT_CDREGION: + crp = (CdRegionPtr) sfp->data.value.ptrvalue; + if (crp->conflict) { + (cdp->cdsconf)++; + } + break; + case SEQFEAT_PROT: + prp = (ProtRefPtr) sfp->data.value.ptrvalue; + desc = prp->desc; + if (StringDoesHaveText (desc)) { + (cdp->protdesc)++; + } + for (vnp = prp->name; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + if (IsRubisco (str)) { + cdp->rubisco = TRUE; + } + if (IsRbc (str)) { + cdp->rbc = TRUE; + } + } + break; + case SEQFEAT_RNA : + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + if (rrp->type == 255 && rrp->ext.choice == 1) { + name = (CharPtr) rrp->ext.value.ptrvalue; + if (StringCmp (name, "misc_RNA") == 0) { + for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { + if (StringCmp (gbq->qual, "product") != 0) continue; + name = gbq->val; + if (StringHasNoText (name)) continue; + if (IsITS (name)) { + cdp->its = TRUE; + } + } + } else if (StringCmp (name, "ncRNA") == 0 || StringCmp (name, "tmRNA") == 0) { + } else { + cdp->rnaother = TRUE; + if (IsITS (name)) { + cdp->its = TRUE; + } + } + } else if (rrp->type == 3 && rrp->ext.choice == 2) { + if (StringDoesHaveText (comment)) { + if (StringNCmp (comment, "aa: ", 4) == 0) { + comment += 4; + } + residue = FindTrnaAA3 (comment); + if (residue > 0 && residue != 255) { + cdp->trnanote = TRUE; + } + residue = FindTrnaAA (comment); + if (residue > 0 && residue != 255) { + cdp->trnanote = TRUE; + } + } + } + break; + default: + break; + } +} + +static void ScoreDescriptor ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + ChangeDataPtr cdp; + GBBlockPtr gbp; + MolInfoPtr mip; + + if (sdp == NULL) return; + cdp = (ChangeDataPtr) userdata; + if (cdp == NULL) return; + + switch (sdp->choice) { + case Seq_descr_genbank : + gbp = (GBBlockPtr) sdp->data.ptrvalue; + if (gbp != NULL) { + if (StringDoesHaveText (gbp->source)) { + (cdp->gbsource)++; + } + } + break; + case Seq_descr_molinfo : + mip = (MolInfoPtr) sdp->data.ptrvalue; + if (mip != NULL) { + switch (mip->biomol) { + case MOLECULE_TYPE_SNRNA: + case MOLECULE_TYPE_SCRNA: + case MOLECULE_TYPE_SNORNA: + cdp->oldbiomol = TRUE; + break; + default : + break; + } + } + break; + default : + break; + } +} + +static void CheckForChanges ( + SeqEntryPtr sep, + ChangeDataPtr cdp +) + +{ + if (sep == NULL || cdp == NULL) return; + + VisitFeaturesInSep (sep, (Pointer) cdp, ScoreFeature); + VisitDescriptorsInSep (sep, (Pointer) cdp, ScoreDescriptor); +} + +static void ModGenes ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + ModernizeGeneFields (sfp); +} + +static void ModRNAs ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + ModernizeRNAFields (sfp); +} + +static void ModPCRs ( + BioSourcePtr biop, + Pointer userdata +) + +{ + BoolPtr namP; + PCRPrimerPtr ppp; + PCRReactionPtr prp; + + if (biop == NULL) return; + + ModernizePCRPrimers (biop); + + namP = (BoolPtr) userdata; + if (namP == NULL) return; + + for (prp = biop->pcr_primers; prp != NULL; prp = prp->next) { + if (prp->forward == NULL || prp->reverse == NULL) { + *namP = TRUE; + return; + } + for (ppp = prp->forward; ppp != NULL; ppp = ppp->next) { + if (StringHasNoText (ppp->seq) && StringDoesHaveText (ppp->name)) { + *namP = TRUE; + return; + } + } + for (ppp = prp->reverse; ppp != NULL; ppp = ppp->next) { + if (StringHasNoText (ppp->seq) && StringDoesHaveText (ppp->name)) { + *namP = TRUE; + return; + } } } - StringCat (buf, " "); - buf [30] = '\0'; - fprintf (afp->fp, "%s", buf); +} - if (afp->verbose) { - fprintf (afp->fp, " %s", afp->id); +static void TestForRubisco ( + CharPtr str, + AppFlagPtr afp, + CharPtr prefix, + CharPtr remainder +) + +{ + if (StringHasNoText (str)) return; + if (afp == NULL || afp->fp == NULL) return; + + if (StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit") == 0) return; + if (StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit") == 0) return; + if (StringStr (str, "ribulose") == NULL || StringStr (str, "bisphosphate") == NULL) return; + + if (StringHasNoText (prefix)) { + prefix = "?"; } - fprintf (afp->fp, "\n"); + if (StringStr (str, "methyltransferase") == NULL) { + if (StringICmp (str, "ribulose 1,5-bisphosphate carboxylase/oxygenase large subunit") == 0 || + StringICmp (str, "ribulose 1,5-bisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose bisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose-bisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose-1,5-bisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose-1,5-bisphosphate carboxylase, large subunit") == 0 || + StringICmp (str, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxygenase") == 0 || + StringICmp (str, "ribulose-1,5-bisphosphate carboxylase oxygenase large subunit") == 0 || + StringICmp (str, "ribulose bisphosphate carboxylase large chain") == 0 || + StringICmp (str, "ribulose 1,5-bisphosphate carboxylase-oxygenase large subunit") == 0 || + StringICmp (str, "ribulose bisphosphate carboxylase oxygenase large subunit") == 0 || + StringICmp (str, "ribulose 1,5 bisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase, large subunit") == 0 || + StringICmp (str, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxgenase") == 0 || + StringICmp (str, "ribulose bisphosphate carboxylase/oxygenase large subunit") == 0 || + StringICmp (str, "ribulose-1,5-bisphosphate carboxylase oxygenase, large subunit") == 0 || + StringICmp (str, "ribulose 5-bisphosphate carboxylase, large subunit") == 0 || + StringICmp (str, "ribulosebisphosphate carboxylase large subunit") == 0 || + StringICmp (str, "ribulose bisphosphate large subunit") == 0 || + StringICmp (str, "ribulose 1,5 bisphosphate carboxylase/oxygenase large subunit") == 0 || + StringICmp (str, "ribulose 1,5-bisphosphate carboxylase/oxygenase large chain") == 0 || + StringICmp (str, "large subunit ribulose-1,5-bisphosphate carboxylase/oxygenase") == 0 || + StringICmp (str, "ribulose-bisphosphate carboxylase, large subunit") == 0 || + StringICmp (str, "ribulose-1, 5-bisphosphate carboxylase/oxygenase large-subunit") == 0) { + if (afp->verbose) { + fprintf (afp->fp, "%s\t%s\t%s\n", prefix, afp->id, str); + } else { + fprintf (afp->fp, "%s %s\n", prefix, afp->id); + } + fflush (afp->fp); + return; + } + } + + if (StringHasNoText (remainder)) { + remainder = "?"; + } + if (afp->verbose) { + fprintf (afp->fp, "%s\t%s\t%s\n", remainder, afp->id, str); + } else { + fprintf (afp->fp, "%s %s\n", remainder, afp->id); + } fflush (afp->fp); } -static void DoOneDescriptor (SeqDescrPtr sdp, Pointer userdata) +static void TrailingCommaFix ( + CharPtr str, + AppFlagPtr afp, + CharPtr prefix +) { - AppFlagPtr afp; - UserObjectPtr uop; + Char ch; + size_t len; + + if (StringHasNoText (str)) return; + len = StringLen (str); + if (len < 1) return; + ch = str [len - 1]; + while (ch == ' ' && len > 2) { + len--; + ch = str [len - 1]; + } + if (ch == ',') { + if (afp != NULL && afp->verbose && afp->fp != NULL) { + str [len] = '\0'; + if (StringHasNoText (prefix)) { + prefix = "?"; + } + fprintf (afp->fp, "%s\t%s\t%s\n", prefix, afp->id, str); + fflush (afp->fp); + } + str [len - 1] = '_'; + str [len] = '\0'; + } +} - if (sdp == NULL || sdp->choice != Seq_descr_user) return; +static void RnaProtCmntTrailingCommaFix ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + AppFlagPtr afp; + ProtRefPtr prp; + RnaRefPtr rrp; + CharPtr str; + ValNodePtr vnp; + + if (sfp == NULL) return; afp = (AppFlagPtr) userdata; if (afp == NULL) return; - uop = (UserObjectPtr) sdp->data.ptrvalue; - if (uop == NULL) return; + str = sfp->comment; + if (StringDoesHaveText (str)) { + TrailingCommaFix (str, afp, "SFPCOMM"); + } - VisitUserObjectsInUop (uop, (Pointer) afp, DoOneUser); + if (sfp->data.choice == SEQFEAT_PROT) { + prp = (ProtRefPtr) sfp->data.value.ptrvalue; + /* turn trailing space into trailing underscore for validator */ + for (vnp = prp->name; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + TrailingCommaFix (str, afp, "PRTCOMM"); + TestForRubisco (str, afp, "RIBBIS", "RIBREM"); + } + } else if (sfp->data.choice == SEQFEAT_RNA) { + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + /* turn trailing space into trailing underscore for validator */ + if (rrp->ext.choice == 1) { + str = rrp->ext.value.ptrvalue; + if (StringDoesHaveText (str)) { + TrailingCommaFix (str, afp, "RNACOMM"); + } + } + } } -static void DoOneFeature (SeqFeatPtr sfp, Pointer userdata) +static void LookForBadAuth ( + NameStdPtr nsp, + Pointer userdata +) { AppFlagPtr afp; - UserObjectPtr uop; + ChangeDataPtr cdp; + Char ch; + Int2 i; + Boolean is_bad = FALSE; + CharPtr prefix = "\t"; + CharPtr str; + + if (nsp == NULL) return; + cdp = (ChangeDataPtr) userdata; + if (cdp == NULL) return; + afp = cdp->afp; + if (afp == NULL) return; - if (sfp == NULL) return; + for (i = 0; i < 6; i++) { + str = nsp->names [i]; + if (StringHasNoText (str)) continue; + ch = *str; + while (ch != '\0') { + if (IS_DIGIT (ch)) { + cdp->badname = TRUE; + is_bad = TRUE; + } + str++; + ch = *str; + } + } + + if (is_bad && afp->fp != NULL && afp->verbose) { + fprintf (afp->fp, "%s\t%s", "AUTHOR", afp->id); + for (i = 0; i < 6; i++) { + str = nsp->names [i]; + if (StringHasNoText (str)) continue; + fprintf (afp->fp, "%s%s", prefix, str); + prefix = " | "; + } + fprintf (afp->fp, "\n"); + fflush (afp->fp); + } +} + +static void LookForBadPub ( + PubdescPtr pdp, + Pointer userdata +) + +{ + VisitAuthorsInPub (pdp, userdata, LookForBadAuth); +} + +static void CommentDescrTrailingCommaFix ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + AppFlagPtr afp; + CharPtr str; + + if (sdp == NULL || sdp->choice != Seq_descr_comment) return; afp = (AppFlagPtr) userdata; if (afp == NULL) return; - uop = sfp->ext; - if (uop != NULL) { - VisitUserObjectsInUop (uop, (Pointer) afp, DoOneUser); + str = (CharPtr) sdp->data.ptrvalue; + if (StringDoesHaveText (str)) { + TrailingCommaFix (str, afp, "DSCCOMM"); + } +} + +static void DoReport ( + SeqEntryPtr sep, + AppFlagPtr afp +) + +{ + Boolean bsec = FALSE, cma = FALSE, norm = FALSE, ssec = FALSE; + Boolean gen = FALSE, ncr = FALSE, pcr = FALSE, nam = FALSE; + ChangeData cdbefore, cdafter; + CharPtr str = NULL, tmp = NULL; + + if (sep == NULL || afp == NULL) return; + + MemSet ((Pointer) &cdbefore, 0, sizeof (ChangeData)); + MemSet ((Pointer) &cdafter, 0, sizeof (ChangeData)); + + cdbefore.afp = afp; + cdafter.afp = afp; + + CheckForChanges (sep, &cdbefore); + + str = Se2Str (sep); + NormalizeDescriptorOrder (sep); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + norm = TRUE; + } + MemFree (str); + str = tmp; + + VisitFeaturesInSep (sep, (Pointer) afp, RnaProtCmntTrailingCommaFix); + VisitDescriptorsInSep (sep, (Pointer) afp, CommentDescrTrailingCommaFix); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + cma = TRUE; + } + MemFree (str); + str = tmp; + + BasicSeqEntryCleanup (sep); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + bsec = TRUE; + } + MemFree (str); + str = tmp; + + VisitPubdescsInSep (sep, (Pointer) &cdbefore, LookForBadPub); + + VisitFeaturesInSep (sep, NULL, ModGenes); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + gen = TRUE; + } + MemFree (str); + str = tmp; + + VisitFeaturesInSep (sep, NULL, ModRNAs); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + ncr = TRUE; + } + MemFree (str); + str = tmp; + + VisitBioSourcesInSep (sep, (Pointer) &nam, ModPCRs); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + pcr = TRUE; + } + MemFree (str); + str = tmp; + + SeriousSeqEntryCleanup (sep, NULL, NULL); + tmp = Se2Str (sep); + if (StringCmp (str, tmp) != 0) { + ssec = TRUE; + } + MemFree (str); + str = tmp; + + CheckForChanges (sep, &cdafter); + + MemFree (str); + + if (ssec) { + if (afp->fp != NULL) { + fprintf (afp->fp, "SSEC %s\n", afp->id); + fflush (afp->fp); + } + } else if (bsec) { + if (afp->fp != NULL) { + fprintf (afp->fp, "BSEC %s\n", afp->id); + fflush (afp->fp); + } + } else if (norm) { + if (afp->fp != NULL) { + fprintf (afp->fp, "NORM %s\n", afp->id); + fflush (afp->fp); + } + } else { + /* + if (afp->fp != NULL) { + fprintf (afp->fp, "OKAY %s\n", afp->id); + fflush (afp->fp); + } + */ + } + + if (cma) { + if (afp->fp != NULL) { + fprintf (afp->fp, "CMA %s\n", afp->id); + fflush (afp->fp); + } + } + + if (gen) { + if (afp->fp != NULL) { + fprintf (afp->fp, "GEN %s\n", afp->id); + fflush (afp->fp); + } + } + if (ncr) { + if (afp->fp != NULL) { + fprintf (afp->fp, "NCR %s\n", afp->id); + fflush (afp->fp); + } + } + if (pcr) { + if (afp->fp != NULL) { + fprintf (afp->fp, "PCR %s\n", afp->id); + fflush (afp->fp); + } + } + if (nam) { + if (afp->fp != NULL) { + fprintf (afp->fp, "NAM %s\n", afp->id); + fflush (afp->fp); + } + } + + if (cdbefore.rubisco) { + if (afp->fp != NULL) { + fprintf (afp->fp, "RUB %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.rbc) { + if (afp->fp != NULL) { + fprintf (afp->fp, "RBC %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.its) { + if (afp->fp != NULL) { + fprintf (afp->fp, "ITS %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.sgml) { + if (afp->fp != NULL) { + fprintf (afp->fp, "SGM %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.rnaother) { + if (afp->fp != NULL) { + fprintf (afp->fp, "RNA %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.trnanote) { + if (afp->fp != NULL) { + fprintf (afp->fp, "TRN %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.oldbiomol) { + if (afp->fp != NULL) { + fprintf (afp->fp, "MOL %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.badname) { + if (afp->fp != NULL) { + fprintf (afp->fp, "AUT %s\n", afp->id); + fflush (afp->fp); + } } - for (uop = sfp->exts; uop != NULL; uop = uop->next) { - VisitUserObjectsInUop (uop, (Pointer) afp, DoOneUser); + if (cdbefore.protdesc != cdafter.protdesc) { + if (afp->fp != NULL) { + fprintf (afp->fp, "PRT %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.sfpnote != cdafter.sfpnote) { + if (afp->fp != NULL) { + fprintf (afp->fp, "COM %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.gbsource != cdafter.gbsource) { + if (afp->fp != NULL) { + fprintf (afp->fp, "SRC %s\n", afp->id); + fflush (afp->fp); + } + } + if (cdbefore.cdsconf != cdafter.cdsconf) { + if (afp->fp != NULL) { + fprintf (afp->fp, "CNF %s\n", afp->id); + fflush (afp->fp); + } } } @@ -143,6 +830,7 @@ static void DoRecord (SeqEntryPtr sep, Pointer userdata) AppFlagPtr afp; BioseqPtr fbsp; SeqEntryPtr fsep; + SeqIdPtr sip, siphead; if (sep == NULL) return; afp = (AppFlagPtr) userdata; @@ -153,10 +841,14 @@ static void DoRecord (SeqEntryPtr sep, Pointer userdata) fbsp = (BioseqPtr) fsep->data.ptrvalue; if (fbsp == NULL) return; - SeqIdWrite (fbsp->id, afp->id, PRINTID_FASTA_LONG, 64); + siphead = SeqIdSetDup (fbsp->id); + for (sip = siphead; sip != NULL; sip = sip->next) { + SeqIdStripLocus (sip); + } + SeqIdWrite (siphead, afp->id, PRINTID_FASTA_LONG, sizeof (afp->id)); + SeqIdSetFree (siphead); - VisitDescriptorsInSep (sep, (Pointer) afp, DoOneDescriptor); - VisitFeaturesInSep (sep, (Pointer) afp, DoOneFeature); + DoReport (sep, afp); } static void ProcessOneRecord ( @@ -171,7 +863,8 @@ static void ProcessOneRecord ( afp = (AppFlagPtr) userdata; if (afp == NULL) return; - if (StringStr (filename, "gbest") != NULL || + if (StringStr (filename, "gbcon") != NULL || + StringStr (filename, "gbest") != NULL || StringStr (filename, "gbgss") != NULL || StringStr (filename, "gbhtg") != NULL || StringStr (filename, "gbsts") != NULL) { @@ -218,7 +911,7 @@ Args myargs [] = { TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL}, {"Bioseq-set is Compressed", "F", NULL, NULL, TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Verbose", "F", NULL, NULL, + {"Verbose Output", "F", NULL, NULL, TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL}, }; @@ -274,7 +967,7 @@ extern Int2 Main (void) dorecurse = (Boolean) myargs [u_argRecurse].intvalue; afd.binary = (Boolean) myargs [b_argBinary].intvalue; afd.compressed = (Boolean) myargs [c_argCompressed].intvalue; - afd.verbose = (Boolean) myargs[v_argVerbose].intvalue; + afd.verbose = (Boolean) myargs [v_argVerbose].intvalue; afd.fp = FileOpen (outfile, "w"); if (afd.fp == NULL) { diff --git a/demo/src_chk.c b/demo/src_chk.c new file mode 100755 index 00000000..cb5ffe30 --- /dev/null +++ b/demo/src_chk.c @@ -0,0 +1,398 @@ +/* src_chk.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: src_chk.c +* +* Author: Colleen Bollin +* +* Version Creation Date: 4/12/07 +* +* $Revision: 1.10 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <objall.h> +#include <objsset.h> +#include <objsub.h> +#include <objfdef.h> +#include <sequtil.h> +#include <gather.h> +#include <sqnutils.h> +#include <explore.h> +#include <pmfapi.h> +#define NLM_GENERATED_CODE_PROTO +#include <asnmacro.h> +#include <objmacro.h> +#include <macroapi.h> + +#define SRC_CHK_APP_VER "1.0" + +CharPtr SRC_CHK_APPLICATION = SRC_CHK_APP_VER; + + +static ValNodePtr CollectFieldList(BioseqPtr bsp) +{ + BioSourcePtr biop; + SeqDescrPtr sdp; + SeqMgrDescContext dcontext; + ValNodePtr list = NULL, vnp; + + for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext); + sdp != NULL; + sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &dcontext)) { + biop = (BioSourcePtr) sdp->data.ptrvalue; + vnp = GetSourceQualFieldListFromBioSource (biop); + ValNodeLink (&list, vnp); + } + return list; +} + + +static void PrintHeader (FILE *fp, ValNodePtr field_list) +{ + CharPtr txt; + + if (fp == NULL || field_list == NULL) { + return; + } + /* first field accession, second field GI, third field tax ID */ + fprintf (fp, "\t\tTaxID"); + while (field_list != NULL) { + txt = SummarizeFieldType (field_list); + fprintf (fp, "\t%s", txt); + txt = MemFree (txt); + field_list = field_list->next; + } + fprintf (fp, "\n"); +} + + +static Int4 GetTaxIdFromOrgRef (OrgRefPtr orp) +{ + Int4 tax_id = -1; + ValNodePtr vnp; + DbtagPtr d; + + if (orp != NULL) + { + for (vnp = orp->db; vnp != NULL; vnp = vnp->next) + { + d = (DbtagPtr) vnp->data.ptrvalue; + if (StringCmp(d->db, "taxon") == 0) + { + tax_id = d->tag->id; + break; + } + } + } + return tax_id; +} + + +static void PrintBioSourceLine (FILE *fp, BioSourcePtr biop, ValNodePtr field_list) +{ + CharPtr txt; + + if (fp == NULL || biop == NULL || field_list == NULL) { + return; + } + + fprintf (fp, "\t%d", GetTaxIdFromOrgRef(biop->org)); + + while (field_list != NULL) { + txt = GetSourceQualFromBioSource (biop, field_list->data.ptrvalue, NULL); + fprintf (fp, "\t%s", txt == NULL ? "" : txt); + txt = MemFree (txt); + field_list = field_list->next; + } +} + + +static void PrintBioseqLines (FILE *fp, BioseqPtr bsp, ValNodePtr field_list) +{ + SeqDescrPtr sdp; + SeqMgrDescContext dcontext; + Char id_txt[255], id_txt2[255]; + SeqIdPtr sip, sip_gi = NULL, sip_gb = NULL; + + if (fp == NULL || bsp == NULL || field_list == NULL) { + return; + } + + for (sip = bsp->id; sip != NULL; sip = sip->next) { + if (sip->choice == SEQID_GENBANK + || (sip->choice == SEQID_EMBL && sip_gb == NULL) + || (sip->choice == SEQID_SWISSPROT && sip_gb == NULL) + || (sip->choice == SEQID_DDBJ && sip_gb == NULL) + || (sip->choice == SEQID_PIR && sip_gb == NULL)) { + sip_gb = sip; + } else if (sip->choice == SEQID_GI) { + sip_gi = sip; + } + } + + if (sip_gb == NULL && sip_gi == NULL) { + SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1); + id_txt2[0] = 0; + } else { + if (sip_gb == NULL) { + id_txt[0] = 0; + } else { + SeqIdWrite (sip_gb, id_txt, PRINTID_REPORT, sizeof (id_txt) - 1); + } + if (sip_gi == NULL) { + id_txt2[0] = 0; + } else { + SeqIdWrite (sip_gi, id_txt2, PRINTID_REPORT, sizeof (id_txt2) - 1); + } + } + + for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext); + sdp != NULL; + sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &dcontext)) { + fprintf (fp, "%s\t%s", id_txt, id_txt2); + PrintBioSourceLine (fp, sdp->data.ptrvalue, field_list); + fprintf (fp, "\n"); + } +} + + +static void PrintBioseqErrorLine (FILE *fp, SeqIdPtr sip) +{ + Char id_txt[255]; + + if (fp == NULL || sip == NULL) { + return; + } + + SeqIdWrite (sip, id_txt, PRINTID_REPORT, sizeof (id_txt) - 1); + + if (sip->choice == SEQID_GI) { + fprintf (fp, "\t%s\n", id_txt); + } else { + fprintf (fp, "%s\t\n", id_txt); + } +} + + +static Boolean IsAllDigits (CharPtr str) +{ + CharPtr cp; + + if (StringHasNoText (str)) return FALSE; + + cp = str; + while (*cp != 0 && isdigit (*cp)) { + cp++; + } + if (*cp == 0) { + return TRUE; + } else { + return FALSE; + } +} + + +static SeqIdPtr SmartGuessMakeId (CharPtr str) +{ + CharPtr id_txt; + SeqIdPtr sip = NULL; + + if (StringHasNoText (str)) { + return NULL; + } else if (StringChr (str, '|') != NULL) { + sip = MakeSeqID (str); + } else if (IsAllDigits (str)) { + id_txt = (CharPtr) MemNew (sizeof (Char) * (StringLen (str) + 4)); + sprintf (id_txt, "gi|%s", str); + sip = MakeSeqID (id_txt); + id_txt = MemFree (id_txt); + } else { + id_txt = (CharPtr) MemNew (sizeof (Char) * (StringLen (str) + 4)); + sprintf (id_txt, "gb|%s", str); + sip = MakeSeqID (id_txt); + id_txt = MemFree (id_txt); + } + return sip; +} + + +/* Args structure contains command-line arguments */ + +#define i_argInputFile 0 +#define o_argOutputFile 1 + +Args myargs [] = { + {"Input File", NULL, NULL, NULL, + TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, + {"Output File", NULL, NULL, NULL, + TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL} +}; + + +static void SortFieldListForSrcChk (ValNodePtr PNTR field_list) +{ + ValNodePtr vnp, vnp_s, vnp_prev = NULL; + + if (field_list == NULL || *field_list == NULL) return; + + SortUniqueFieldTypeList (field_list); + + /* move taxname to front of list */ + for (vnp = *field_list; vnp != NULL; vnp_prev = vnp, vnp = vnp->next) { + if (vnp->choice == FieldType_source_qual) { + vnp_s = vnp->data.ptrvalue; + if (vnp_s != NULL + && vnp_s->choice == SourceQualChoice_textqual + && vnp_s->data.intvalue == Source_qual_taxname) { + /* only need to move if not already at front of list */ + if (vnp_prev != NULL) { + vnp_prev->next = vnp->next; + vnp->next = *field_list; + *field_list = vnp; + } + break; + } + } + } + + +} + + +Int2 Main(void) +{ + Char app [64]; + Int4 rval = 0; + CharPtr id_file, line; + ReadBufferData rbd; + ValNodePtr field_list = NULL; + SeqIdPtr sip; + ValNodePtr bsp_list = NULL, vnp; + BioseqPtr bsp; + FILE *fp; + + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + UseLocalAsnloadDataAndErrMsg (); + ErrPathReset (); + + /* finish resolving internal connections in ASN.1 parse tables */ + + if (! AllObjLoad ()) { + Message (MSG_FATAL, "AllObjLoad failed"); + return 1; + } + if (! SubmitAsnLoad ()) { + Message (MSG_FATAL, "SubmitAsnLoad failed"); + return 1; + } + if (! FeatDefSetLoad ()) { + Message (MSG_FATAL, "FeatDefSetLoad failed"); + return 1; + } + if (! SeqCodeSetLoad ()) { + Message (MSG_FATAL, "SeqCodeSetLoad failed"); + return 1; + } + if (! GeneticCodeTableLoad ()) { + Message (MSG_FATAL, "GeneticCodeTableLoad failed"); + return 1; + } + + PubSeqFetchEnable (); + + /* process command line arguments */ + + sprintf (app, "src_chk %s", SRC_CHK_APPLICATION); + if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + id_file = (CharPtr) myargs [i_argInputFile].strvalue; + + rbd.fp = FileOpen (id_file, "r"); + if (rbd.fp == NULL) { + Message (MSG_ERROR, "Unable to open %s", (CharPtr) myargs [i_argInputFile].strvalue); + return 1; + } + rbd.current_data = NULL; + line = AbstractReadFunction (&rbd); + while (line != NULL && line[0] != EOF) { + if (!StringHasNoText (line)) { + + sip = SmartGuessMakeId (line); + bsp = BioseqLockById (sip); + if (bsp == NULL) { + printf ("Unable to download Bioseq for %s\n", line); + } else { + ValNodeLink (&field_list, CollectFieldList (bsp)); + BioseqUnlock (bsp); + } + ValNodeAddPointer (&bsp_list, 0, sip); + } + line = MemFree (line); + line = AbstractReadFunction (&rbd); + } + + FileClose (rbd.fp); + + SortFieldListForSrcChk (&field_list); + + fp = FileOpen ((CharPtr) myargs [o_argOutputFile].strvalue, "w"); + if (fp == NULL) { + Message (MSG_ERROR, "Unable to open %s", (CharPtr) myargs [o_argOutputFile].strvalue); + rval = 1; + } else { + PrintHeader (fp, field_list); + for (vnp = bsp_list; vnp != NULL; vnp = vnp->next) { + bsp = BioseqLockById (vnp->data.ptrvalue); + if (bsp == NULL) { + PrintBioseqErrorLine (fp, vnp->data.ptrvalue); + } else { + PrintBioseqLines (fp, bsp, field_list); + } + BioseqUnlock (bsp); + vnp->data.ptrvalue = SeqIdFree (vnp->data.ptrvalue); + } + } + FileClose (fp); + bsp_list = ValNodeFree (bsp_list); + field_list = FieldTypeListFree (field_list); + return rval; +} diff --git a/demo/subfuse.c b/demo/subfuse.c new file mode 100644 index 00000000..a8046690 --- /dev/null +++ b/demo/subfuse.c @@ -0,0 +1,229 @@ +/* subfuse.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: subfuse.c +* +* Author: Jonathan Kans +* +* Version Creation Date: 7/30/01 +* +* $Revision: 1.2 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <objall.h> +#include <objsset.h> +#include <objsub.h> +#include <objfdef.h> + +static SeqSubmitPtr ReadOneSubmission ( + CharPtr directory, + CharPtr base, + CharPtr suffix +) + +{ + AsnIoPtr aip; + Char file [FILENAME_MAX], path [PATH_MAX]; + SeqSubmitPtr ssp; + + if (base == NULL) { + base = ""; + } + if (suffix == NULL) { + suffix = ""; + } + StringNCpy_0 (path, directory, sizeof (path)); + sprintf (file, "%s%s", base, suffix); + FileBuildPath (path, NULL, file); + + aip = AsnIoOpen (path, "r"); + if (aip == NULL) return NULL; + ssp = SeqSubmitAsnRead (aip, NULL); + AsnIoClose (aip); + + return ssp; +} + +static void WriteOneSubmission ( + CharPtr path, + SeqSubmitPtr ssp +) + +{ + AsnIoPtr aip; + + aip = AsnIoOpen (path, "w"); + if (aip == NULL) return; + + SeqSubmitAsnWrite (ssp, aip, NULL); + + AsnIoFlush (aip); + AsnIoClose (aip); +} + +static void ProcessOneRecord ( + SeqSubmitPtr master, + BioseqSetPtr bssp, + CharPtr directory, + CharPtr base, + CharPtr suffix +) + +{ + SeqEntryPtr sep; + SeqSubmitPtr ssp; + + ssp = ReadOneSubmission (directory, base, suffix); + if (ssp == NULL || ssp->datatype != 1) return; + + if (master->sub == NULL) { + master->sub = ssp->sub; + ssp->sub = NULL; + } + + sep = (SeqEntryPtr) ssp->data; + ssp->data = NULL; + + ValNodeLink (&(bssp->seq_set), sep); +} + +/* Args structure contains command-line arguments */ + +#define p_argInputPath 0 +#define o_argOutputFile 1 +#define x_argSuffix 2 + +Args myargs [] = { + {"Path to files", NULL, NULL, NULL, + TRUE, 'p', ARG_STRING, 0.0, 0, NULL}, + {"Output file", "stdout", NULL, NULL, + TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, + {"Suffix", ".sqn", NULL, NULL, + TRUE, 'x', ARG_STRING, 0.0, 0, NULL}, +}; + +Int2 Main (void) + +{ + CharPtr base, directory, outfile, suffix, ptr; + BioseqSetPtr bssp; + ValNodePtr head, vnp; + SeqEntryPtr sep; + SeqSubmitPtr ssp; + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + UseLocalAsnloadDataAndErrMsg (); + ErrPathReset (); + + /* finish resolving internal connections in ASN.1 parse tables */ + + if (! AllObjLoad ()) { + Message (MSG_FATAL, "AllObjLoad failed"); + return 1; + } + if (! SubmitAsnLoad ()) { + Message (MSG_FATAL, "SubmitAsnLoad failed"); + return 1; + } + if (! FeatDefSetLoad ()) { + Message (MSG_FATAL, "FeatDefSetLoad failed"); + return 1; + } + if (! SeqCodeSetLoad ()) { + Message (MSG_FATAL, "SeqCodeSetLoad failed"); + return 1; + } + if (! GeneticCodeTableLoad ()) { + Message (MSG_FATAL, "GeneticCodeTableLoad failed"); + return 1; + } + + /* process command line arguments */ + + if (! GetArgs ("subfuse", sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + directory = (CharPtr) myargs [p_argInputPath].strvalue; + outfile = (CharPtr) myargs [o_argOutputFile].strvalue; + suffix = (CharPtr) myargs [x_argSuffix].strvalue; + + bssp = BioseqSetNew (); + if (bssp == NULL) return 0; + bssp->_class = BioseqseqSet_class_genbank; + + sep = SeqEntryNew (); + if (sep == NULL) return 0; + sep->choice = 2; + sep->data.ptrvalue = (Pointer) bssp; + + ssp = SeqSubmitNew (); + if (ssp == NULL) return 0; + ssp->datatype = 1; + ssp->data = (Pointer) sep; + + /* get list of all files in source directory */ + + head = DirCatalog (directory); + + for (vnp = head; vnp != NULL; vnp = vnp->next) { + if (vnp->choice == 0) { + base = (CharPtr) vnp->data.ptrvalue; + if (! StringHasNoText (base)) { + ptr = StringStr (base, suffix); + if (ptr != NULL) { + *ptr = '\0'; + Message (MSG_POST, "Processing %s\n", base); + ProcessOneRecord (ssp, bssp, directory, base, suffix); + } + } + } + } + + /* clean up file list */ + + ValNodeFreeData (head); + + /* write output file */ + + WriteOneSubmission (outfile, ssp); + + return 0; +} + diff --git a/demo/sugint.c b/demo/sugint.c new file mode 100644 index 00000000..b93ce545 --- /dev/null +++ b/demo/sugint.c @@ -0,0 +1,214 @@ +/* sugint.c +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information (NCBI) +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government do not place any restriction on its use or reproduction. +* We would, however, appreciate having the NCBI and the author cited in +* any work or product based on this material +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* =========================================================================== +* +* File Name: sugint.c +* +* Author: Jonathan Kans +* +* Version Creation Date: 10/31/08 +* +* $Revision: 1.1 $ +* +* File Description: +* +* Modifications: +* -------------------------------------------------------------------------- +* Date Name Description of modification +* ------- ---------- ----------------------------------------------------- +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <objall.h> +#include <objsset.h> +#include <objsub.h> +#include <objfdef.h> +#include <seqport.h> +#include <sequtil.h> +#include <sqnutils.h> +#include <subutil.h> +#include <tofasta.h> +#include <gather.h> +#include <explore.h> +#include <suggslp.h> + +static SeqEntryPtr ReadSep ( + FILE *fp, + Boolean forceNuc, + Boolean forcePrt +) + +{ + Pointer dataptr; + Uint2 datatype, entityID; + + dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, forceNuc, forcePrt, TRUE, FALSE); + if (dataptr == NULL) return NULL; + entityID = ObjMgrRegister (datatype, dataptr); + return GetTopSeqEntryForEntityID (entityID); +} + +static void ProcessSuggest ( + FILE *nfp, + FILE *pfp, + AsnIoPtr ofp, + Int2 gencode +) + +{ + BioseqPtr nbsp = NULL, pbsp = NULL; + SeqEntryPtr nsep, psep, sep; + SeqAnnotPtr sap; + SeqFeatPtr sfp; + SeqLocPtr slp; + + nsep = ReadSep (nfp, TRUE, FALSE); + psep = ReadSep (pfp, FALSE, TRUE); + + if (nsep != NULL && psep != NULL) { + sep = FindNthBioseq (nsep, 1); + if (sep != NULL && IS_Bioseq (sep)) { + nbsp = (BioseqPtr) sep->data.ptrvalue; + } + sep = FindNthBioseq (psep, 1); + if (sep != NULL && IS_Bioseq (sep)) { + pbsp = (BioseqPtr) sep->data.ptrvalue; + } + if (nbsp != NULL && pbsp != NULL) { + if (ISA_na (nbsp->mol) && ISA_aa (pbsp->mol)) { + sap = SuggestCodingRegion (nbsp, pbsp, gencode); + + if (sap != NULL && sap->type == 1) { + sfp = (SeqFeatPtr) sap->data; + if (sfp != NULL && sfp->data.choice == SEQFEAT_CDREGION) { + slp = sfp->location; + if (slp != NULL) { + SeqLocAsnWrite (slp, ofp, NULL); + } + } + } + + SeqAnnotFree (sap); + } + } + } + + SeqEntryFree (nsep); + SeqEntryFree (psep); +} + +#define n_argNucInputFile 0 +#define p_argPrtInputFile 1 +#define o_argOutputFile 2 +#define g_argGeneticCode 3 + +Args myargs [] = { + {"Nucleotide Input File", NULL, NULL, NULL, + FALSE, 'n', ARG_FILE_IN, 0.0, 0, NULL}, + {"Protein Input File", NULL, NULL, NULL, + FALSE, 'p', ARG_FILE_IN, 0.0, 0, NULL}, + {"Output File", NULL, NULL, NULL, + FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, + {"Genetic Code", "1", "0", "20", + TRUE, 'g', ARG_INT, 0.0, 0, NULL}, +}; + +Int2 Main (void) + +{ + Int2 gencode; + FILE *nfp, *pfp; + AsnIoPtr ofp; + CharPtr nucfile, prtfile, outfile; + + /* standard setup */ + + ErrSetFatalLevel (SEV_MAX); + ErrClearOptFlags (EO_SHOW_USERSTR); + UseLocalAsnloadDataAndErrMsg (); + ErrPathReset (); + + /* finish resolving internal connections in ASN.1 parse tables */ + + if (! AllObjLoad ()) { + Message (MSG_FATAL, "AllObjLoad failed"); + return 1; + } + if (! SubmitAsnLoad ()) { + Message (MSG_FATAL, "SubmitAsnLoad failed"); + return 1; + } + if (! FeatDefSetLoad ()) { + Message (MSG_FATAL, "FeatDefSetLoad failed"); + return 1; + } + if (! SeqCodeSetLoad ()) { + Message (MSG_FATAL, "SeqCodeSetLoad failed"); + return 1; + } + if (! GeneticCodeTableLoad ()) { + Message (MSG_FATAL, "GeneticCodeTableLoad failed"); + return 1; + } + + /* process command line arguments */ + + if (! GetArgs ("sugint", sizeof (myargs) / sizeof (Args), myargs)) { + return 0; + } + + nucfile = (CharPtr) myargs [n_argNucInputFile].strvalue; + prtfile = (CharPtr) myargs [p_argPrtInputFile].strvalue; + outfile = (CharPtr) myargs [o_argOutputFile].strvalue; + gencode = (Int2) myargs [g_argGeneticCode].intvalue; + + nfp = FileOpen (nucfile, "r"); + if (nfp == NULL) { + Message (MSG_FATAL, "Unable to open nucleotide input file"); + return 1; + } + + pfp = FileOpen (prtfile, "r"); + if (pfp == NULL) { + Message (MSG_FATAL, "Unable to open protein input file"); + return 1; + } + + ofp = AsnIoOpen (outfile, "w"); + if (ofp == NULL) { + Message (MSG_FATAL, "Unable to open output file"); + return 1; + } + + ProcessSuggest (nfp, pfp, ofp, gencode); + + AsnIoClose (ofp); + FileClose (pfp); + FileClose (nfp); + + return 0; +} + diff --git a/demo/taxblast_main.c b/demo/taxblast_main.c new file mode 100644 index 00000000..b4aa65c7 --- /dev/null +++ b/demo/taxblast_main.c @@ -0,0 +1,99 @@ +static char const rcsid[] = "$Id: taxblast_main.c,v"; + +/* $Id: taxblast_main.c,v +* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* File Name: $RCSfile: taxblast_main.c,v $ +* +* Authors: Tom Madden +* +* ========================================================================== +*/ + +#include <ncbi.h> +#include <sequtil.h> +#include <treemgr.h> +#include <taxext.h> +#include <txclient.h> +#include <objseq.h> +#include <objgen.h> +#include <taxblast.h> + + +#define NUMARG (sizeof(myargs)/sizeof(myargs[0])) + +static Args myargs [] = { + { "Input ASN.1 File (SeqAnnot)", /* 0 */ + NULL, NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL }, + { "Sequence is DNA", /* 1 */ + "F", NULL, NULL, TRUE, 'p', ARG_BOOLEAN, 0.0, 0, NULL }, + { "Database used to get SeqAnnot ASN.1", /* 2 */ + "nr", NULL, NULL, TRUE, 'd', ARG_STRING, 0.0, 0, NULL }, + { "Output file name", /* 3 */ + "stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL } +}; + +Int2 Main (void) +{ + AsnIoPtr aip; + SeqAnnotPtr sap; + Boolean is_na = FALSE; + FILE *outfile; + Char ofile[128]; + + if (!GetArgs("txblast", NUMARG, myargs)) { + return 1; + } + + if (myargs[1].intvalue) + is_na = TRUE; + + if((aip = AsnIoOpen(myargs[0].strvalue, "r")) == NULL) { + ErrPostEx(SEV_FATAL, 1,0, "AsnIoOpen failure\n"); + return 1; + } + + if((sap = SeqAnnotAsnRead (aip, NULL)) == NULL) { + ErrPostEx(SEV_FATAL, 1,0,"SeqAlignAsnRead failure\n"); + return 1; + } + + if(StringCmp(myargs[3].strvalue, "stdout")) { + sprintf (ofile, "%s.html", myargs[0].strvalue); + outfile = FileOpen(ofile, "w"); + } else { + outfile = FileOpen(myargs[3].strvalue, "w"); + } + + TXBHtmlReport((SeqAlignPtr)sap->data, outfile, is_na, is_na, + myargs[2].strvalue, NULL, NULL, FALSE); + + FileClose(outfile); + + AsnIoClose(aip); + SeqAnnotFree(sap); + + return (0); +} diff --git a/demo/tbl2asn.c b/demo/tbl2asn.c index 2aae8664..2d999195 100644 --- a/demo/tbl2asn.c +++ b/demo/tbl2asn.c @@ -29,7 +29,7 @@ * * Version Creation Date: 5/5/00 * -* $Revision: 6.223 $ +* $Revision: 6.271 $ * * File Description: * @@ -67,11 +67,21 @@ #ifdef INTERNAL_NCBI_TBL2ASN #include <accpubseq.h> #endif +#define NLM_GENERATED_CODE_PROTO +#include <asnmacro.h> +#include <objmacro.h> +#include <macroapi.h> -#define TBL2ASN_APP_VER "10.3" +#define TBL2ASN_APP_VER "12.7" CharPtr TBL2ASN_APPLICATION = TBL2ASN_APP_VER; +typedef struct cleanupargs { + Boolean collection_dates; + Boolean collection_dates_month_first; + Boolean add_notes_to_overlapping_cds_without_abc; +} CleanupArgsData, PNTR CleanupArgsPtr; + typedef struct tblargs { Boolean raw2delt; Int2 r2dmin; @@ -105,8 +115,9 @@ typedef struct tblargs { Boolean conflict; Boolean validate; Boolean relaxed; - Boolean discrepancy; + Boolean validate_barcode; Boolean flatfile; + Boolean genereport; Boolean seqidfromfile; Boolean smartfeats; Boolean smarttitle; @@ -118,17 +129,10 @@ typedef struct tblargs { CharPtr aln_match; Boolean aln_is_protein; Boolean save_bioseq_set; - ValNodePtr locus_tag_list; - ValNodePtr missing_locus_tag; - ValNodePtr cds_product_list; - ValNodePtr missing_cds_product; - ValNodePtr mrna_product_list; - ValNodePtr missing_mrna_product; - ValNodePtr adjacent_locus_tag_disc_list; - ValNodePtr missing_gnl_list; - ValNodePtr gnl_list; - ValNodePtr discrepancy_list; - DiscReportOutputConfigData disc_rep_config; + + GlobalDiscrepReportPtr global_report; + + CleanupArgsData cleanup_args; } TblArgs, PNTR TblArgsPtr; static FILE* OpenOneFile ( @@ -271,12 +275,15 @@ static void LIBCALLBACK ValidCallback ( fprintf (fp, "\n"); } + static void ValidateOneFile ( CharPtr results, CharPtr base, CharPtr suffix, SeqEntryPtr sep, - Boolean relaxed + Boolean standard, + Boolean relaxed, + Boolean barcode ) { @@ -289,24 +296,32 @@ static void ValidateOneFile ( sprintf (file, "%s%s", base, suffix); FileBuildPath (path, NULL, file); - vsp = ValidStructNew (); - if (vsp != NULL) { - vsp->useSeqMgrIndexes = TRUE; - vsp->suppressContext = TRUE; - vsp->seqSubmitParent = TRUE; - if (! relaxed) { - vsp->testLatLonSubregion = TRUE; - } - oldErrSev = ErrSetMessageLevel (SEV_NONE); - ofp = FileOpen (path, "w"); - vsp->errfunc = ValidCallback; - vsp->userdata = (Pointer) ofp; - /* vsp->convertGiToAccn = FALSE; */ - ValidateSeqEntry (sep, vsp); - ValidStructFree (vsp); - FileClose (ofp); - ErrSetMessageLevel (oldErrSev); + ofp = FileOpen (path, "w"); + + if (standard) { + vsp = ValidStructNew (); + if (vsp != NULL) { + vsp->useSeqMgrIndexes = TRUE; + vsp->suppressContext = TRUE; + vsp->seqSubmitParent = TRUE; + if (! relaxed) { + vsp->testLatLonSubregion = TRUE; + } + oldErrSev = ErrSetMessageLevel (SEV_NONE); + vsp->errfunc = ValidCallback; + vsp->userdata = (Pointer) ofp; + /* vsp->convertGiToAccn = FALSE; */ + ValidateSeqEntry (sep, vsp); + ValidStructFree (vsp); + ErrSetMessageLevel (oldErrSev); + } } + /* Barcode results if requested */ + if (barcode) { + BarcodeValidateOneSeqEntry (ofp, sep, TRUE, FALSE, TRUE, NULL); + } + + FileClose (ofp); } static void FlatfileOneFile ( @@ -989,6 +1004,450 @@ static int LIBCALLBACK SortByGenePtr ( return 0; } +static void PrintOneGeneLine ( + SeqFeatPtr gene, + SeqFeatPtr cds, + SeqFeatPtr rna, + CharPtr cdslabel, + CharPtr rnalabel, + FILE *fp +) + +{ + BioseqPtr bsp; + ValNodePtr db, old_locus_tag, vnp; + DbtagPtr dbt; + CharPtr desc, locus, locus_tag, cdslcl, cdsaccn, cdsgnl, + rnaaccn, rnagnl, fbgn, gene_type, rna_type, prefix; + GBQualPtr gbq; + GeneRefPtr grp; + ObjectIdPtr oip; + SeqIdPtr sip; + CharPtr str; + TextSeqIdPtr tsip; + + if (fp == NULL) return; + + locus = NULL; + desc = NULL; + locus_tag = NULL; + old_locus_tag = NULL; + + cdslcl = NULL; + cdsaccn = NULL; + cdsgnl = NULL; + rnaaccn = NULL; + rnagnl = NULL; + + db = NULL; + fbgn = NULL; + + gene_type = NULL; + rna_type = NULL; + + if (gene != NULL) { + gene_type = "gene"; + if (gene->pseudo) { + gene_type = "pseudogene"; + } + grp = (GeneRefPtr) gene->data.value.ptrvalue; + if (grp != NULL) { + if (grp->pseudo) { + gene_type = "pseudogene"; + } + locus = grp->locus; + desc = grp->desc; + locus_tag = grp->locus_tag; + db = grp->db; + } + if (db == NULL) { + db = gene->dbxref; + } + for (gbq = gene->qual; gbq != NULL; gbq = gbq->next) { + if (StringICmp (gbq->qual, "old_locus_tag") != 0) continue; + if (StringHasNoText (gbq->val)) continue; + ValNodeCopyStr(&old_locus_tag, 0, gbq->val); + } + for (vnp = db; vnp != NULL; vnp = vnp->next) { + dbt = (DbtagPtr) vnp->data.ptrvalue; + if (dbt == NULL) continue; + if (StringICmp (dbt->db, "FLYBASE") != 0) continue; + oip = dbt->tag; + if (oip == NULL) continue; + fbgn = oip->str; + } + } + + if (cds != NULL) { + if (cds->product != NULL) { + bsp = BioseqFindFromSeqLoc (cds->product); + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + switch (sip->choice) { + case SEQID_LOCAL : + oip = (ObjectIdPtr) sip->data.ptrvalue; + if (oip == NULL) continue; + cdslcl = oip->str; + break; + case SEQID_GENBANK : + case SEQID_TPG : + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip == NULL) continue; + cdsaccn = tsip->accession; + break; + case SEQID_GENERAL : + dbt = (DbtagPtr) sip->data.ptrvalue; + if (dbt == NULL) continue; + if (IsSkippableDbtag (dbt)) continue; + oip = dbt->tag; + if (oip == NULL) continue; + cdsgnl = oip->str; + break; + default : + break; + } + } + } + } + } + + if (rna != NULL) { + switch (rna->idx.subtype) { + case FEATDEF_preRNA : + rna_type = "precursor RNA"; + break; + case FEATDEF_mRNA : + rna_type = "mRNA"; + break; + case FEATDEF_tRNA : + rna_type = "tRNA"; + break; + case FEATDEF_rRNA : + rna_type = "rRNA"; + break; + case FEATDEF_otherRNA : + rna_type = "misc RNA"; + break; + case FEATDEF_ncRNA : + rna_type = "ncRNA"; + for (gbq = rna->qual; gbq != NULL; gbq = gbq->next) { + if (StringICmp (gbq->qual, "ncRNA_class") != 0) continue; + if (StringDoesHaveText (gbq->val)) { + rna_type = gbq->val; + } + } + break; + case FEATDEF_tmRNA : + rna_type = "tmRNA"; + break; + default : + break; + } + if (rna->pseudo) { + rna_type = "pseudo RNA"; + } + if (rna->product != NULL) { + bsp = BioseqFindFromSeqLoc (rna->product); + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + switch (sip->choice) { + case SEQID_GENBANK : + case SEQID_TPG : + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip == NULL) continue; + rnaaccn = tsip->accession; + break; + case SEQID_GENERAL : + dbt = (DbtagPtr) sip->data.ptrvalue; + if (dbt == NULL) continue; + if (IsSkippableDbtag (dbt)) continue; + oip = dbt->tag; + if (oip == NULL) continue; + rnagnl = oip->str; + break; + default : + break; + } + } + } + } + } + + if (StringDoesHaveText (locus_tag)) { + fprintf (fp, "%s", locus_tag); + } else { + fprintf (fp, "null_gene_ltag"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (locus)) { + fprintf (fp, "%s", locus); + } else { + fprintf (fp, "null_gene_locus"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (desc)) { + fprintf (fp, "%s", desc); + } else { + fprintf (fp, "null_gene_desc"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (fbgn)) { + fprintf (fp, "%s", fbgn); + } else { + fprintf (fp, "null_fbgn"); + } + + fprintf (fp, "\t"); + if (old_locus_tag != NULL) { + prefix = ""; + for (vnp = old_locus_tag; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + fprintf (fp, "%s%s", prefix, str); + prefix = ","; + } + } else { + fprintf (fp, "null_old_ltag"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (cdslcl)) { + fprintf (fp, "%s", cdslcl); + } else { + fprintf (fp, "null_cds_lcl"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (cdsaccn)) { + fprintf (fp, "%s", cdsaccn); + } else { + fprintf (fp, "null_cds_accn"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (cdsgnl)) { + fprintf (fp, "%s", cdsgnl); + } else { + fprintf (fp, "null_cds_gnl"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (rnaaccn)) { + fprintf (fp, "%s", rnaaccn); + } else { + fprintf (fp, "null_rna_accn"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (rnagnl)) { + fprintf (fp, "%s", rnagnl); + } else { + fprintf (fp, "null_rna_gnl"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (cdslabel)) { + fprintf (fp, "%s", cdslabel); + } else { + fprintf (fp, "null_cds_product"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (rnalabel)) { + fprintf (fp, "%s", rnalabel); + } else { + fprintf (fp, "null_rna_product"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (gene_type)) { + fprintf (fp, "%s", gene_type); + } else { + fprintf (fp, "null_gene_type"); + } + + fprintf (fp, "\t"); + if (StringDoesHaveText (rna_type)) { + fprintf (fp, "%s", rna_type); + } else { + fprintf (fp, "null_rna_type"); + } + + fprintf (fp, "\n"); +} + +static void GeneReportOneBsp ( + BioseqPtr bsp, + FILE *fp +) + +{ + CharPtr cdslabel, rnalabel; + SeqMgrFeatContext fcontext; + GmcDataPtr gdp, head; + GeneRefPtr grp; + Int2 i, j, k, numgene, numcds, numrna, total; + SeqFeatPtr matchsfp, sfp, tmp; + SeqFeatXrefPtr xref; + + if (bsp == NULL || fp == NULL) return; + + numgene = 0; + numcds = 0; + numrna = 0; + + sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); + while (sfp != NULL) { + switch (sfp->data.choice) { + case SEQFEAT_GENE : + numgene++; + break; + case SEQFEAT_CDREGION : + numcds++; + break; + case SEQFEAT_RNA : + numrna++; + break; + default : + break; + } + sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext); + } + + if (numgene == 0) return; + total = numgene + numcds + numrna; + if (total == 0) return; + + head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (total + 1)); + if (head == NULL) return; + + gdp = head; + total = 0; + sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); + while (sfp != NULL) { + if (sfp->data.choice == SEQFEAT_CDREGION || sfp->data.choice == SEQFEAT_RNA) { + gdp->feat = sfp; + gdp->label = fcontext.label; + grp = SeqMgrGetGeneXref (sfp); + if (grp == NULL) { + gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL); + } else if (! SeqMgrGeneIsSuppressed (grp)) { + if (StringDoesHaveText (grp->locus_tag)) { + gdp->gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, NULL); + } else if (StringDoesHaveText (grp->locus)) { + gdp->gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, NULL); + } + } + gdp++; + total++; + } else if (sfp->data.choice == SEQFEAT_GENE) { + gdp->gene = sfp; + gdp++; + total++; + } + sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext); + } + + HeapSort (head, (size_t) total, sizeof (GmcData), SortByGenePtr); + + for (i = 0; i < total; i += j) { + sfp = head [i].gene; + if (sfp == NULL) continue; + numcds = 0; + numrna = 0; + for (j = 0; i + j < total && sfp == head [i + j].gene; j++) { + tmp = head [i + j].feat; + if (tmp == NULL) continue; + if (tmp->data.choice == SEQFEAT_CDREGION) { + numcds++; + } else if (tmp->data.choice == SEQFEAT_RNA) { + numrna++; + } + } + cdslabel = NULL; + rnalabel = NULL; + if (numcds > 0) { + for (k = 0; k < j; k++) { + tmp = head [i + k].feat; + if (tmp == NULL) continue; + if (tmp->data.choice != SEQFEAT_CDREGION) continue; + cdslabel = head [i + k].label; + matchsfp = NULL; + for (xref = tmp->xref; xref != NULL && matchsfp == NULL; xref = xref->next) { + if (xref->id.choice != 0) { + matchsfp = SeqMgrGetFeatureByFeatID (tmp->idx.entityID, NULL, NULL, xref, &fcontext); + rnalabel = fcontext.label; + } + } + PrintOneGeneLine (sfp, tmp, matchsfp, cdslabel, rnalabel, fp); + } + } else if (numrna > 0) { + for (k = 0; k < j; k++) { + tmp = head [i + k].feat; + if (tmp == NULL) continue; + if (tmp->data.choice != SEQFEAT_RNA) continue; + rnalabel = head [i + k].label; + PrintOneGeneLine (sfp, NULL, tmp, NULL, rnalabel, fp); + } + } else { + PrintOneGeneLine (sfp, NULL, NULL, NULL, NULL, fp); + } + } + + MemFree (head); +} + +static void GeneReportGenomicBsp ( + BioseqPtr bsp, + Pointer userdata +) + +{ + SeqMgrDescContext dcontext; + MolInfoPtr mip; + SeqDescrPtr sdp; + + if (bsp == NULL) return; + + if (ISA_aa (bsp->mol)) return; + sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext); + if (sdp == NULL) return; + mip = (MolInfoPtr) sdp->data.ptrvalue; + if (mip == NULL) return; + if (mip->biomol != MOLECULE_TYPE_GENOMIC) return; + + GeneReportOneBsp (bsp, (FILE *) userdata); +} + +static void GeneReportOneFile ( + CharPtr results, + CharPtr base, + CharPtr suffix, + SeqEntryPtr sep +) + +{ + Char file [FILENAME_MAX], path [PATH_MAX]; + FILE *fp; + ErrSev oldErrSev; + + StringNCpy_0 (path, results, sizeof (path)); + sprintf (file, "%s%s", base, suffix); + FileBuildPath (path, NULL, file); + + fp = FileOpen (path, "w"); + if (fp == NULL) return; + + oldErrSev = ErrSetMessageLevel (SEV_MAX); + VisitBioseqsInSep (sep, (Pointer) fp, GeneReportGenomicBsp); + ErrSetMessageLevel (oldErrSev); + + FileClose (fp); +} + static void EnhanceOneCDS ( SeqFeatPtr sfp, Boolean alt_splice @@ -1032,7 +1491,7 @@ static void EnhanceOneCDS ( if (sip->choice != SEQID_GENERAL) continue; dbt = (DbtagPtr) sip->data.ptrvalue; if (dbt == NULL) continue; - if (IsSkippableDbtag (dbt)) continue; + if (IsSkippableDbtag (dbt)) continue; oip = dbt->tag; if (oip == NULL) continue; str = oip->str; @@ -1100,14 +1559,14 @@ static void EnhanceOneCDS ( } } -static void EnhanceOneMrna ( +static void EnhanceOneRna ( SeqFeatPtr sfp, Boolean alt_splice ) { DbtagPtr dbt; - GBQualPtr gbq; + GBQualPtr gbq, nm_gbq; Char id [64]; SeqIdPtr ids, sip; size_t len; @@ -1119,10 +1578,29 @@ static void EnhanceOneMrna ( if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return; name = NULL; + nm_gbq = NULL; rrp = (RnaRefPtr) sfp->data.value.ptrvalue; - if (rrp != NULL && rrp->type == 2 && rrp->ext.choice == 1) { - name = rrp->ext.value.ptrvalue; + if (rrp != NULL && rrp->ext.choice == 1) { + switch (rrp->type) { + case 1 : /* precurrsor_RNA */ + case 2 : /* mRNA */ + case 4 : /* rRNA */ + name = rrp->ext.value.ptrvalue; + break; + case 255 : /* misc_RNA, ncRNA, tmRNA */ + for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { + if (StringICmp (gbq->qual, "product") == 0) { + nm_gbq = gbq; + name = gbq->val; + } + } + break; + case 3: /* tRNA */ + return; + default : + break; + } } id [0] = '\0'; @@ -1161,7 +1639,11 @@ static void EnhanceOneMrna ( */ StringCat (nwstr, " "); StringCat (nwstr, tmp); - rrp->ext.value.ptrvalue = (Pointer) nwstr; + if (nm_gbq != NULL) { + nm_gbq->val = (Pointer) nwstr; + } else { + rrp->ext.value.ptrvalue = (Pointer) nwstr; + } MemFree (name); } } else { @@ -1177,7 +1659,11 @@ static void EnhanceOneMrna ( */ StringCat (nwstr, " "); StringCat (nwstr, str); - rrp->ext.value.ptrvalue = (Pointer) nwstr; + if (nm_gbq != NULL) { + nm_gbq->val = (Pointer) nwstr; + } else { + rrp->ext.value.ptrvalue = (Pointer) nwstr; + } MemFree (name); } } @@ -1210,25 +1696,25 @@ static void EnhanceFeatureAnnotation ( { GmcDataPtr gdp, head; GeneRefPtr grp; - Int2 i, j, k, numgene, numcds, nummrna; + Int2 i, j, k, numgene, numcds, numrna; SeqFeatPtr sfp; if (features == NULL || bsp == NULL) return; numgene = 0; numcds = 0; - nummrna = 0; + numrna = 0; for (sfp = features; sfp != NULL; sfp = sfp->next) { - switch (sfp->idx.subtype) { - case FEATDEF_GENE : + switch (sfp->data.choice) { + case SEQFEAT_GENE : numgene++; break; - case FEATDEF_CDS : + case SEQFEAT_CDREGION : numcds++; break; - case FEATDEF_mRNA : - nummrna++; + case SEQFEAT_RNA : + numrna++; break; default : break; @@ -1269,12 +1755,12 @@ static void EnhanceFeatureAnnotation ( MemFree (head); } - if (nummrna > 0) { - head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (nummrna + 1)); + if (numrna > 0) { + head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1)); if (head != NULL) { gdp = head; for (sfp = features; sfp != NULL; sfp = sfp->next) { - if (sfp->idx.subtype == FEATDEF_mRNA) { + if (sfp->data.choice == SEQFEAT_RNA) { gdp->feat = sfp; grp = SeqMgrGetGeneXref (sfp); if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) { @@ -1283,17 +1769,17 @@ static void EnhanceFeatureAnnotation ( gdp++; } } - HeapSort (head, (size_t) nummrna, sizeof (GmcData), SortByGenePtr); - for (i = 0; i < nummrna; i += j) { + HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr); + for (i = 0; i < numrna; i += j) { sfp = head [i].gene; - for (j = 1; i + j < nummrna && sfp == head [i + j].gene; j++) continue; + for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue; if (j == 1) { /* no alt splicing */ - EnhanceOneMrna (head [i].feat, FALSE); + EnhanceOneRna (head [i].feat, FALSE); } else { /* is alt splicing */ for (k = 0; k < j; k++) { - EnhanceOneMrna (head [i + k].feat, TRUE); + EnhanceOneRna (head [i + k].feat, TRUE); } } } @@ -1564,10 +2050,11 @@ static void ProcessOneNuc ( SeqHistPtr shp; SqnTagPtr stp = NULL; CharPtr str; - SeqEntryPtr top; + CharPtr tmp; CharPtr ttl = NULL; UserObjectPtr uop; ValNodePtr vnp; + SeqMgrDescContext dcontext; if (bsp == NULL) return; @@ -1582,9 +2069,9 @@ static void ProcessOneNuc ( (AsnReadFunc) BioSourceAsnRead, (AsnWriteFunc) BioSourceAsnWrite); } else { - top = GetTopSeqEntryForEntityID (entityID); - if (top != NULL) { - VisitBioSourcesInSep (top, (Pointer) &src, GetFirstBiop); + sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext); + if (sdp != NULL) { + src = sdp->data.ptrvalue; if (src != NULL) { addNewBiop = FALSE; } @@ -1613,6 +2100,11 @@ static void ProcessOneNuc ( if (stp != NULL) { biop = ParseTitleIntoBioSource (stp, tbl->organism, src); ParseTitleIntoBioseq (stp, bsp); + str = SqnTagFind (stp, "comment"); + if (str != NULL) { + tmp = StringSave (str); + SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) tmp); + } } if (biop == NULL) { biop = ParseTitleIntoBioSource (NULL, tbl->organism, src); @@ -1757,6 +2249,27 @@ static void ProcessOneNuc ( ValNodeFreeData (vnp); } +static void ProcessNucBioseqs (SeqEntryPtr top_sep, Uint2 entityID, BioSourcePtr src, TblArgsPtr tbl, MolInfoPtr template_molinfo) +{ + BioseqPtr bsp; + BioseqSetPtr bssp; + SeqEntryPtr sep; + + if (top_sep == NULL || top_sep->data.ptrvalue == NULL) return; + if (IS_Bioseq (top_sep)) { + bsp = (BioseqPtr) top_sep->data.ptrvalue; + if (!ISA_aa (bsp->mol)) { + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); + } + } else if (IS_Bioseq_set (top_sep)) { + bssp = (BioseqSetPtr) top_sep->data.ptrvalue; + for (sep = bssp->seq_set; sep != NULL; sep = sep->next) { + ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo); + } + } +} + + static void ProcessOneAnnot ( SeqAnnotPtr sap, Uint2 entityID, @@ -1952,11 +2465,14 @@ static void ReplaceOneRNA ( ) { - ByteStorePtr bs; - BioseqPtr bsp; - SeqFeatPtr mrna; - SeqIdPtr sip; - CharPtr str, str1, str2; + ByteStorePtr bs; + BioseqPtr bsp; + SeqMgrFeatContext ccontext; + SeqFeatPtr cds, mrna; + SeqIntPtr sintp; + SeqIdPtr sip; + SeqLocPtr slp; + CharPtr str, str1, str2; if (ssp == NULL || ssp->numid < 1) return; @@ -2008,6 +2524,21 @@ static void ReplaceOneRNA ( } */ } + + /* make sure CDS in nuc-prot set is not longer than just-replaced RNA */ + + cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext); + if (cds != NULL) { + slp = cds->location; + if (slp != NULL && slp->choice == SEQLOC_INT) { + sintp = (SeqIntPtr) slp->data.ptrvalue; + if (sintp != NULL) { + if (sintp->from == 0 && sintp->to > bsp->length - 1) { + sintp->to = bsp->length - 1; + } + } + } + } } MemFree (str1); @@ -2134,6 +2665,59 @@ static void SuggestOnePeptide ( SeqLocFree (slp); } +static void RnaProtTrailingCommaFix (SeqFeatPtr sfp, Pointer userdata) + +{ + Char ch; + size_t len; + ProtRefPtr prp; + RnaRefPtr rrp; + CharPtr str; + ValNodePtr vnp; + + if (sfp == NULL) return; + + if (sfp->data.choice == SEQFEAT_PROT) { + prp = (ProtRefPtr) sfp->data.value.ptrvalue; + /* turn trailing space into trailing underscore for validator */ + for (vnp = prp->name; vnp != NULL; vnp = vnp->next) { + str = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (str)) continue; + len = StringLen (str); + if (len < 1) continue; + ch = str [len - 1]; + while (ch == ' ' && len > 2) { + len--; + ch = str [len - 1]; + } + if (ch == ',') { + str [len - 1] = '_'; + str [len] = '\0'; + } + } + } else if (sfp->data.choice == SEQFEAT_RNA) { + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + /* turn trailing space into trailing underscore for validator */ + if (rrp->ext.choice == 1) { + str = rrp->ext.value.ptrvalue; + if (StringDoesHaveText (str)) { + len = StringLen (str); + if (len > 0) { + ch = str [len - 1]; + while (ch == ' ' && len > 2) { + len--; + ch = str [len - 1]; + } + if (ch == ',') { + str [len - 1] = '_'; + str [len] = '\0'; + } + } + } + } + } +} + static Uint2 ProcessOneAsn ( FILE* fp, BioSourcePtr src, @@ -2197,6 +2781,8 @@ static Uint2 ProcessOneAsn ( return 0; } + VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); + if (StringDoesHaveText (localname)) { sip = MakeSeqID (localname); if (sip != NULL) { @@ -2207,7 +2793,7 @@ static Uint2 ProcessOneAsn ( } } - ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); + ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo); return entityID; } @@ -2276,6 +2862,8 @@ static Uint2 ProcessRaw2Delt ( return 0; } + VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); + if (StringDoesHaveText (localname)) { sip = MakeSeqID (localname); if (sip != NULL) { @@ -2355,6 +2943,8 @@ static Uint2 ProcessGappedSet ( return 0; } + VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); + ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); return entityID; @@ -2642,38 +3232,38 @@ static CharPtr ReadContigFile ( rescuedcontigs = ValNodeFreeData (rescuedcontigs); if (sp6_clonep != NULL && *sp6_clonep != NULL) { - sp6_end = StringChr (*sp6_clonep, ','); - if (sp6_end != NULL) { - *sp6_end = '\0'; - sp6_end++; - if (StringICmp (sp6_end, "left") == 0) { - sp6_end = "left"; - } else if (StringICmp (sp6_end, "right") == 0) { - sp6_end = "right"; - } else { - sp6_end = NULL; - } - } - if (sp6_endp != NULL) { - *sp6_endp = sp6_end; - } + sp6_end = StringChr (*sp6_clonep, ','); + if (sp6_end != NULL) { + *sp6_end = '\0'; + sp6_end++; + if (StringICmp (sp6_end, "left") == 0) { + sp6_end = "left"; + } else if (StringICmp (sp6_end, "right") == 0) { + sp6_end = "right"; + } else { + sp6_end = NULL; + } + } + if (sp6_endp != NULL) { + *sp6_endp = sp6_end; + } } if (t7_clonep != NULL && *t7_clonep != NULL) { - t7_end = StringChr (*t7_clonep, ','); - if (t7_end != NULL) { - *t7_end = '\0'; - t7_end++; - if (StringICmp (t7_end, "left") == 0) { - t7_end = "left"; - } else if (StringICmp (t7_end, "right") == 0) { - t7_end = "right"; - } else { - t7_end = NULL; - } - } - if (t7_endp != NULL) { - *t7_endp = t7_end; - } + t7_end = StringChr (*t7_clonep, ','); + if (t7_end != NULL) { + *t7_end = '\0'; + t7_end++; + if (StringICmp (t7_end, "left") == 0) { + t7_end = "left"; + } else if (StringICmp (t7_end, "right") == 0) { + t7_end = "right"; + } else { + t7_end = NULL; + } + } + if (t7_endp != NULL) { + *t7_endp = t7_end; + } } return pstring; @@ -3792,6 +4382,64 @@ static void CopyGene ( MemFree (temp); /* do not SeqFeatFree */ } +static void CopyNcRna ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + BioseqPtr bsp; + SeqFeatPtr copy, temp; + Boolean partial5, partial3; + + if (sfp->data.choice != SEQFEAT_RNA) return; + if (sfp->idx.subtype != FEATDEF_ncRNA) return; + + /* find instantiated product of ncRNA */ + + bsp = BioseqFindFromSeqLoc (sfp->product); + if (bsp == NULL) return; + + CheckSeqLocForPartial (sfp->location, &partial5, &partial3); + + /* copy ncRNA feature fields to paste into new ncRNA feature */ + + temp = AsnIoMemCopy (sfp, + (AsnReadFunc) SeqFeatAsnRead, + (AsnWriteFunc) SeqFeatAsnWrite); + if (temp == NULL) return; + + /* make new ncRNA feature on full-length of transcript */ + + copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_RNA, NULL); + if (copy == NULL) { + SeqFeatFree (temp); + return; + } + + /* paste fields from temp copy of original ncRNA */ + + copy->data.value.ptrvalue = temp->data.value.ptrvalue; + copy->partial = temp->partial; + copy->excpt = temp->excpt; + copy->comment = temp->comment; + copy->qual = temp->qual; + copy->title = temp->title; + copy->ext = temp->ext; + copy->cit = temp->cit; + copy->exp_ev = temp->exp_ev; + copy->xref = temp->xref; + copy->dbxref = temp->dbxref; + copy->pseudo = temp->pseudo; + copy->except_text = temp->except_text; + + SetSeqLocPartial (copy->location, partial5, partial3); + + SeqLocFree (temp->location); + SeqLocFree (temp->product); + MemFree (temp); /* do not SeqFeatFree */ +} + static void ClearRnaProducts ( SeqFeatPtr sfp, Pointer userdata @@ -3858,6 +4506,35 @@ static void FindDupProtFeats ( } } +static void ClearProtFeatStrand ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + SeqIntPtr sintp; + SeqLocPtr slp; + + if (sfp == NULL) return; + if (sfp->data.choice != SEQFEAT_REGION && + sfp->data.choice != SEQFEAT_SITE && + sfp->data.choice != SEQFEAT_BOND && + sfp->data.choice != SEQFEAT_PROT) return; + + slp = SeqLocFindNext (sfp->location, NULL); + while (slp != NULL) { + if (slp->choice == SEQLOC_INT) { + sintp = (SeqIntPtr) slp->data.ptrvalue; + if (sintp != NULL) { + if (sintp->strand != Seq_strand_unknown) { + sintp->strand = Seq_strand_unknown; + } + } + } + slp = SeqLocFindNext (sfp->location, slp); + } +} + static void RemoveDupProtFeats ( BioseqPtr bsp, Pointer userdata @@ -3868,10 +4545,11 @@ static void RemoveDupProtFeats ( if (bsp == NULL) return; if (! ISA_aa (bsp->mol)) return; + VisitFeaturesOnBsp (bsp, NULL, ClearProtFeatStrand); dp.firstprot = NULL; dp.secondprot = NULL; VisitFeaturesOnBsp (bsp, (Pointer) &dp, FindDupProtFeats); - if (dp.firstprot == NULL && dp.secondprot == NULL) return; + if (dp.firstprot == NULL || dp.secondprot == NULL) return; if (AsnIoMemComp ((Pointer) dp.firstprot, (Pointer) dp.secondprot, (AsnWriteFunc) SeqFeatAsnWrite)) { dp.firstprot->idx.deleteme = TRUE; } @@ -4085,7 +4763,7 @@ static CharPtr RnaTypeLabel ( return "RNA"; } -static void AddMrnaTitles ( +static void AddRnaTitles ( SeqFeatPtr rna, CharPtr organism ) @@ -4155,7 +4833,7 @@ static void AddMrnaTitles ( SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); } -static void MakeOneMrnaTitle ( +static void MakeOneRnaTitle ( SeqFeatPtr rna, SeqFeatPtr gene, CharPtr label, @@ -4215,7 +4893,7 @@ static void MakeOneMrnaTitle ( StringCat (str, grp->locus); StringCat (str, ")"); } - StringCat (str, " "); + StringCat (str, ", "); StringCat (str, ptr); } else { StringCat (str, lbl); @@ -4226,24 +4904,16 @@ static void MakeOneMrnaTitle ( } } } - if (cds != NULL) { - StringCat (str, " "); - StringCat (str, typ); - if (ccontext.partialL || ccontext.partialR) { - StringCat (str, ", partial cds."); - } else { - StringCat (str, ", complete cds."); - } - } else { - StringCat (str, " "); - StringCat (str, typ); - StringCat (str, "."); - } + + StringCat (str, ", "); + StringCat (str, typ); + StringCat (str, "."); + SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); MemFree (lbl); } -static void MakeSmartMrnaTitles ( +static void MakeSmartRnaTitles ( BioseqPtr bsp, CharPtr organism ) @@ -4252,13 +4922,13 @@ static void MakeSmartMrnaTitles ( SeqMgrFeatContext context; GmcDataPtr gdp, head; GeneRefPtr grp; - Int2 i, j, k, numgene, nummrna; + Int2 i, j, k, numgene, numrna; SeqFeatPtr sfp; if (bsp == NULL) return; numgene = 0; - nummrna = 0; + numrna = 0; sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context); while (sfp != NULL) { @@ -4267,7 +4937,7 @@ static void MakeSmartMrnaTitles ( numgene++; break; case SEQFEAT_RNA : - nummrna++; + numrna++; break; default : break; @@ -4277,8 +4947,8 @@ static void MakeSmartMrnaTitles ( /* if (numgene == 0) return; */ - if (nummrna > 0) { - head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (nummrna + 1)); + if (numrna > 0) { + head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1)); if (head != NULL) { gdp = head; sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context); @@ -4294,17 +4964,17 @@ static void MakeSmartMrnaTitles ( } sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context); } - HeapSort (head, (size_t) nummrna, sizeof (GmcData), SortByGenePtr); - for (i = 0; i < nummrna; i += j) { + HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr); + for (i = 0; i < numrna; i += j) { sfp = head [i].gene; - for (j = 1; i + j < nummrna && sfp == head [i + j].gene; j++) continue; + for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue; if (j == 1) { /* no alt splicing */ - MakeOneMrnaTitle (head [i].feat, head [i].gene, head [i].label, organism, FALSE); + MakeOneRnaTitle (head [i].feat, head [i].gene, head [i].label, organism, FALSE); } else { /* is alt splicing */ for (k = 0; k < j; k++) { - MakeOneMrnaTitle (head [i + k].feat, head [i + k].gene, head [i + k].label, organism, TRUE); + MakeOneRnaTitle (head [i + k].feat, head [i + k].gene, head [i + k].label, organism, TRUE); } } } @@ -4327,7 +4997,7 @@ static void LookForGo ( Char ch; GoSearchPtr gsp; CharPtr ptr; - Int2 state; + Int4 state; ValNodePtr matches; if (sfp == NULL || StringHasNoText (sfp->comment)) return; @@ -4528,6 +5198,7 @@ static SeqEntryPtr PropagateDescsFromGenBankSet ( } bssp = (BioseqSetPtr) sep->data.ptrvalue; bssp->descr = SeqDescrFree (bssp->descr); + NormalizeDescriptorOrder (sep); return firstsep; } @@ -4752,6 +5423,7 @@ static void AddTemplateDescriptors ( dsc = sdp; } ValNodeLink (current_list, (Pointer) dsc); + sdp->next = sdp_next; } } @@ -5004,159 +5676,138 @@ static void LookupPubdesc ( } -typedef struct globaldiscrepancylists { - ValNodePtr locus_tag_list; - ValNodePtr missing_locus_tag; -} GlobalDiscrepancyListsData, PNTR GlobalDiscrepancyListPtr; -static void CollectGlobalDiscrepancyData ( - SeqFeatPtr sfp, - Pointer userdata -) +#ifdef INTERNAL_NCBI_ASNDISC +const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase; +#else +const PerformDiscrepancyTest taxlookup = NULL; +#endif + +static void CleanupCollectionDatesMonthFirst (BioSourcePtr biop, Pointer data) { - GeneRefPtr grp; - GlobalDiscrepancyListPtr tbl; + SubSourcePtr ssp; + CharPtr reformatted_date = NULL; - if (sfp == NULL || sfp->idx.subtype != FEATDEF_GENE) return; - tbl = (GlobalDiscrepancyListPtr) userdata; - if (tbl == NULL) return; + if (biop == NULL) return; - grp = (GeneRefPtr) sfp->data.value.ptrvalue; - if (grp != NULL) { - if (grp->pseudo) return; - if (StringDoesHaveText (grp->locus_tag)) { - ValNodeAddPointer (&(tbl->locus_tag_list), 0, - GlobalDiscrepancyNew (grp->locus_tag, OBJ_SEQFEAT, sfp)); - } else { - ValNodeAddPointer (&(tbl->missing_locus_tag), 0, - GlobalDiscrepancyNew (NULL, OBJ_SEQFEAT, sfp)); + ssp = biop->subtype; + while (ssp != NULL) + { + if (ssp->subtype == SUBSRC_collection_date) + { + reformatted_date = ReformatDateStringEx (ssp->name, TRUE, NULL); + if (reformatted_date != NULL) + { + ssp->name = MemFree (ssp->name); + ssp->name = reformatted_date; + } } + ssp = ssp->next; } } -static void SaveStringsForDiscrepancyItemList (ValNodePtr list, Boolean use_feature_fmt); - -static void SaveStringsForDiscrepancyItems (ClickableItemPtr cip, Boolean use_feature_fmt) +static void CleanupCollectionDatesDayFirst (BioSourcePtr biop, Pointer data) { - ValNodePtr vnp, list_copy; - CharPtr str; + SubSourcePtr ssp; + CharPtr reformatted_date = NULL; - if (cip == NULL) return; - if (use_feature_fmt) { - list_copy = ReplaceDiscrepancyItemWithFeatureTableStrings (cip->item_list); - cip->item_list = ValNodeFree (cip->item_list); - cip->item_list = list_copy; - } else { - for (vnp = cip->item_list; vnp != NULL; vnp = vnp->next) { - str = GetDiscrepancyItemText (vnp); - vnp->choice = 0; - vnp->data.ptrvalue = str; + if (biop == NULL) return; + + ssp = biop->subtype; + while (ssp != NULL) + { + if (ssp->subtype == SUBSRC_collection_date) + { + reformatted_date = ReformatDateStringEx (ssp->name, FALSE, NULL); + if (reformatted_date != NULL) + { + ssp->name = MemFree (ssp->name); + ssp->name = reformatted_date; + } } + ssp = ssp->next; } - SaveStringsForDiscrepancyItemList (cip->subcategories, use_feature_fmt); } -static void SaveStringsForDiscrepancyItemList (ValNodePtr list, Boolean use_feature_fmt) +static void ValNodeLinkCopy (ValNodePtr PNTR list1, ValNodePtr list2) { - while (list != NULL) { - SaveStringsForDiscrepancyItems (list->data.ptrvalue, use_feature_fmt); - list = list->next; + if (list1 == NULL) return; + while (list2 != NULL) + { + ValNodeAddPointer (list1, list2->choice, list2->data.ptrvalue); + list2 = list2->next; } } +static ValNodePtr FindItemListForClickableItemCategory (ValNodePtr list, CharPtr category_fmt) +{ + ClickableItemPtr cip; + ValNodePtr vnp; + ValNodePtr item_list = NULL; + CharPtr cp; -#ifdef INTERNAL_NCBI_ASNDISC -const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase; -#else -const PerformDiscrepancyTest taxlookup = NULL; -#endif + if (StringLen (category_fmt) < 2) { + return NULL; + } + for (vnp = list; vnp != NULL; vnp = vnp->next) { + cip = (ClickableItemPtr) vnp->data.ptrvalue; + if (cip != NULL) { + if (cip->description != NULL) { + /* skip number at beginning of category title */ + cp = cip->description; + while (isdigit (*cp)) { + cp++; + } + if (StringCmp (cp, category_fmt + 2) == 0) { + ValNodeLinkCopy (&item_list, cip->item_list); + } + } + ValNodeLink (&item_list, FindItemListForClickableItemCategory (cip->subcategories, category_fmt)); + } + } + return item_list; +} -static void DiscrepancyReportOneRecord (TblArgsPtr tbl, SeqEntryPtr sep) +static void DoTbl2AsnCleanup (SeqEntryPtr sep, CleanupArgsPtr c) { - ClickableItemPtr adjacent_cip = NULL; - ValNode sep_list; - ValNodePtr local_discrepancy_list = NULL; - Uint2 entityID; - DiscrepancyConfigData dcd; - GlobalDiscrepancyListsData lists; - GenProdSetDiscrepancyListsData gps_lists; - ProtIdListsData prot_lists; - Int4 k; - - if (tbl == NULL || sep == NULL) return; - - entityID = SeqMgrGetEntityIDForSeqEntry (sep); - if (SeqMgrFeaturesAreIndexed (entityID) == 0) { - SeqMgrIndexFeatures (entityID, NULL); - } + ValNodePtr sep_list = NULL; + ValNodePtr discrepancy_list = NULL, item_list = NULL, vnp; + SeqFeatPtr sfp; - MemSet (&lists, 0, sizeof (GlobalDiscrepancyListsData)); - VisitGenProdSetFeatures (sep, &lists, CollectGlobalDiscrepancyData); - MemSet (&gps_lists, 0, sizeof (GenProdSetDiscrepancyListsData)); - CheckGenProdSetsInSeqEntry (sep, &gps_lists); - MemSet (&prot_lists, 0, sizeof (ProtIdListsData)); - VisitBioseqsInSep (sep, &prot_lists, FindProteinIDCallback); - - if (lists.locus_tag_list != NULL) { - /* collect adjacent genes */ - lists.locus_tag_list = ValNodeSort (lists.locus_tag_list, SortVnpByGlobalDiscrepancyString); - adjacent_cip = FindAdjacentDuplicateLocusTagGenes (lists.locus_tag_list); - if (adjacent_cip != NULL) { - SaveStringsForDiscrepancyItems (adjacent_cip, tbl->disc_rep_config.use_feature_table_format); - ValNodeAddPointer (&(tbl->adjacent_locus_tag_disc_list), 0, adjacent_cip); - } - } - - /* convert lists to strings and add to global lists */ - ConvertGlobalDiscrepancyListToText (lists.locus_tag_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->locus_tag_list), lists.locus_tag_list); - ConvertGlobalDiscrepancyListToText (lists.missing_locus_tag, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->missing_locus_tag), lists.missing_locus_tag); - ConvertGlobalDiscrepancyListToText (gps_lists.cds_product_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->cds_product_list), gps_lists.cds_product_list); - ConvertGlobalDiscrepancyListToText (gps_lists.missing_protein_id, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->missing_cds_product), gps_lists.missing_protein_id); - ConvertGlobalDiscrepancyListToText (gps_lists.mrna_product_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->mrna_product_list), gps_lists.mrna_product_list); - ConvertGlobalDiscrepancyListToText (gps_lists.missing_mrna_product, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->missing_mrna_product), gps_lists.missing_mrna_product); - ConvertGlobalDiscrepancyListToText (prot_lists.gnl_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&tbl->gnl_list, prot_lists.gnl_list); - ConvertGlobalDiscrepancyListToText (prot_lists.missing_gnl_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&tbl->missing_gnl_list, prot_lists.missing_gnl_list); - - /* setup discrepancy report config */ - MemSet (&dcd, 0, sizeof (DiscrepancyConfigData)); - /* enable all tests except tRNA */ - for (k = 0; k < MAX_DISC_TYPE; k++) { - dcd.conf_list[k] = TRUE; - } - DisableTRNATests (&(dcd)); - /* disable tests that are global */ - dcd.conf_list[DISC_GENE_MISSING_LOCUS_TAG] = FALSE; - dcd.conf_list[DISC_GENE_DUPLICATE_LOCUS_TAG] = FALSE; - dcd.conf_list[DISC_GENE_LOCUS_TAG_BAD_FORMAT] = FALSE; - dcd.conf_list[DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX] = FALSE; - dcd.conf_list[DISC_MISSING_GENPRODSET_PROTEIN] = FALSE; - dcd.conf_list[DISC_DUP_GENPRODSET_PROTEIN] = FALSE; - dcd.conf_list[DISC_MISSING_GENPRODSET_TRANSCRIPT_ID] = FALSE; - dcd.conf_list[DISC_DUP_GENPRODSET_TRANSCRIPT_ID] = FALSE; - dcd.conf_list[DISC_MISSING_PROTEIN_ID] = FALSE; - dcd.conf_list[DISC_INCONSISTENT_PROTEIN_ID_PREFIX] = FALSE; - - sep_list.data.ptrvalue = sep; - sep_list.next = NULL; - local_discrepancy_list = CollectDiscrepancies (&dcd, &sep_list, taxlookup); - - SaveStringsForDiscrepancyItemList (local_discrepancy_list, tbl->disc_rep_config.use_feature_table_format); - ValNodeLink (&(tbl->discrepancy_list), local_discrepancy_list); + if (sep == NULL || c == NULL) { + return; + } + if (c->collection_dates) { + if (c->collection_dates_month_first) { + VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesMonthFirst); + } else { + VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesDayFirst); + } + } + if (c->add_notes_to_overlapping_cds_without_abc) { + ValNodeAddPointer (&sep_list, 0, sep); + SeqMgrIndexFeatures (ObjMgrGetEntityIDForChoice (sep), NULL); + AddOverlappingCodingRegionDiscrepancies (&discrepancy_list, sep_list); + sep_list = ValNodeFree (sep_list); + item_list = FindItemListForClickableItemCategory (discrepancy_list, kOverlappingCDSNeedsNoteFmt); + discrepancy_list = FreeClickableList (discrepancy_list); + for (vnp = item_list; vnp != NULL; vnp = vnp->next) { + if (vnp->choice == OBJ_SEQFEAT) { + sfp = (SeqFeatPtr) vnp->data.ptrvalue; + if (sfp != NULL) { + SetStringValue (&(sfp->comment), kOverlappingCDSNoteText, ExistingTextOption_append_semi); + } + } + } + item_list = ValNodeFree (item_list); + } } - + static void ProcessOneRecord ( SubmitBlockPtr sbp, @@ -5219,6 +5870,7 @@ static void ProcessOneRecord ( CharPtr tblfile = NULL; SeqEntryPtr tmp; MolInfoPtr template_molinfo = NULL; + ValNodePtr cmt_errors, vnp; fp = OpenOneFile (directory, base, suffix); if (fp == NULL) return; @@ -5374,6 +6026,20 @@ static void ProcessOneRecord ( FileClose (fp); } + /* read structured comments from .cmt file */ + fp = OpenOneFile (directory, base, ".cmt"); + if (fp != NULL) { + sep = GetTopSeqEntryForEntityID (entityID); + cmt_errors = CreateStructuredCommentsFromFile (fp, sep); + FileClose (fp); + if (cmt_errors != NULL) { + for (vnp = cmt_errors; vnp != NULL; vnp = vnp->next) { + Message (MSG_POSTERR, "Error processing structured comment (.cmt) file: %s", vnp->data.ptrvalue); + } + cmt_errors = ValNodeFreeData (cmt_errors); + } + } + /* read one or more protein sequences from .pep file */ fp = OpenOneFile (directory, base, ".pep"); @@ -5622,8 +6288,12 @@ static void ProcessOneRecord ( if (tbl->genprodset) { VisitFeaturesInSep (sep, NULL, CopyGene); } + if (tbl->genprodset) { + /* currently copying ncRNA feature onto product */ + VisitFeaturesInSep (sep, NULL, CopyNcRna); + } if (! tbl->genprodset) { - VisitFeaturesInSep (sep, NULL, ClearRnaProducts); + VisitFeaturesInSep (sep, NULL, ClearRnaProducts); } if (tbl->removeunnecxref) { @@ -5648,11 +6318,11 @@ static void ProcessOneRecord ( bsp = FindNucBioseq (sep); if (tbl->smarttitle) { - MakeSmartMrnaTitles (bsp, organism); + MakeSmartRnaTitles (bsp, organism); } else { sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context); while (sfp != NULL) { - AddMrnaTitles (sfp, organism); + AddRnaTitles (sfp, organism); sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context); } } @@ -5675,6 +6345,8 @@ static void ProcessOneRecord ( /* SeriousSeqEntryCleanup (sep, NULL, NULL); */ + ConvertFullLenSourceFeatToDesc (sep); + ConvertFullLenPubFeatToDesc (sep); if (tbl->linkbyoverlap) { SeqMgrIndexFeatures (entityID, NULL); LinkCDSmRNAbyOverlap (sep); @@ -5683,6 +6355,9 @@ static void ProcessOneRecord ( LinkCDSmRNAbyProduct (sep); } + DoTbl2AsnCleanup (sep, &(tbl->cleanup_args)); + NormalizeDescriptorOrder (sep); + if (StringHasNoText (results)) { results = directory; } @@ -5691,9 +6366,10 @@ static void ProcessOneRecord ( atp_bssse = AsnFind ("Bioseq-set.seq-set.E"); if (atp_bssse == NULL) { Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E"); - } else if (tbl->fastaset) { + } else if (tbl->fastaset && tbl->whichclass == 0) { /* already has genbank wrapper, write individual components */ tmp = PropagateDescsFromGenBankSet (sep); + SeqMgrClearFeatureIndexes (entityID, NULL); while (tmp != NULL) { SeqEntryAsnWrite (tmp, aip, atp_bssse); tmp = tmp->next; @@ -5702,8 +6378,9 @@ static void ProcessOneRecord ( SeqEntryAsnWrite (sep, aip, atp_bssse); } } else { - if (tbl->fastaset) { + if (tbl->fastaset && tbl->whichclass == 0) { PropagateDescsFromGenBankSet (sep); + SeqMgrClearFeatureIndexes (entityID, NULL); } WriteOneFile (results, base, ".sqn", outfile, sep, sbp, tbl->save_bioseq_set); } @@ -5712,11 +6389,11 @@ static void ProcessOneRecord ( Message (MSG_OK, "Illegal GO term format detected in note - contact database for instructions"); } - if (tbl->discrepancy) { - DiscrepancyReportOneRecord (tbl, sep); + if (tbl->global_report != NULL) { + AddSeqEntryToGlobalDiscrepReport (sep, tbl->global_report, base); } - if (tbl->validate || tbl->flatfile) { + if (tbl->validate || tbl->flatfile || tbl->genereport || tbl->validate_barcode) { if (pdp != NULL) { /* copy in citsub as publication for validator and flatfile */ @@ -5733,9 +6410,12 @@ static void ProcessOneRecord ( Message (MSG_POST, "Flatfile %s\n", base); FlatfileOneFile (results, base, ".gbf", sep); } - if (tbl->validate) { + if (tbl->validate || tbl->validate_barcode) { Message (MSG_POST, "Validating %s\n", base); - ValidateOneFile (results, base, ".val", sep, tbl->relaxed); + ValidateOneFile (results, base, ".val", sep, tbl->validate, tbl->relaxed, tbl->validate_barcode); + } + if (tbl->genereport) { + GeneReportOneFile (results, base, ".t2g", sep); } } } @@ -5744,126 +6424,6 @@ static void ProcessOneRecord ( } -static void DoDiscrepancySummary ( - TblArgsPtr tbl, - FILE *fp -) - -{ - ValNodePtr local_list = NULL; - ClickableItemPtr cip; - - if (tbl == NULL) return; - - tbl->locus_tag_list = ValNodeSort (tbl->locus_tag_list, SortVnpByGlobalDiscrepancyString); - tbl->missing_locus_tag = ValNodeSort (tbl->missing_locus_tag, SortVnpByGlobalDiscrepancyString); - tbl->cds_product_list = ValNodeSort (tbl->cds_product_list, SortVnpByGlobalDiscrepancyString); - tbl->missing_cds_product = ValNodeSort (tbl->missing_cds_product, SortVnpByGlobalDiscrepancyString); - tbl->mrna_product_list = ValNodeSort (tbl->mrna_product_list, SortVnpByGlobalDiscrepancyString); - tbl->missing_mrna_product = ValNodeSort (tbl->missing_mrna_product, SortVnpByGlobalDiscrepancyString); - - if (tbl->locus_tag_list != NULL) { - if (tbl->missing_locus_tag != NULL) { - cip = ReportMissingFields (tbl->missing_locus_tag, discReportMissingLocusTags, DISC_GENE_MISSING_LOCUS_TAG); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - } - CollateDiscrepancyReports (&(tbl->adjacent_locus_tag_disc_list)); - cip = ReportNonUniqueGlobalDiscrepancy (tbl->locus_tag_list, - discReportDuplicateLocusTagFmt, - discReportOneDuplicateLocusTagFmt, - DISC_GENE_DUPLICATE_LOCUS_TAG, - TRUE); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - if (tbl->adjacent_locus_tag_disc_list != NULL) { - ValNodeLink (&(cip->subcategories), tbl->adjacent_locus_tag_disc_list); - } - } else if (tbl->adjacent_locus_tag_disc_list != NULL) { - ValNodeLink (&local_list, tbl->adjacent_locus_tag_disc_list); - } - tbl->adjacent_locus_tag_disc_list = NULL; - - /* inconsistent locus tags */ - ValNodeLink (&local_list, - ReportInconsistentGlobalDiscrepancyPrefixes (tbl->locus_tag_list, - discReportInconsistentLocusTagPrefixFmt, - DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX)); - /* bad formats */ - cip = ReportBadLocusTagFormat (tbl->locus_tag_list); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - } - - if (tbl->cds_product_list != NULL) { - /* report duplicates */ - cip = ReportNonUniqueGlobalDiscrepancy (tbl->cds_product_list, - discReportDuplicateProteinIDFmt, - discReportOneDuplicateProteinIDFmt, - DISC_DUP_GENPRODSET_PROTEIN, - TRUE); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - - /* report inconsistent IDs */ - ValNodeLink (&local_list, - ReportInconsistentGlobalDiscrepancyPrefixes (tbl->cds_product_list, - discReportInconsistentProteinIDPrefixFmt, - DISC_INCONSISTENT_PROTEIN_ID_PREFIX)); - } - - if (tbl->mrna_product_list != NULL) { - if (tbl->missing_locus_tag != NULL) { - cip = ReportMissingFields (tbl->mrna_product_list, discReportMissingTranscriptIDFmt, DISC_MISSING_GENPRODSET_TRANSCRIPT_ID); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - } - - cip = ReportNonUniqueGlobalDiscrepancy (tbl->mrna_product_list, - discReportDuplicateTranscriptIdFmt, - discReportOneDuplicateTranscriptIdFmt, - DISC_DUP_GENPRODSET_TRANSCRIPT_ID, - TRUE); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - } - - /* missing gnl protein IDs */ - cip = ReportMissingFields (tbl->missing_gnl_list, discReportBadProteinIdFmt, DISC_MISSING_PROTEIN_ID); - if (cip != NULL) { - ValNodeAddPointer (&local_list, 0, cip); - } - tbl->gnl_list = ValNodeSort (tbl->gnl_list, SortVnpByGlobalDiscrepancyString); - ValNodeLink (&local_list, - ReportInconsistentGlobalDiscrepancyStrings (tbl->gnl_list, - discReportInconsistentProteinIDPrefixFmt, - DISC_INCONSISTENT_PROTEIN_ID_PREFIX)); - - - tbl->locus_tag_list = FreeGlobalDiscrepancyList (tbl->locus_tag_list); - tbl->missing_locus_tag = FreeGlobalDiscrepancyList (tbl->missing_locus_tag); - tbl->cds_product_list = FreeGlobalDiscrepancyList (tbl->cds_product_list); - tbl->missing_cds_product = FreeGlobalDiscrepancyList (tbl->missing_cds_product); - tbl->mrna_product_list = FreeGlobalDiscrepancyList (tbl->mrna_product_list); - tbl->missing_mrna_product = FreeGlobalDiscrepancyList (tbl->missing_mrna_product); - tbl->missing_gnl_list = FreeGlobalDiscrepancyList (tbl->missing_gnl_list); - tbl->gnl_list = FreeGlobalDiscrepancyList (tbl->gnl_list); - - - /* group discrepany reports from separate files */ - CollateDiscrepancyReports (&(tbl->discrepancy_list)); - - - WriteAsnDiscReport (local_list, fp, &(tbl->disc_rep_config), TRUE); - local_list = FreeClickableList (local_list); - - WriteAsnDiscReport (tbl->discrepancy_list, fp, &(tbl->disc_rep_config), TRUE); -} static CharPtr overwriteMsg = "Your template with a .sqn suffix will be overwritten. Do you wish to continue?"; @@ -5912,6 +6472,7 @@ static void FileRecurse ( CharPtr directory, CharPtr results, CharPtr suffix, + Boolean recurse, SeqDescrPtr sdphead, TblArgsPtr tbl, TextFsaPtr gotags, @@ -5951,14 +6512,14 @@ static void FileRecurse ( } } } - } else if (vnp->choice == 1) { + } else if (vnp->choice == 1 && recurse) { /* recurse into subdirectory */ StringNCpy_0 (path, directory, sizeof (path)); str = (CharPtr) vnp->data.ptrvalue; FileBuildPath (path, str, NULL); - FileRecurse (sbp, pdp, src, path, results, suffix, sdphead, tbl, gotags, aip, outfile); + FileRecurse (sbp, pdp, src, path, results, suffix, recurse, sdphead, tbl, gotags, aip, outfile); } } @@ -6050,7 +6611,8 @@ static AsnTypePtr DoFirstPrefix ( } static AsnTypePtr DoSecondPrefix ( - AsnIoPtr aip + AsnIoPtr aip, + TblArgsPtr tbl ) { @@ -6090,7 +6652,23 @@ static AsnTypePtr DoSecondPrefix ( if (! AsnOpenStruct (aip, bssp_atp, (Pointer) &bs)) return NULL; - av.intvalue = BioseqseqSet_class_genbank; + switch (tbl->whichclass) { + case 1 : + av.intvalue = BioseqseqSet_class_pop_set; + break; + case 2 : + av.intvalue = BioseqseqSet_class_phy_set; + break; + case 3 : + av.intvalue = BioseqseqSet_class_mut_set; + break; + case 4 : + av.intvalue = BioseqseqSet_class_eco_set; + break; + default : + av.intvalue = BioseqseqSet_class_genbank; + break; + } if (! AsnWrite (aip, atp_bsc, &av)) return NULL; if (! AsnOpenStruct (aip, atp_bsss, (Pointer) &bs.seq_set)) return NULL; @@ -6275,6 +6853,60 @@ static CharPtr ParseCommaField ( return str; } +static DatePtr DateParse ( + CharPtr str +) + +{ + Int4 day = -1, month = -1, year = -1; + DatePtr dp; + CharPtr ptr; + Char tmp [64]; + long int val; + + if (StringHasNoText (str)) return NULL; + + StringNCpy_0 (tmp, str, sizeof (tmp)); + ptr = StringChr (tmp, '/'); + if (ptr == NULL) { + ptr = StringChr (tmp, '-'); + } + if (ptr != NULL) { + *ptr = '\0'; + ptr++; + if (sscanf (tmp, "%ld", &val) == 1) { + month = (Int4) val; + } + str = StringChr (ptr, '/'); + if (str == NULL) { + str = StringChr (ptr, '-'); + } + if (str != NULL) { + *str = '\0'; + str++; + if (sscanf (ptr, "%ld", &val) == 1) { + day = (Int4) val; + } + if (sscanf (str, "%ld", &val) == 1) { + year = (Int4) val; + } + } + } + + if (month < 0 || day < 0 || year < 2000) return NULL; + if (month > 12 || day > 31 || year > 2099) return NULL; + + dp = DateNew (); + if (dp == NULL) return NULL; + + dp->data [0] = 1; + dp->data [1] = (Uint1) (year - 1900); + dp->data [2] = (Uint1) month; + dp->data [3] = (Uint1) day; + + return dp; +} + /* Args structure contains command-line arguments */ #define p_argInputPath 0 @@ -6282,44 +6914,41 @@ static CharPtr ParseCommaField ( #define i_argInputFile 2 #define o_argOutputFile 3 #define x_argSuffix 4 -#define t_argTemplate 5 -#define a_argType 6 -#define s_argFastaSet 7 -#define w_argWhichClass 8 -#define d_argDeltaSet 9 -#define l_argAlignment 10 -#define z_argGapped 11 -#define e_argPhrapAce 12 -#define g_argGenProdSet 13 -#define F_argFeatIdLinks 14 -#define H_argImplicitGaps 15 -#define A_argAccession 16 -#define C_argCenter 17 -#define n_argOrgName 18 -#define j_argSrcQuals 19 -#define y_argComment 20 -#define Y_argCommentFile 21 -#define D_argDescrsFile 22 -#define f_argTableFile 23 -#define k_argCdsFlags 24 -#define c_argFindOrf 25 -#define V_argVerify 26 -#define v_argValidate 27 -#define b_argGenBank 28 -#define q_argFileID 29 -#define u_argUndoGPS 30 -#define h_argGnlToNote 31 -#define G_argGapFields 32 -#define R_argRemote 33 -#define S_argSmartFeats 34 -#define Q_argSmartTitle 35 -#define U_argUnnecXref 36 -#define L_argLocalID 37 -#define T_argTaxLookup 38 -#define P_argPubLookup 39 -#define W_argLogProgress 40 -#define K_argBioseqSet 41 -#define Z_argDiscRepFile 42 +#define E_argRecurse 5 +#define t_argTemplate 6 +#define a_argType 7 +#define s_argFastaSet 8 +#define g_argGenProdSet 9 +#define F_argFeatIdLinks 10 +#define A_argAccession 11 +#define C_argCenter 12 +#define n_argOrgName 13 +#define j_argSrcQuals 14 +#define y_argComment 15 +#define Y_argCommentFile 16 +#define D_argDescrsFile 17 +#define f_argTableFile 18 +#define k_argCdsFlags 19 +#define V_argVerify 20 +#define v_argValidate 21 +#define b_argGenBank 22 +#define q_argFileID 23 +#define u_argUndoGPS 24 +#define h_argGnlToNote 25 +#define G_argGapFields 26 +#define R_argRemote 27 +#define S_argSmartFeats 28 +#define Q_argSmartTitle 29 +#define U_argUnnecXref 30 +#define L_argLocalID 31 +#define T_argTaxLookup 32 +#define P_argPubLookup 33 +#define W_argLogProgress 34 +#define K_argBioseqSet 35 +#define H_argHoldUntilPub 36 +#define Z_argDiscRepFile 37 +#define c_argCleanupOptions 38 + Args myargs [] = { {"Path to Files", NULL, NULL, NULL, @@ -6332,6 +6961,8 @@ Args myargs [] = { TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, {"Suffix", ".fsa", NULL, NULL, TRUE, 'x', ARG_STRING, 0.0, 0, NULL}, + {"Recurse", "F", NULL, NULL, + TRUE, 'E', ARG_BOOLEAN, 0.0, 0, NULL}, {"Template File", NULL, NULL, NULL, TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL}, {"File Type\n" @@ -6346,22 +6977,10 @@ Args myargs [] = { TRUE, 'a', ARG_STRING, 0.0, 0, NULL}, {"Read FASTAs as Set", "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Fasta Set Class (1 Pop, 2 Phy, 3 Mut, 4 Eco) (obsolete: use -a s1-4)", "0", "0", "4", - FALSE, 'w', ARG_INT, 0.0, 0, NULL}, - {"Read FASTAs as Delta (obsolete: use -a d)", "F", NULL, NULL, - TRUE, 'd', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Read FASTA+Gap Alignment (obsolete: use -a l)", "F", NULL, NULL, - TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Read FASTAs with Gap Lines (obsolete: use -a z)", "F", NULL, NULL, - TRUE, 'z', ARG_BOOLEAN, 0.0, 0, NULL}, - {"Read PHRAP/ACE Format (obsolete: use -a e)", "F", NULL, NULL, - TRUE, 'e', ARG_BOOLEAN, 0.0, 0, NULL}, {"Genomic Product Set", "F", NULL, NULL, TRUE, 'g', ARG_BOOLEAN, 0.0, 0, NULL}, {"Feature ID Links (o by Overlap, p by Product)", NULL, NULL, NULL, TRUE, 'F', ARG_STRING, 0.0, 0, NULL}, - {"Implicit Gaps (obsolete: use -a di)", "F", NULL, NULL, - TRUE, 'H', ARG_BOOLEAN, 0.0, 0, NULL}, {"Accession", NULL, NULL, NULL, TRUE, 'A', ARG_STRING, 0.0, 0, NULL}, {"Genome Center Tag", NULL, NULL, NULL, @@ -6384,12 +7003,11 @@ Args myargs [] = { " m Allow Alternative Starts\n" " k Set Conflict on Mismatch\n", NULL, NULL, NULL, TRUE, 'k', ARG_STRING, 0.0, 0, NULL}, - {"Annotate Longest ORF (obsolete: use -k c)", "F", NULL, NULL, - TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL}, {"Verification (combine any of the following letters)\n" " v Validate with Normal Stringency\n" " r Validate without Country Check\n" - " b Generate GenBank Flatfile\n", NULL, NULL, NULL, + " b Generate GenBank Flatfile\n" + " g Generate Gene Report\n", NULL, NULL, NULL, TRUE, 'V', ARG_STRING, 0.0, 0, NULL}, {"Validate (obsolete: use -V v)", "F", NULL, NULL, TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL}, @@ -6424,8 +7042,18 @@ Args myargs [] = { TRUE, 'W', ARG_BOOLEAN, 0.0, 0, NULL}, {"Save Bioseq-set", "F", NULL, NULL, TRUE, 'K', ARG_BOOLEAN, 0.0, 0, NULL}, + {"Hold Until Publish\n" + " y Hold for One Year\n" + " mm/dd/yyyy\n", NULL, NULL, NULL, + TRUE, 'H', ARG_STRING, 0.0, 0, NULL}, {"Discrepancy Report Output File", NULL, NULL, NULL, TRUE, 'Z', ARG_FILE_OUT, 0.0, 0, NULL}, + {"Cleanup (combine any of the following letters)\n" + " d Correct Collection Dates (assume month first)\n" + " D Correct Collection Dates (assume day first)\n" + " b Append note to coding regions that overlap other coding regions with similar product names and do not contain 'ABC'", + NULL, NULL, NULL, + TRUE, 'c', ARG_STRING, 0.0, 0, NULL}, }; Int2 Main (void) @@ -6440,15 +7068,18 @@ Int2 Main (void) Uint2 datatype; CharPtr descrs; CharPtr directory; + DatePtr dp; FILE *fp; Char gapstring [128]; TextFsaPtr gotags; + CharPtr hold; CharPtr os; CharPtr outfile; Pubdesc pd; PubdescPtr pdp = NULL; ValNode pb; CharPtr ptr; + Boolean recurse; Boolean remote; CharPtr results; SubmitBlockPtr sbp = NULL; @@ -6509,6 +7140,7 @@ Int2 Main (void) results = NULL; } suffix = (CharPtr) myargs [x_argSuffix].strvalue; + recurse = (Boolean) myargs [E_argRecurse].intvalue; base = (CharPtr) myargs [i_argInputFile].strvalue; outfile = (CharPtr) myargs [o_argOutputFile].strvalue; if (StringHasNoText (outfile)) { @@ -6517,6 +7149,8 @@ Int2 Main (void) tmplate = (CharPtr) myargs [t_argTemplate].strvalue; descrs = (CharPtr) myargs [D_argDescrsFile].strvalue; + hold = (CharPtr) myargs [H_argHoldUntilPub].strvalue; + if (StringHasNoText(directory) && StringHasNoText(base)) { Message (MSG_FATAL, "You must supply either an input file (-i) or an input directory (-p).\nUse -p . to specify the current directory.\n\n"); return 1; @@ -6525,33 +7159,6 @@ Int2 Main (void) MemSet ((Pointer) &tbl, 0, sizeof (TblArgs)); - /* process obsolete format arguments first, warn if used */ - - tbl.whichclass = (Boolean) myargs [w_argWhichClass].intvalue; - if (tbl.whichclass) { - Message (MSG_POST, "-w is obsolete, use -a s1-4 instead"); - } - tbl.deltaset = (Boolean) myargs [d_argDeltaSet].intvalue; - if (tbl.deltaset) { - Message (MSG_POST, "-d is obsolete, use -a d instead"); - } - tbl.alignset = (Boolean) myargs [l_argAlignment].intvalue; - if (tbl.alignset) { - Message (MSG_POST, "-l is obsolete, use -a l instead"); - } - tbl.gapped = (Boolean) myargs [z_argGapped].intvalue; - if (tbl.gapped) { - Message (MSG_POST, "-z is obsolete, use -a z instead"); - } - tbl.phrapace = (Boolean) myargs [e_argPhrapAce].intvalue; - if (tbl.phrapace) { - Message (MSG_POST, "-e is obsolete, use -a e instead"); - } - tbl.implicitgaps = (Boolean) myargs [H_argImplicitGaps].intvalue; - if (tbl.implicitgaps) { - Message (MSG_POST, "-H is obsolete, use -a di instead"); - } - /* -s is heavily used and will remain as an alternative to -a s */ tbl.fastaset = (Boolean) myargs [s_argFastaSet].intvalue; @@ -6611,13 +7218,6 @@ Int2 Main (void) tbl.comment = (CharPtr) myargs [y_argComment].strvalue; tbl.commentFile = ReadCommentFile ((CharPtr) myargs [Y_argCommentFile].strvalue); - /* process obsolete findorf argument first, warn if used */ - - tbl.findorf = (Boolean) myargs [c_argFindOrf].intvalue; - if (tbl.findorf) { - Message (MSG_POST, "-c is obsolete, use -k c instead"); - } - ptr = myargs [k_argCdsFlags].strvalue; if (StringChr (ptr, 'c') != NULL) { tbl.findorf = TRUE; @@ -6661,7 +7261,12 @@ Int2 Main (void) if (StringChr (ptr, 'b') != NULL) { tbl.flatfile = TRUE; } - + if (StringChr (ptr, 'g') != NULL) { + tbl.genereport = TRUE; + } + if (StringChr (ptr, 'c') != NULL) { + tbl.validate_barcode = TRUE; + } tbl.seqidfromfile = (Boolean) myargs [q_argFileID].intvalue; @@ -6675,8 +7280,18 @@ Int2 Main (void) tbl.save_bioseq_set = (Boolean) myargs [K_argBioseqSet].intvalue; disc_rep_file = (CharPtr) myargs [Z_argDiscRepFile].strvalue; - if (!StringHasNoText (disc_rep_file)) { - tbl.discrepancy = TRUE; + if (StringHasNoText (disc_rep_file)) { + tbl.global_report = NULL; + } else { + tbl.global_report = GlobalDiscrepReportNew(); + tbl.global_report->test_config = DiscrepancyConfigNew (); + ConfigureForGenomes (tbl.global_report->test_config); + tbl.global_report->taxlookup = taxlookup; + tbl.global_report->output_config->summary_report = FALSE; + tbl.global_report->output_config->expand_report_categories[DISC_SUPERFLUOUS_GENE] = TRUE; + tbl.global_report->output_config->expand_report_categories[DISC_RNA_CDS_OVERLAP] = TRUE; + tbl.global_report->output_config->expand_report_categories[DISC_SUSPECT_PRODUCT_NAME] = TRUE; + tbl.global_report->output_config->expand_report_categories[DISC_OVERLAPPING_CDS] = TRUE; } @@ -6735,6 +7350,25 @@ Int2 Main (void) return 1; } + /* arguments for cleanup */ + MemSet (&(tbl.cleanup_args), 0, sizeof (CleanupArgsData)); + ptr = (CharPtr) myargs [c_argCleanupOptions].strvalue; + if (StringChr (ptr, 'd') != NULL) { + if (StringChr (ptr, 'D') != NULL) { + Message (MSG_FATAL, "Cannot use both d and D options for cleanup. Choose one."); + return 1; + } + tbl.cleanup_args.collection_dates = TRUE; + tbl.cleanup_args.collection_dates_month_first = TRUE; + } else if (StringChr (ptr, 'D') != NULL) { + tbl.cleanup_args.collection_dates = TRUE; + tbl.cleanup_args.collection_dates_month_first = FALSE; + } + + if (StringChr (ptr, 'b') != NULL) { + tbl.cleanup_args.add_notes_to_overlapping_cds_without_abc = TRUE; + } + if (StringHasNoText (base) && (StringDoesHaveText (tbl.accn))) { Message (MSG_FATAL, "Accession can be entered only for a single record"); return 1; @@ -6790,6 +7424,24 @@ Int2 Main (void) MemFree (os); sbp->hup = FALSE; sbp->reldate = DateFree (sbp->reldate); + if (StringDoesHaveText (hold)) { + if (StringICmp (hold, "y") == 0) { + sbp->hup = TRUE; + dp = DateCurr (); + sbp->reldate = dp; + if (dp != NULL) { + if (dp->data [0] == 1) { + (dp->data [1])++; + } + } + } else { + dp = DateParse (hold); + if (dp != NULL) { + sbp->hup = TRUE; + sbp->reldate = dp; + } + } + } csp = sbp->cit; if (csp != NULL) { csp->date = DateFree (csp->date); @@ -6870,7 +7522,7 @@ Int2 Main (void) return 1; } ssp_atp = DoFirstPrefix (aip, sbp); - bssp_atp = DoSecondPrefix (aip); + bssp_atp = DoSecondPrefix (aip, &tbl); } if (StringDoesHaveText (base)) { @@ -6885,7 +7537,7 @@ Int2 Main (void) } else { - FileRecurse (sbp, pdp, src, directory, results, suffix, sdphead, &tbl, gotags, aip, NULL); + FileRecurse (sbp, pdp, src, directory, results, suffix, recurse, sdphead, &tbl, gotags, aip, NULL); } if (aip != NULL) { @@ -6894,14 +7546,11 @@ Int2 Main (void) AsnIoClose (aip); } - if (tbl.discrepancy) { - tbl.disc_rep_config.summary_report = FALSE; - tbl.disc_rep_config.expand_report_categories[DISC_SUPERFLUOUS_GENE] = TRUE; - tbl.disc_rep_config.expand_report_categories[DISC_RNA_CDS_OVERLAP] = TRUE; - tbl.disc_rep_config.expand_report_categories[DISC_SUSPECT_PRODUCT_NAME] = TRUE; + if (tbl.global_report != NULL) { fp = FileOpen (disc_rep_file, "w"); - DoDiscrepancySummary (&tbl, fp); + WriteGlobalDiscrepancyReport (tbl.global_report, fp); FileClose (fp); + tbl.global_report = GlobalDiscrepReportFree (tbl.global_report); } if (sbp != NULL) { |