diff options
Diffstat (limited to 'api')
-rw-r--r-- | api/alignval.c | 8 | ||||
-rw-r--r-- | api/asn2gnb1.c | 135 | ||||
-rw-r--r-- | api/asn2gnb2.c | 263 | ||||
-rw-r--r-- | api/asn2gnb3.c | 141 | ||||
-rw-r--r-- | api/asn2gnb4.c | 257 | ||||
-rw-r--r-- | api/asn2gnb5.c | 60 | ||||
-rw-r--r-- | api/asn2gnb6.c | 104 | ||||
-rw-r--r-- | api/asn2gnbi.h | 11 | ||||
-rw-r--r-- | api/asn2gnbk.h | 4 | ||||
-rw-r--r-- | api/edutil.c | 68 | ||||
-rw-r--r-- | api/edutil.h | 12 | ||||
-rw-r--r-- | api/explore.h | 4 | ||||
-rw-r--r-- | api/findrepl.c | 966 | ||||
-rw-r--r-- | api/findrepl.h | 27 | ||||
-rw-r--r-- | api/salsap.c | 646 | ||||
-rw-r--r-- | api/salsap.h | 8 | ||||
-rw-r--r-- | api/seqmgr.c | 234 | ||||
-rw-r--r-- | api/seqmgr.h | 43 | ||||
-rw-r--r-- | api/seqport.c | 300 | ||||
-rw-r--r-- | api/sequtil.c | 379 | ||||
-rw-r--r-- | api/sequtil.h | 9 | ||||
-rw-r--r-- | api/sqnutil1.c | 262 | ||||
-rw-r--r-- | api/sqnutil2.c | 194 | ||||
-rw-r--r-- | api/sqnutil3.c | 114 | ||||
-rw-r--r-- | api/sqnutils.h | 32 | ||||
-rw-r--r-- | api/subutil.c | 142 | ||||
-rw-r--r-- | api/subutil.h | 94 | ||||
-rw-r--r-- | api/tofasta.c | 148 | ||||
-rw-r--r-- | api/txalign.c | 16 | ||||
-rw-r--r-- | api/utilpars.c | 13 | ||||
-rw-r--r-- | api/utilpars.h | 5 | ||||
-rw-r--r-- | api/valid.c | 1423 | ||||
-rw-r--r-- | api/valid.h | 11 | ||||
-rw-r--r-- | api/valid.msg | 48 | ||||
-rw-r--r-- | api/validerr.h | 11 |
35 files changed, 5180 insertions, 1012 deletions
diff --git a/api/alignval.c b/api/alignval.c index 141b366d..753eeb34 100644 --- a/api/alignval.c +++ b/api/alignval.c @@ -29,7 +29,7 @@ * * Version Creation Date: 6/3/99 * -* $Revision: 6.45 $ +* $Revision: 6.46 $ * * File Description: To validate sequence alignment. * @@ -2093,9 +2093,9 @@ static void ValidateSeqAlignInHist (SeqHistPtr hist, SaValPtr svp) SeqAlignPtr salp; if (hist == NULL) return; - for (salp = hist->assembly; salp != NULL; salp = salp->next) { - ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty); - } + salp = hist->assembly; + /* ValidateSeqAlign will validate the entire chain */ + ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty); } static void ValidateSeqAlignCallback (SeqEntryPtr sep, Pointer mydata, diff --git a/api/asn2gnb1.c b/api/asn2gnb1.c index 3ef4f95d..9e55f9b1 100644 --- a/api/asn2gnb1.c +++ b/api/asn2gnb1.c @@ -28,11 +28,11 @@ * Author: Karl Sirotkin, Tom Madden, Tatiana Tatusov, Jonathan Kans, * Mati Shomrat * -* $Id: asn2gnb1.c,v 1.85 2005/12/01 20:09:32 kans Exp $ +* $Id: asn2gnb1.c,v 1.97 2006/02/23 16:38:54 kans Exp $ * * Version Creation Date: 10/21/98 * -* $Revision: 1.85 $ +* $Revision: 1.97 $ * * File Description: New GenBank flatfile generator - work in progress * @@ -367,7 +367,7 @@ NLM_EXTERN CharPtr DateToFF ( } if (day < 1) { - sprintf (buf, "??-%s-%ld", + sprintf (buf, "\?\?-%s-%ld", month_names [month-1], (long) year); } else if (day < 10) { sprintf (buf, "0%ld-%s-%ld", @@ -1313,7 +1313,7 @@ NLM_EXTERN void FFLineWrap ( FFSavePosition(dest, &line_start, &line_pos); - // for EMBL 'XX' lines + /* for EMBL 'XX' lines */ if (eb_line_prefix != NULL) { cont = FALSE; if (break_pos > 1) { @@ -2998,7 +2998,7 @@ static Boolean IsSepRefseq ( } typedef struct modeflags { - Boolean flags [27]; + Boolean flags [29]; } ModeFlags, PNTR ModeFlagsPtr; 
static ModeFlags flagTable [] = { @@ -3009,7 +3009,7 @@ static ModeFlags flagTable [] = { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, - TRUE, TRUE}, + TRUE, TRUE, TRUE, TRUE}, /* ENTREZ_MODE */ {FALSE, TRUE, TRUE, TRUE, TRUE, @@ -3017,7 +3017,7 @@ static ModeFlags flagTable [] = { TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, - TRUE, FALSE}, + TRUE, TRUE, TRUE, FALSE}, /* SEQUIN_MODE */ {FALSE, FALSE, FALSE, FALSE, FALSE, @@ -3025,7 +3025,7 @@ static ModeFlags flagTable [] = { FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, - TRUE, FALSE}, + FALSE, TRUE, FALSE, FALSE}, /* DUMP_MODE */ {FALSE, FALSE, FALSE, FALSE, FALSE, @@ -3033,7 +3033,7 @@ static ModeFlags flagTable [] = { FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, - FALSE, FALSE} + FALSE, FALSE, FALSE, FALSE} }; static void SetFlagsFromMode ( @@ -3083,7 +3083,9 @@ static void SetFlagsFromMode ( ajp->flags.refSeqQualsToNote = *(bp++); ajp->flags.selenocysteineToNote = *(bp++); + ajp->flags.pyrrolysineToNote = *(bp++); ajp->flags.extraProductsToNote = *(bp++); + ajp->flags.codonRecognizedToNote = *(bp++); ajp->flags.forGbRelease = *(bp++); /* unapproved qualifiers suppressed for flatfile, okay for GBSeq XML */ @@ -3109,6 +3111,7 @@ static void SetFlagsFromMode ( /* selenocysteine always a separate qualifier for RefSeq */ ajp->flags.selenocysteineToNote = FALSE; + ajp->flags.pyrrolysineToNote = FALSE; } else { @@ -3128,6 +3131,7 @@ static void SetFlagsFromMode ( /* selenocysteine always a separate qualifier for RefSeq */ ajp->flags.selenocysteineToNote = FALSE; + ajp->flags.pyrrolysineToNote = FALSE; } } @@ -3363,19 +3367,57 @@ static void MakeGapFeats ( } } +static void LookForFeatFetchPolicy ( + SeqDescrPtr sdp, + Pointer userdata +) + +{ + BoolPtr forceOnlyNearFeatsP; + ObjectIdPtr oip; + UserFieldPtr 
ufp; + UserObjectPtr uop; + + if (sdp == NULL || sdp->choice != Seq_descr_user) return; + forceOnlyNearFeatsP = (BoolPtr) userdata; + if (forceOnlyNearFeatsP == NULL) return; + + uop = (UserObjectPtr) sdp->data.ptrvalue; + if (uop == NULL) return; + oip = uop->type; + if (oip == NULL) return; + if (StringCmp (oip->str, "FeatureFetchPolicy") != 0) return; + + for (ufp = uop->data; ufp != NULL; ufp = ufp->next) { + oip = ufp->label; + if (oip == NULL || ufp->data.ptrvalue == NULL) continue; + if (StringCmp (oip->str, "Policy") == 0) { + if (StringICmp ((CharPtr) ufp->data.ptrvalue, "OnlyNearFeatures") == 0) { + *forceOnlyNearFeatsP = TRUE; + } + } + } +} + +static CharPtr bad_html_strings [] = { + "<script", "<object", "<applet", "<embed", "<form", "javascript:", NULL +}; + static CharPtr defHead = "\ -Content-type: text/html\n\n\ -<HTML>\n\ -<HEAD><TITLE>GenBank entry</TITLE></HEAD>\n\ -<BODY>\n\ -<hr>\n\ +<html>\n\ +<head>\n\ +<meta http-equiv=\"Content-Type\" content=\"text/html; charset=us-ascii\" />\ +<title>GenBank entry</title>\n\ +</head>\n\ +<body>\n\ +<hr />\n\ <pre>"; static CharPtr defTail = "\ </pre>\n\ -<hr>\n\ -</BODY>\n\ -</HTML>\n"; +<hr />\n\ +</body>\n\ +</html>\n"; #define FAR_TRANS_MASK (SHOW_FAR_TRANSLATION | TRANSLATE_IF_NO_PRODUCT | ALWAYS_TRANSLATE_CDS) #define FEAT_FETCH_MASK (ONLY_NEAR_FEATURES | FAR_FEATURES_SUPPRESS | NEAR_FEATURES_SUPPRESS) @@ -3411,6 +3453,7 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( CharPtr ffhead = NULL; CharPtr fftail = NULL; Asn2gbWriteFunc ffwrite = NULL; + Boolean forceOnlyNearFeats = FALSE; ValNodePtr gapvnp = NULL; GBSeqPtr gbseq = NULL; Int4 i; @@ -3450,6 +3493,7 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( BaseBlockPtr PNTR paragraphByIDs; BioseqPtr parent = NULL; Int4 prevGi = 0; + Int2 q; Pointer remotedata = NULL; Asn2gbFreeFunc remotefree = NULL; Asn2gbLockFunc remotelock = NULL; @@ -3577,6 +3621,8 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( ajp = (IntAsn2gbJobPtr) MemNew (sizeof (IntAsn2gbJob)); if (ajp == 
NULL) return NULL; + VisitDescriptorsInSep (sep, (Pointer) &forceOnlyNearFeats, LookForFeatFetchPolicy); + gapvnp = NULL; if (format != FTABLE_FMT) { if (isG || isTPG || isOnlyLocal || isRefSeq || (isGeneral && (! isGED))) { @@ -3782,6 +3828,8 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( if ((Boolean) ((flags & FEAT_FETCH_MASK) == ONLY_NEAR_FEATURES)) { aw.onlyNearFeats = TRUE; + } else if (forceOnlyNearFeats) { + aw.onlyNearFeats = TRUE; } else { aw.nearFeatsSuppress = TRUE; } @@ -3794,11 +3842,17 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( if ((Boolean) ((flags & FEAT_FETCH_MASK) == ONLY_NEAR_FEATURES)) { aw.onlyNearFeats = TRUE; + } else if (forceOnlyNearFeats) { + aw.onlyNearFeats = TRUE; } else { aw.nearFeatsSuppress = TRUE; } ajp->showFarTransl = TRUE; + } else if (forceOnlyNearFeats) { + + aw.onlyNearFeats = TRUE; + } else { aw.onlyNearFeats = (Boolean) ((flags & FEAT_FETCH_MASK) == ONLY_NEAR_FEATURES); @@ -3861,6 +3915,9 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( if (mode == SEQUIN_MODE || mode == DUMP_MODE) { aw.showBaseCount = TRUE; } + aw.forcePrimaryBlock = (Boolean) ((flags & FORCE_PRIMARY_BLOCK) != 0); + + aw.localFeatCount = VisitFeaturesInSep (sep, NULL, NULL); aw.hup = FALSE; aw.ssp = NULL; @@ -3879,6 +3936,12 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( } } + ajp->bad_html_fsa = TextFsaNew (); + + for (q = 0; bad_html_strings [q] != NULL; q++) { + TextFsaAdd (ajp->bad_html_fsa, bad_html_strings [q]); + } + oldscope = SeqEntrySetScope (sep); if (stream) { @@ -3896,6 +3959,9 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( if (ffwrite != NULL) { ffwrite (ffhead, userdata, HEAD_BLOCK); } + if (is_html) { + DoQuickLinkFormat (aw.afp, "<div class=\"sequence\">"); + } } /* if Web Entrez, set awp->sectionMax to decide when Next hyperlink is needed */ @@ -3927,6 +3993,10 @@ static Asn2gbJobPtr asn2gnbk_setup_ex ( } if (stream) { + if (is_html) { + DoQuickLinkFormat (aw.afp, "</div>"); + } + /* send optional tail string */ if (fftail == NULL && is_html) { @@ -4686,16 
+4756,16 @@ static void PrintBioSourceFtableEntry ( sprintf (str, "\t\t\tidentified_by\t"); break; case SUBSRC_fwd_primer_seq : - sprintf (str, "\t\t\tleft_primer\t"); + sprintf (str, "\t\t\tfwd_pcr_primer_seq\t"); break; case SUBSRC_rev_primer_seq : - sprintf (str, "\t\t\tright_primer\t"); + sprintf (str, "\t\t\trev_pcr_primer_seq\t"); break; case SUBSRC_fwd_primer_name : - sprintf (str, "\t\t\tleft_primer\t"); + sprintf (str, "\t\t\tfwd_pcr_primer_name\t"); break; case SUBSRC_rev_primer_name : - sprintf (str, "\t\t\tright_primer\t"); + sprintf (str, "\t\t\trev_pcr_primer_name\t"); break; case SUBSRC_other : sprintf (str, "\t\t\tnote\t"); @@ -5266,16 +5336,21 @@ NLM_EXTERN void DoImmediateFormat ( ) { - BlockType blocktype; - BioseqPtr bsp; - FormatProc fmt; - size_t max; - SeqEntryPtr oldscope; - QualValPtr qv = NULL; - SeqEntryPtr sep; - CharPtr str = NULL; + IntAsn2gbJobPtr ajp; + BlockType blocktype; + BioseqPtr bsp; + FormatProc fmt; + Boolean is_www; + size_t max; + SeqEntryPtr oldscope; + QualValPtr qv = NULL; + SeqEntryPtr sep; + CharPtr str = NULL; if (afp == NULL || bbp == NULL) return; + ajp = afp->ajp; + if (ajp == NULL) return; + is_www = GetWWW (ajp); blocktype = bbp->blocktype; if (blocktype < LOCUS_BLOCK || blocktype > SLASH_BLOCK) return; @@ -5557,6 +5632,8 @@ NLM_EXTERN Asn2gbJobPtr asn2gnbk_cleanup ( } } + TextFsaFree (iajp->bad_html_fsa); + ValNodeFree (iajp->gihead); free_buff (); diff --git a/api/asn2gnb2.c b/api/asn2gnb2.c index d0a0f1bb..c8353c89 100644 --- a/api/asn2gnb2.c +++ b/api/asn2gnb2.c @@ -30,7 +30,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 1.61 $ +* $Revision: 1.69 $ * * File Description: New GenBank flatfile generator - work in progress * @@ -359,6 +359,45 @@ static Boolean LocusHasBadChars ( return FALSE; } +static CharPtr gbseq_strd [4] = { + NULL, "single", "double", "mixed" +}; + +static CharPtr gbseq_mol [10] = { + "?", "DNA", "RNA", "tRNA", "rRNA", "mRNA", "uRNA", "snRNA", "snoRNA", "AA" +}; + +static CharPtr 
gbseq_top [3] = { + NULL, "linear", "circular" +}; + +static void LookupAccnForNavLink ( + Int4 gi, + CharPtr seqid, + size_t len, + CharPtr dfault +) + +{ + SeqIdPtr sip; + + if (seqid == NULL) return; + *seqid = '\0'; + if (gi > 0) { + if (GetAccnVerFromServer (gi, seqid)) return; + sip = GetSeqIdForGI (gi); + if (sip != NULL) { + if (SeqIdWrite (sip, seqid, PRINTID_TEXTID_ACC_VER, len) != NULL) { + SeqIdFree (sip); + return; + } + SeqIdFree (sip); + } + } + if (dfault == NULL) return; + StringCpy (seqid, dfault); +} + NLM_EXTERN void AddLocusBlock ( Asn2gbWorkPtr awp, Boolean willshowwgs, @@ -376,7 +415,7 @@ NLM_EXTERN void AddLocusBlock ( BioSourcePtr biop; Int2 bmol = 0; BioseqPtr bsp; - Char buf [512]; + Char buf [1024]; SeqFeatPtr cds; Int4 currGi; Char date [40]; @@ -396,6 +435,7 @@ NLM_EXTERN void AddLocusBlock ( ValNodePtr gilistpos; Char gi_buf [16]; SeqIdPtr gpp = NULL; + Boolean has_next_pref_ul = FALSE; Boolean hasComment; Char id [41]; Int2 imol = 0; @@ -424,6 +464,8 @@ NLM_EXTERN void AddLocusBlock ( Int4 prevGi; SeqDescrPtr sdp; Char sect [128]; + Char seg [32]; + Char seqid [128]; SeqFeatPtr sfp; SeqHistPtr hist; SeqIdPtr sip; @@ -435,6 +477,7 @@ NLM_EXTERN void AddLocusBlock ( UserObjectPtr uop; ValNodePtr vnp; Boolean wgsmaster = FALSE; + Int2 moltype, strandedness, topol; if (awp == NULL) return; ajp = awp->ajp; @@ -1005,9 +1048,26 @@ NLM_EXTERN void AddLocusBlock ( gbseq->locus = StringSave (locus); gbseq->length = length; gbseq->division = StringSave (div); + /* gbseq->strandedness = bsp->strand; gbseq->moltype = imolToMoltype [imol]; gbseq->topology = topology; + */ + strandedness = (Int2) bsp->strand; + if (strandedness < 0 || strandedness > 3) { + strandedness = 0; + } + gbseq->strandedness = StringSave (gbseq_strd [strandedness]); + moltype = (Int2) imolToMoltype [imol]; + if (moltype < 0 || moltype > 9) { + moltype = 0; + } + gbseq->moltype = StringSave (gbseq_mol [moltype]); + topol = (Int2) topology; + if (topol < 0 || topol > 2) { + 
topol = 0; + } + gbseq->topology = StringSave (gbseq_top [topol]); for (sip = bsp->id; sip != NULL; sip = sip->next) { SeqIdWrite (sip, id, PRINTID_FASTA_SHORT, sizeof (id)); @@ -1095,7 +1155,6 @@ NLM_EXTERN void AddLocusBlock ( DoQuickLinkFormat (awp->afp, buf); buf [0] = '\0'; - prefix = NULL; hasComment = (Boolean) (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_comment, &dcontext) != NULL); if (! hasComment) { hasComment = (Boolean) (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_region, &dcontext) != NULL); @@ -1134,43 +1193,33 @@ NLM_EXTERN void AddLocusBlock ( } buf [0] = '\0'; - StringCpy (buf, "<div class=\"localnav\"><ul class=\"locallinks\">"); + StringCpy (buf, "<div class=\"localnav\"><ul class=\"locals\">"); if (hasComment) { - sprintf (sect, "<li><a href=\"#comment_%ld\">Comment</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#comment_%ld\" title=\"Jump to the comment section of this record\">Comment</a></li>", (long) awp->currGi); StringCat (buf, sect); } - sprintf (sect, "<li><a href=\"#feature_%ld\">Features</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#feature_%ld\" title=\"Jump to the feature table of this record\">Features</a></li>", (long) awp->currGi); StringCat (buf, sect); if (willshowwgs) { - sprintf (sect, "<li><a href=\"#wgs_%ld\">WGS</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#wgs_%ld\" title=\"Jump to WGS section of this record\">WGS</a></li>", (long) awp->currGi); StringCat (buf, sect); } if (willshowgenome) { - sprintf (sect, "<li><a href=\"#genome_%ld\">Genome</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#genome_%ld\" title=\"Jump to the genome section of this record\">Genome</a></li>", (long) awp->currGi); StringCat (buf, sect); } if (willshowcontig) { - sprintf (sect, "<li><a 
href=\"#contig_%ld\">Contig</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#contig_%ld\" title=\"Jump to the contig section of this record\">Contig</a></li>", (long) awp->currGi); StringCat (buf, sect); } if (willshowsequence) { - sprintf (sect, "<li><a href=\"#sequence_%ld\">Sequence</a></li>", (long) awp->currGi); - StringCat (buf, prefix); - prefix = " "; + sprintf (sect, "<li><a href=\"#sequence_%ld\" title=\"Jump to the sequence of this record\">Sequence</a></li>", (long) awp->currGi); StringCat (buf, sect); } + StringCat (buf, "</ul>"); + prevGi = 0; currGi = 0; nextGi = 0; @@ -1194,30 +1243,55 @@ NLM_EXTERN void AddLocusBlock ( } } while (gilistpos != NULL && currGi != awp->currGi); + has_next_pref_ul = FALSE; if (currGi == awp->currGi && nextGi > 0 && awp->sectionCount < awp->sectionMax) { - sprintf (sect, "<li class=\"localnext\"><a href=\"#locus_%ld\">Next</a></li>", (long) nextGi); - StringCat (buf, prefix); - prefix = " "; + if (! has_next_pref_ul) { + StringCat (buf, "<ul class=\"nextprevlinks\">"); + has_next_pref_ul = TRUE; + } + LookupAccnForNavLink (nextGi, seqid, sizeof (seqid), "the next record"); + if (awp->seg + 1 > 0 && awp->numsegs > 0 && awp->seg + 1 <= awp->numsegs) { + sprintf (seg, " (segment %d of %ld)", (int) (awp->seg + 1), (long) awp->numsegs); + StringCat (seqid, seg); + } + sprintf (sect, "<li class=\"next\"><a href=\"#locus_%ld\" title=\"Jump to %s\">Next</a></li>", (long) nextGi, seqid); StringCat (buf, sect); } else if (awp->nextGi > 0) { - sprintf (sect, "<li class=\"localnext\"><a href=\"#locus_%ld\">Next</a></li>", (long) awp->nextGi); - StringCat (buf, prefix); - prefix = " "; + if (! 
has_next_pref_ul) { + StringCat (buf, "<ul class=\"nextprevlinks\">"); + has_next_pref_ul = TRUE; + } + LookupAccnForNavLink (nextGi, seqid, sizeof (seqid), "the next record"); + sprintf (sect, "<li class=\"next\"><a href=\"#locus_%ld\" title=\"Jump to %s\">Next</a></li>", (long) awp->nextGi, seqid); StringCat (buf, sect); } if (currGi == awp->currGi && prevGi > 0 && awp->sectionCount > 1) { - sprintf (sect, "<li class=\"localprev\"><a href=\"#locus_%ld\">Previous</a></li>", (long) prevGi); - StringCat (buf, prefix); - prefix = " "; + if (! has_next_pref_ul) { + StringCat (buf, "<ul class=\"nextprevlinks\">"); + has_next_pref_ul = TRUE; + } + LookupAccnForNavLink (prevGi, seqid, sizeof (seqid), "the previous record"); + if (awp->seg - 1 > 0 && awp->numsegs > 0 && awp->seg - 1 <= awp->numsegs) { + sprintf (seg, " (segment %d of %ld)", (int) (awp->seg - 1), (long) awp->numsegs); + StringCat (seqid, seg); + } + sprintf (sect, "<li class=\"prev\"><a href=\"#locus_%ld\" title=\"Jump to %s\">Previous</a></li>", (long) prevGi, seqid); StringCat (buf, sect); } else if (awp->prevGi > 0) { - sprintf (sect, "<li class=\"localprev\"><a href=\"#locus_%ld\">Previous</a></li>", (long) awp->prevGi); - StringCat (buf, prefix); - prefix = " "; + if (! 
has_next_pref_ul) { + StringCat (buf, "<ul class=\"nextprevlinks\">"); + has_next_pref_ul = TRUE; + } + LookupAccnForNavLink (prevGi, seqid, sizeof (seqid), "the previous record"); + sprintf (sect, "<li class=\"prev\"><a href=\"#locus_%ld\" title=\"Jump to %s\">Previous</a></li>", (long) awp->prevGi, seqid); StringCat (buf, sect); } - StringCat (buf, "</ul></div>\n"); + if (has_next_pref_ul) { + StringCat (buf, "</ul>"); + } + StringCat (buf, "</div>\n"); + StringCat (buf, "<pre class=\"genbank\">"); DoQuickLinkFormat (awp->afp, buf); } @@ -1922,11 +1996,128 @@ NLM_EXTERN void AddVersionBlock ( } } +static void FF_asn2gb_www_projID ( + StringItemPtr ffstring, + CharPtr projID +) + +{ + FFAddOneString (ffstring, "<a href=", FALSE, FALSE, TILDE_IGNORE); + FFAddOneString (ffstring, link_projid, FALSE, FALSE, TILDE_IGNORE); + FFAddOneString (ffstring, projID, FALSE, FALSE, TILDE_IGNORE); + FFAddOneString(ffstring, ">", FALSE, FALSE, TILDE_IGNORE); + FFAddOneString (ffstring, projID, FALSE, FALSE, TILDE_IGNORE); + FFAddOneString(ffstring, "</a>", FALSE, FALSE, TILDE_IGNORE); +} + NLM_EXTERN void AddProjectBlock ( Asn2gbWorkPtr awp ) { + IntAsn2gbJobPtr ajp; + BaseBlockPtr bbp; + BioseqPtr bsp; + Char buf [32]; + UserFieldPtr curr; + SeqMgrDescContext dcontext; + StringItemPtr ffstring; + UserObjectPtr gpuop = NULL; + Uint4 itemID; + ObjectIdPtr oip; + Int4 parentID; + CharPtr prefix; + Int4 projectID; + SeqDescrPtr sdp; + UserObjectPtr uop; + Int4 val; + + if (awp == NULL) return; + ajp = awp->ajp; + if (ajp == NULL) return; + bsp = awp->bsp; + if (bsp == NULL) return; + + if (! 
ISA_na (bsp->mol)) return; + if (awp->format != GENBANK_FMT) return; + + sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext); + while (sdp != NULL) { + uop = (UserObjectPtr) sdp->data.ptrvalue; + if (uop != NULL) { + oip = uop->type; + if (oip != NULL && StringICmp (oip->str, "GenomeProjectsDB") == 0) { + gpuop = uop; + itemID = dcontext.itemID; + } + } + sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &dcontext); + } + if (gpuop == NULL) return; + + ffstring = FFGetString (ajp); + if ( ffstring == NULL ) return; + + bbp = Asn2gbAddBlock (awp, PROJECT_BLOCK, sizeof (BaseBlock)); + if (bbp == NULL) return; + + bbp->entityID = awp->entityID; + bbp->itemID = itemID; + bbp->itemtype = OBJ_SEQDESC; + + FFStartPrint (ffstring, awp->format, 0, 12, "PROJECT", 12, 5, 5, "XX", TRUE); + + prefix = "GenomeProject:"; + projectID = 0; + parentID = 0; + for (curr = gpuop->data; curr != NULL; curr = curr->next) { + oip = curr->label; + if (oip == NULL) continue; + if (StringICmp (oip->str, "ProjectID") == 0) { + if (curr->choice == 2) { + val = (Int4) curr->data.intvalue; + if (projectID > 0) { + sprintf (buf, "%ld", (long) projectID); + FFAddOneString (ffstring, prefix, FALSE, FALSE, TILDE_IGNORE); + if (GetWWW (ajp)) { + FF_asn2gb_www_projID (ffstring, buf); + } else { + FFAddOneString (ffstring, buf, FALSE, FALSE, TILDE_IGNORE); + } + /* + FFAddTextToString (ffstring, prefix, buf, NULL, FALSE, FALSE, TILDE_IGNORE); + */ + prefix = ","; + parentID = 0; + } + projectID = val; + } + } else if (StringICmp (oip->str, "ParentID") == 0) { + if (curr->choice == 2) { + val = (Int4) curr->data.intvalue; + parentID = val; + } + } + } + if (projectID > 0) { + sprintf (buf, "%ld", (long) projectID); + FFAddOneString (ffstring, prefix, FALSE, FALSE, TILDE_IGNORE); + if (GetWWW (ajp)) { + FF_asn2gb_www_projID (ffstring, buf); + } else { + FFAddOneString (ffstring, buf, FALSE, FALSE, TILDE_IGNORE); + } + /* + FFAddTextToString (ffstring, prefix, buf, NULL, FALSE, 
FALSE, TILDE_IGNORE); + */ + } + + bbp->string = FFEndPrint (ajp, ffstring, awp->format, 12, 12, 5, 5, "XX"); + FFRecycleString (ajp, ffstring); + + if (awp->afp != NULL) { + DoImmediateFormat (awp->afp, bbp); + } } /* only displaying PID in GenPept format */ @@ -5355,7 +5546,7 @@ NLM_EXTERN void AddSlashBlock ( if (GetWWW (ajp) && awp->mode == ENTREZ_MODE && awp->afp != NULL && (awp->format == GENBANK_FMT || awp->format == GENPEPT_FMT)) { - sprintf (buf, "//\n<a name=\"slash_%ld\"></a>", (long) awp->currGi); + sprintf (buf, "//</pre>\n<a name=\"slash_%ld\"></a>", (long) awp->currGi); str = StringSave (buf); } else { str = MemNew(sizeof(Char) * 4); diff --git a/api/asn2gnb3.c b/api/asn2gnb3.c index 177e02bf..fc3948f8 100644 --- a/api/asn2gnb3.c +++ b/api/asn2gnb3.c @@ -30,7 +30,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 1.48 $ +* $Revision: 1.55 $ * * File Description: New GenBank flatfile generator - work in progress * @@ -705,12 +705,14 @@ static Boolean DoGetAnnotationComment ( ) { + Int2 ce = 0, cm = 0; SeqMgrDescContext dcontext; CharPtr method = NULL; UserObjectPtr moduop; CharPtr name = NULL; ObjectIdPtr oip; SeqDescrPtr sdp; + UserFieldPtr u; UserFieldPtr ufp; UserObjectPtr uop; @@ -726,14 +728,32 @@ static Boolean DoGetAnnotationComment ( for (ufp = uop->data; ufp != NULL; ufp = ufp->next) { oip = ufp->label; if (oip == NULL) continue; - if (StringCmp(oip->str, "Contig Name") == 0) { + if (StringCmp (oip->str, "Contig Name") == 0) { name = (CharPtr) ufp->data.ptrvalue; - } else if (StringCmp(oip->str, "Method") == 0) { + } else if (StringCmp (oip->str, "Method") == 0) { method = (CharPtr) ufp->data.ptrvalue; - } else if (StringCmp(oip->str, "mRNA") == 0) { + } else if (StringCmp (oip->str, "mRNA") == 0) { *mrnaEv = TRUE; - } else if (StringCmp(oip->str, "EST") == 0) { + } else if (StringCmp (oip->str, "EST") == 0) { *estEv = TRUE; + } else if (StringCmp (oip->str, "Counts") == 0) { + for (u = (UserFieldPtr) ufp->data.ptrvalue; u != NULL; u = 
u->next) { + if (u->data.ptrvalue == NULL) continue; + if (u->choice != 2) continue; + oip = u->label; + if (oip == NULL) continue; + if (StringCmp (oip->str, "mRNA") == 0) { + cm = (Int2) u->data.intvalue; + if (cm > 0) { + *mrnaEv = TRUE; + } + } else if (StringCmp (oip->str, "EST") == 0) { + ce = (Int2) u->data.intvalue; + if (ce > 0) { + *estEv = TRUE; + } + } + } } } } @@ -999,6 +1019,10 @@ static CharPtr GetPrimaryStrForDelta ( } else { id = GetSeqIdForGI (gi); } + if (id == NULL) { + sprintf (buf, "%ld", (long) gi); + accn = TRUE; + } } else { id = SeqIdDup (sip); } @@ -1368,10 +1392,8 @@ NLM_EXTERN void AddPrimaryBlock ( hist = bsp->hist; if ((! IsTpa (bsp, has_tpa_assembly, &isRefSeq)) || hist == NULL || hist->assembly == NULL) { - if (awp->contig) { - /* + if (awp->forcePrimaryBlock) { AddAltPrimaryBlock (awp); - */ } return; } @@ -1499,10 +1521,9 @@ NLM_EXTERN void AddCommentBlock ( { size_t acclen; - /* SeqMgrAndContext acontext; AnnotDescPtr adp; - */ + Boolean annotDescCommentToComment; IntAsn2gbJobPtr ajp; BioseqPtr bsp; Char buf [1024]; @@ -1548,6 +1569,7 @@ NLM_EXTERN void AddCommentBlock ( CharPtr str; Char taxID [32]; TextSeqIdPtr tsip; + UserFieldPtr ufp; UserObjectPtr uop; CharPtr wgsaccn = NULL; CharPtr wgsname = NULL; @@ -2582,38 +2604,61 @@ NLM_EXTERN void AddCommentBlock ( /* look for Seq-annot.desc.comment on annots packaged on current bioseq */ - /* - adp = SeqMgrGetNextAnnotDesc (bsp, NULL, Annot_descr_comment, &acontext); + annotDescCommentToComment = FALSE; + adp = SeqMgrGetNextAnnotDesc (bsp, NULL, Annot_descr_user, &acontext); while (adp != NULL) { - str = (CharPtr) adp->data.ptrvalue; - if (StringDoesHaveText (str)) { - cbp = (CommentBlockPtr) Asn2gbAddBlock (awp, COMMENT_BLOCK, sizeof (CommentBlock)); - if (cbp != NULL) { + uop = (UserObjectPtr) adp->data.ptrvalue; + if (uop != NULL) { + oip = uop->type; + if (oip != NULL) { + if (StringCmp (oip->str, "AnnotDescCommentPolicy") == 0) { + for (ufp = uop->data; ufp != NULL; ufp = 
ufp->next) { + oip = ufp->label; + if (oip == NULL || ufp->data.ptrvalue == NULL) continue; + if (StringCmp (oip->str, "Policy") == 0) { + if (StringICmp ((CharPtr) ufp->data.ptrvalue, "ShowInComment") == 0) { + annotDescCommentToComment = TRUE; + } + } + } + } + } + } + adp = SeqMgrGetNextAnnotDesc (bsp, adp, Annot_descr_user, &acontext); + } - cbp->entityID = awp->entityID; - cbp->first = first; - first = FALSE; + if (annotDescCommentToComment) { + adp = SeqMgrGetNextAnnotDesc (bsp, NULL, Annot_descr_comment, &acontext); + while (adp != NULL) { + str = (CharPtr) adp->data.ptrvalue; + if (StringDoesHaveText (str)) { + cbp = (CommentBlockPtr) Asn2gbAddBlock (awp, COMMENT_BLOCK, sizeof (CommentBlock)); + if (cbp != NULL) { - if (cbp->first) { - FFStartPrint (ffstring, awp->format, 0, 12, "COMMENT", 12, 5, 5, "CC", TRUE); - } else { - FFStartPrint (ffstring, awp->format, 0, 12, NULL, 12, 5, 5, "CC", FALSE); - } + cbp->entityID = awp->entityID; + cbp->first = first; + first = FALSE; - FFAddOneString (ffstring, str, TRUE, FALSE, TILDE_EXPAND); + if (cbp->first) { + FFStartPrint (ffstring, awp->format, 0, 12, "COMMENT", 12, 5, 5, "CC", TRUE); + } else { + FFStartPrint (ffstring, awp->format, 0, 12, NULL, 12, 5, 5, "CC", FALSE); + } - cbp->string = FFEndPrint (ajp, ffstring, awp->format, 12, 12, 5, 5, "CC"); - FFRecycleString (ajp, ffstring); - ffstring = FFGetString (ajp); + FFAddOneString (ffstring, str, TRUE, FALSE, TILDE_EXPAND); - if (awp->afp != NULL) { - DoImmediateFormat (awp->afp, (BaseBlockPtr) cbp); + cbp->string = FFEndPrint (ajp, ffstring, awp->format, 12, 12, 5, 5, "CC"); + FFRecycleString (ajp, ffstring); + ffstring = FFGetString (ajp); + + if (awp->afp != NULL) { + DoImmediateFormat (awp->afp, (BaseBlockPtr) cbp); + } } } + adp = SeqMgrGetNextAnnotDesc (bsp, adp, Annot_descr_comment, &acontext); } - adp = SeqMgrGetNextAnnotDesc (bsp, adp, Annot_descr_comment, &acontext); } - */ FFRecycleString(ajp, ffstring); } @@ -4546,7 +4591,10 @@ static Boolean 
LIBCALLBACK GetFeatsOnBioseq ( ifp->mapToPep = FALSE; ifp->firstfeat = awp->firstfeat; awp->firstfeat = FALSE; - awp->featseen = TRUE; + /* this allows remote SNP, CDD, MGC, etc., not to be treated as local annotation */ + if (awp->entityID != fbp->entityID || fbp->itemID <= awp->localFeatCount) { + awp->featseen = TRUE; + } awp->featjustseen = TRUE; if (fcontext->seqfeattype == SEQFEAT_PROT) { @@ -4999,7 +5047,7 @@ NLM_EXTERN void AddFeatureBlock ( if (awp->format == GENPEPT_FMT && ISA_aa (bsp->mol)) { cds = SeqMgrGetCDSgivenProduct (bsp, &fcontext); if (cds != NULL && cds->data.choice == SEQFEAT_CDREGION) { - /* if protein bioseq and cds feature but no nucleotide, cannot index cds, so skip */ + if (fcontext.entityID > 0 && fcontext.itemID > 0) { fbp = (FeatBlockPtr) Asn2gbAddBlock (awp, FEATURE_BLOCK, sizeof (IntCdsBlock)); @@ -5023,6 +5071,31 @@ NLM_EXTERN void AddFeatureBlock ( DoImmediateFormat (awp->afp, (BaseBlockPtr) fbp); } } + } else if (cds->idx.entityID > 0 && cds->idx.itemID > 0) { + + /* if protein bioseq and cds feature but no nucleotide, handle as special case */ + + fbp = (FeatBlockPtr) Asn2gbAddBlock (awp, FEATURE_BLOCK, sizeof (IntCdsBlock)); + if (fbp != NULL) { + + fbp->entityID = cds->idx.entityID; + fbp->itemID = cds->idx.itemID; + fbp->itemtype = OBJ_SEQFEAT; + fbp->featdeftype = FEATDEF_CDS; + ifp = (IntFeatBlockPtr) fbp; + ifp->mapToNuc = FALSE; + ifp->mapToProt = TRUE; + ifp->mapToGen = FALSE; + ifp->mapToMrna = FALSE; + ifp->mapToPep = FALSE; + ifp->isCDS = TRUE; + ifp->firstfeat = awp->firstfeat; + awp->firstfeat = FALSE; + + if (awp->afp != NULL) { + DoImmediateFormat (awp->afp, (BaseBlockPtr) fbp); + } + } } } prot = SeqMgrGetPROTgivenProduct (bsp, &fcontext); diff --git a/api/asn2gnb4.c b/api/asn2gnb4.c index f43ff114..2aebe7ad 100644 --- a/api/asn2gnb4.c +++ b/api/asn2gnb4.c @@ -30,7 +30,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 1.85 $ +* $Revision: 1.98 $ * * File Description: New GenBank flatfile generator - work in 
progress * @@ -110,10 +110,12 @@ static FtQualType feat_qual_order [] = { FTQUAL_pseudo, FTQUAL_selenocysteine, + FTQUAL_pyrrolysine, FTQUAL_codon_start, FTQUAL_anticodon, + FTQUAL_trna_codons, FTQUAL_bound_moiety, FTQUAL_clone, FTQUAL_compare, @@ -179,7 +181,7 @@ static FtQualType feat_note_order [] = { FTQUAL_transcript_id_note, /* !!! remove October 15, 2003 !!! */ FTQUAL_gene_desc, FTQUAL_gene_syn, - FTQUAL_trna_codons, + FTQUAL_trna_codons_note, FTQUAL_encodes, FTQUAL_prot_desc, FTQUAL_prot_note, @@ -194,6 +196,7 @@ static FtQualType feat_note_order [] = { FTQUAL_exception_note, FTQUAL_region, FTQUAL_selenocysteine_note, + FTQUAL_pyrrolysine_note, FTQUAL_prot_names, FTQUAL_bond, FTQUAL_site, @@ -294,6 +297,8 @@ static FeaturQual asn2gnbk_featur_quals [ASN2GNBK_TOTAL_FEATUR] = { { "prot_names", Qual_class_protnames }, { "protein_id", Qual_class_seq_id }, { "pseudo", Qual_class_boolean }, + { "pyrrolysine", Qual_class_boolean }, + { "pyrrolysine", Qual_class_string }, { "region", Qual_class_region }, { "region_name", Qual_class_string }, { "replace", Qual_class_replace }, @@ -321,6 +326,7 @@ static FeaturQual asn2gnbk_featur_quals [ASN2GNBK_TOTAL_FEATUR] = { { "transposon", Qual_class_quote }, { "trans_splicing", Qual_class_boolean }, { "trna_aa", Qual_class_ignore }, + { "codon_recognized", Qual_class_trna_codons }, { "trna_codons", Qual_class_trna_codons }, { "usedin", Qual_class_usedin }, { "xtra_products", Qual_class_xtraprds } @@ -456,10 +462,12 @@ static CharPtr trnaList [] = { "tRNA-Gly", "tRNA-His", "tRNA-Ile", + "tRNA-Xle", "tRNA-Lys", "tRNA-Leu", "tRNA-Met", "tRNA-Asn", + "tRNA-Pyl", "tRNA-Pro", "tRNA-Gln", "tRNA-Arg", @@ -1578,9 +1586,10 @@ static void GetStrFormRNAEvidence ( ) { + Int2 ce = 0, cm = 0, cp = 0, ne = 0, nm = 0, np = 0; + Boolean has_counts = FALSE; size_t len; CharPtr method = NULL, prefix = NULL; - Int2 ne = 0, nm = 0, np = 0; ObjectIdPtr oip; CharPtr str = NULL; CharPtr PNTR strp; @@ -1598,8 +1607,7 @@ static void 
GetStrFormRNAEvidence ( if (oip == NULL || ufp->data.ptrvalue == NULL) continue; if (StringCmp (oip->str, "Method") == 0) { method = StringSaveNoNull ((CharPtr) ufp->data.ptrvalue); - } - if (StringCmp (oip->str, "mRNA") == 0) { + } else if (StringCmp (oip->str, "mRNA") == 0) { for (u = (UserFieldPtr) ufp->data.ptrvalue; u != NULL; u = u->next) { if (u->data.ptrvalue == NULL) continue; for (uu = (UserFieldPtr) u->data.ptrvalue; uu != NULL; uu = uu->next) { @@ -1632,9 +1640,30 @@ static void GetStrFormRNAEvidence ( } } } + } else if (StringCmp (oip->str, "Counts") == 0) { + has_counts = TRUE; + for (u = (UserFieldPtr) ufp->data.ptrvalue; u != NULL; u = u->next) { + if (u->data.ptrvalue == NULL) continue; + if (u->choice != 2) continue; + oip = u->label; + if (oip == NULL) continue; + if (StringCmp (oip->str, "mRNA") == 0) { + cm = (Int2) u->data.intvalue; + } else if (StringCmp (oip->str, "EST") == 0) { + ce = (Int2) u->data.intvalue; + } else if (StringCmp (oip->str, "Protein") == 0) { + cp = (Int2) u->data.intvalue; + } + } } } + if (has_counts) { + nm = cm; + ne = ce; + np = cp; + } + len = StringLen (mrnaevtext1) + StringLen (mrnaevtext2) + StringLen (mrnaevtext3) + StringLen (method) + 80; str = (CharPtr) MemNew (len); if (str == NULL) return; @@ -2330,6 +2359,24 @@ static FloatHi MolWtForProtFeat ( return MolWtForLoc (sfp->location); } +static void ChangeOToX (CharPtr str) + +{ + Char ch; + + if (str == NULL) return; + ch = *str; + while (ch != '\0') { + if (ch == 'O') { + *str = 'X'; + } else if (ch == 'o') { + *str = 'x'; + } + str++; + ch = *str; + } +} + static void FormatFeatureBlockQuals ( StringItemPtr ffstring, IntAsn2gbJobPtr ajp, @@ -3134,6 +3181,11 @@ static void FormatFeatureBlockQuals ( if (str != NULL) { residue = cbaa.value.intvalue; ptr = Get3LetterSymbol (ajp, seqcode, sctp, residue); + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + if (StringICmp (ptr, "Pyl") == 0 || StringICmp (ptr, "Xle") == 0) { + ptr = "OTHER"; + } + } if 
(ptr == NULL) { ptr = "OTHER"; } @@ -3200,6 +3252,21 @@ static void FormatFeatureBlockQuals ( } break; + case Qual_class_trna_codons : + trna = qvp [idx].trp; + if (trna) { + numcodons = ComposeCodonsRecognizedString (trna, numbuf, sizeof (numbuf)); + if (numcodons < 1 || StringHasNoText (numbuf)) { + } else { + FFAddTextToString(ffstring, "/", "codon_recognized", "=\"", + FALSE, TRUE, TILDE_IGNORE); + FFAddOneString(ffstring, numbuf, FALSE, TRUE, TILDE_TO_SPACES); + FFAddOneChar(ffstring, '\"', FALSE); + FFAddOneChar(ffstring, '\n', FALSE); + } + } + break; + case Qual_class_codon : gbq = qvp [idx].gbq; if (gbq == NULL || (ajp->flags.dropIllegalQuals && (! AllowedValQual (featdeftype, idx)))) break; @@ -3483,6 +3550,9 @@ static void FormatFeatureBlockQuals ( } } if (! StringHasNoText (str)) { + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + ChangeOToX (str); + } FFAddTextToString(ffstring, "/translation=\"", str, "\"", FALSE, TRUE, TILDE_TO_SPACES); FFAddOneChar(ffstring, '\n', FALSE); @@ -3509,6 +3579,9 @@ static void FormatFeatureBlockQuals ( */ SeqPortStreamLoc (sfp->product, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL, (Pointer) &protein_seq, SaveGBSeqSequence); if (! 
StringHasNoText (str)) { + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + ChangeOToX (str); + } FFAddTextToString(ffstring, "/translation=\"", str, "\"", FALSE, TRUE, TILDE_TO_SPACES); FFAddOneChar(ffstring, '\n', FALSE); @@ -4387,10 +4460,12 @@ static CharPtr validRefSeqExceptionString [] = { "nonconsensus splice site", "modified codon recognition", "alternative start codon", + "dicistronic gene", "unclassified transcription discrepancy", "unclassified translation discrepancy", "mismatches in transcription", "mismatches in translation", + "adjusted for low-quality genome", NULL }; @@ -4591,6 +4666,51 @@ static void ParseInference ( *bad_inferenceP = bad; } +typedef struct geneprot { + SeqFeatPtr gene; + SeqFeatPtr cds; + Boolean failed; +} GeneProtData, PNTR GeneProtPtr; + +static void CheckGeneOnIsolatedProtein ( + SeqFeatPtr sfp, + Pointer userdata +) + +{ + GeneProtPtr gpp; + + if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) return; + gpp = (GeneProtPtr) userdata; + if (gpp == NULL) return; + + if (SeqLocAinB (gpp->cds->location, sfp->location) < 0) return; + if (gpp->gene != NULL) { + gpp->failed = TRUE; + } else { + gpp->gene = sfp; + } +} + +static SeqFeatPtr FindGeneOnIsolatedProtein ( + SeqEntryPtr sep, + SeqFeatPtr cds +) + +{ + GeneProtData gpd; + + if (sep == NULL || cds == NULL) return NULL; + + MemSet ((Pointer) &gpd, 0, sizeof (GeneProtData)); + gpd.cds = cds; + VisitFeaturesInSep (sep, (Pointer) &gpd, CheckGeneOnIsolatedProtein); + + if (gpd.failed) return NULL; + + return gpd.gene; +} + static SeqFeatPtr GetOverlappingGeneInEntity ( Uint2 entityID, SeqMgrFeatContextPtr fcontext, @@ -4645,7 +4765,11 @@ static SeqFeatPtr GetOverlappingGeneInEntity ( } } } else { - gene = SeqMgrGetOverlappingGene (locforgene, gcontext); + if (fcontext->bad_order || fcontext->mixed_strand) { + gene = SeqMgrGetOverlappingFeature (locforgene, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, gcontext); + } else { + gene = SeqMgrGetOverlappingGene 
(locforgene, gcontext); + } } SeqEntrySetScope (oldscope); return gene; @@ -4668,6 +4792,7 @@ static CharPtr FormatFeatureBlockEx ( { Uint1 aa; AnnotDescrPtr adp; + Boolean annotDescCommentToComment; ValNodePtr bad_inference = NULL; Int2 bondidx; BioseqPtr bsp_for_old_locus_tag = NULL; @@ -4718,6 +4843,7 @@ static CharPtr FormatFeatureBlockEx ( Boolean noLeft; Boolean noRight; SeqMgrFeatContext ocontext; + ObjectIdPtr oip; SeqEntryPtr oldscope; SeqFeatPtr operon = NULL; Uint2 partial; @@ -4732,6 +4858,7 @@ static CharPtr FormatFeatureBlockEx ( ProtRefPtr prpxref; Boolean pseudo = FALSE; CharPtr ptr; + Uint2 pEID; Int2 qualclass; Uint1 residue; Boolean riboSlippage = FALSE; @@ -4742,7 +4869,7 @@ static CharPtr FormatFeatureBlockEx ( SeqDescrPtr sdp; SeqEntryPtr sep; Uint1 seqcode; - Uint1 shift; + Uint1 seqfeattype; SeqIdPtr sip; Int2 siteidx; SeqMapTablePtr smtp; @@ -4752,7 +4879,9 @@ static CharPtr FormatFeatureBlockEx ( CharPtr tmp; Boolean transSplice = FALSE; tRNAPtr trna; + UserFieldPtr ufp; BioseqPtr unlockme = NULL; + UserObjectPtr uop; ValNodePtr vnp; StringItemPtr ffstring; @@ -4884,12 +5013,21 @@ static CharPtr FormatFeatureBlockEx ( is_other = TRUE; } - featdeftype = fcontext->featdeftype; + featdeftype = fcontext->featdeftype; + if (featdeftype < FEATDEF_GENE || featdeftype >= FEATDEF_MAX) { featdeftype = FEATDEF_BAD; } + if (featdeftype == 0) { + featdeftype = sfp->idx.subtype; + } key = FindKeyFromFeatDefType (featdeftype, TRUE); + seqfeattype = fcontext->seqfeattype; + if (seqfeattype == 0) { + seqfeattype = sfp->data.choice; + } + if (format == GENPEPT_FMT && isProt) { if (featdeftype == FEATDEF_REGION) { key = "Region"; @@ -4916,7 +5054,7 @@ static CharPtr FormatFeatureBlockEx ( /* deal with unmappable impfeats */ - if (featdeftype == FEATDEF_BAD && fcontext->seqfeattype == SEQFEAT_IMP) { + if (featdeftype == FEATDEF_BAD && seqfeattype == SEQFEAT_IMP) { imp = (ImpFeatPtr) sfp->data.value.ptrvalue; if (imp != NULL) { key = imp->key; @@ -4926,7 
+5064,7 @@ static CharPtr FormatFeatureBlockEx ( FFStartPrint(ffstring, format, 5, 21, NULL, 0, 5, 21, "FT", /* ifp->firstfeat */ FALSE); if (ajp->ajp.slp != NULL) { FFAddOneString(ffstring, key, FALSE, FALSE, TILDE_IGNORE); - } else if ( GetWWW(ajp) /* && SeqMgrGetParentOfPart (bsp, NULL) == NULL */ ) { + } else if ( GetWWW(ajp) && StringICmp (key, "gap") != 0 /* && SeqMgrGetParentOfPart (bsp, NULL) == NULL */ ) { FF_asn2gb_www_featkey (ffstring, key, sfp->location, fcontext->left + 1, fcontext->right + 1, fcontext->strand, itemID); } else { FFAddOneString(ffstring, key, FALSE, FALSE, TILDE_IGNORE); @@ -5038,7 +5176,7 @@ static CharPtr FormatFeatureBlockEx ( pseudo = TRUE; } - if (fcontext->seqfeattype == SEQFEAT_GENE) { + if (seqfeattype == SEQFEAT_GENE) { grp = (GeneRefPtr) sfp->data.value.ptrvalue; if (grp != NULL) { if (! StringHasNoText (grp->locus)) { @@ -5079,7 +5217,7 @@ static CharPtr FormatFeatureBlockEx ( } } - } else if (fcontext->featdeftype != FEATDEF_operon && fcontext->featdeftype != FEATDEF_gap) { + } else if (featdeftype != FEATDEF_operon && featdeftype != FEATDEF_gap) { grp = SeqMgrGetGeneXref (sfp); if (grp != NULL) { @@ -5091,11 +5229,23 @@ static CharPtr FormatFeatureBlockEx ( gene_for_old_locus_tag = SeqMgrGetFeatureByLabel (bsp_for_old_locus_tag, grp->locus_tag, SEQFEAT_GENE, 0, &gcontext); } } - if (grp == NULL && fcontext->featdeftype != FEATDEF_primer_bind) { + if (grp == NULL && featdeftype != FEATDEF_primer_bind) { gene = GetOverlappingGeneInEntity (ajp->ajp.entityID, fcontext, &gcontext, locforgene); if (gene == NULL && ajp->ajp.entityID != sfp->idx.entityID) { gene = GetOverlappingGeneInEntity (sfp->idx.entityID, fcontext, &gcontext, locforgene); } + + /* special case to get gene by overlap for coded_by cds on isolated protein bioseq */ + if (ifp->mapToProt && seqfeattype == SEQFEAT_CDREGION) { + sep = GetTopSeqEntryForEntityID (ajp->ajp.entityID); + if (sep != NULL && IS_Bioseq (sep)) { + bsp = (BioseqPtr) sep->data.ptrvalue; + if 
(bsp != NULL && ISA_aa (bsp->mol)) { + gene = FindGeneOnIsolatedProtein (sep, sfp); + } + } + } + gene_for_old_locus_tag = gene; if (gene != NULL) { qvp [FTQUAL_gene_note].str = gene->comment; @@ -5114,7 +5264,7 @@ static CharPtr FormatFeatureBlockEx ( pseudo = TRUE; } if (grp != NULL && (! SeqMgrGeneIsSuppressed (grp)) && - (fcontext->featdeftype != FEATDEF_repeat_region || gene == NULL)) { + (featdeftype != FEATDEF_repeat_region || gene == NULL)) { if (! StringHasNoText (grp->locus)) { qvp [FTQUAL_gene].str = grp->locus; qvp [FTQUAL_locus_tag].str = grp->locus_tag; @@ -5133,25 +5283,25 @@ static CharPtr FormatFeatureBlockEx ( } } if (grp != NULL && - fcontext->featdeftype != FEATDEF_variation && - fcontext->featdeftype != FEATDEF_repeat_region) { + featdeftype != FEATDEF_variation && + featdeftype != FEATDEF_repeat_region) { qvp [FTQUAL_gene_allele].str = grp->allele; /* now propagating /allele */ } - if (gene_for_old_locus_tag != NULL && fcontext->featdeftype != FEATDEF_repeat_region) { + if (gene_for_old_locus_tag != NULL && featdeftype != FEATDEF_repeat_region) { /* now propagate old_locus_tag to almost any underlying feature */ for (gbq = gene_for_old_locus_tag->qual; gbq != NULL; gbq = gbq->next) { if (StringHasNoText (gbq->val)) continue; idx = GbqualToFeaturIndex (gbq->qual); if (idx == FTQUAL_old_locus_tag) { qvp [FTQUAL_old_locus_tag].gbq = gbq; + break; /* record first old_locus_tag gbqual to display all */ } } } - if (fcontext->seqfeattype != SEQFEAT_CDREGION && - fcontext->seqfeattype != SEQFEAT_RNA) { + if (seqfeattype != SEQFEAT_CDREGION && seqfeattype != SEQFEAT_RNA) { qvp [FTQUAL_gene_xref].vnp = NULL; } - if (fcontext->featdeftype != FEATDEF_operon) { + if (featdeftype != FEATDEF_operon) { grp = SeqMgrGetGeneXref (sfp); if (grp == NULL || (! 
SeqMgrGeneIsSuppressed (grp))) { operon = SeqMgrGetOverlappingOperon (locforgene, &ocontext); @@ -5167,7 +5317,7 @@ static CharPtr FormatFeatureBlockEx ( /* specific fields set here */ - switch (fcontext->seqfeattype) { + switch (seqfeattype) { case SEQFEAT_CDREGION : if (! ifp->mapToProt) { crp = (CdRegionPtr) sfp->data.value.ptrvalue; @@ -5210,6 +5360,12 @@ static CharPtr FormatFeatureBlockEx ( } else { qvp [FTQUAL_selenocysteine].ble = TRUE; } + } else if (residue == 'O') { + if (ajp->flags.pyrrolysineToNote) { + qvp [FTQUAL_pyrrolysine_note].str = "pyrrolysine"; + } else { + qvp [FTQUAL_pyrrolysine].ble = TRUE; + } } } } @@ -5293,6 +5449,12 @@ static CharPtr FormatFeatureBlockEx ( } } } + pEID = ObjMgrGetEntityIDForPointer (prod); + if (pEID != 0 && pEID != ajp->ajp.entityID && + SeqMgrFeaturesAreIndexed (pEID) == 0) { + /* index far record so SeqMgrGetBestProteinFeature can work */ + SeqMgrIndexFeatures (pEID, NULL); + } prot = SeqMgrGetBestProteinFeature (prod, &pcontext); if (prot != NULL) { prp = (ProtRefPtr) prot->data.value.ptrvalue; @@ -5395,6 +5557,12 @@ static CharPtr FormatFeatureBlockEx ( } else { qvp [FTQUAL_selenocysteine].ble = TRUE; } + } else if (residue == 'O') { + if (ajp->flags.pyrrolysineToNote) { + qvp [FTQUAL_pyrrolysine_note].str = "pyrrolysine"; + } else { + qvp [FTQUAL_pyrrolysine].ble = TRUE; + } } } } @@ -5567,6 +5735,12 @@ static CharPtr FormatFeatureBlockEx ( } } if (aa > 0 && aa != 255) { + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + if (aa == 79 || aa == 74) { /* O or J quarantined */ + aa = 88; /* X */ + } + } + /* - no gaps now that O and J are added if (aa <= 74) { shift = 0; } else if (aa > 79) { @@ -5574,12 +5748,13 @@ static CharPtr FormatFeatureBlockEx ( } else { shift = 1; } + */ if (aa != '*') { - idx = aa - (64 + shift); + idx = aa - (64 /* + shift */); } else { idx = 25; } - if (idx > 0 && idx < 26) { + if (idx > 0 && idx < 28) { str = trnaList [idx]; qvp [FTQUAL_product].str = str; if (StringNICmp 
(str, "tRNA-", 5) == 0) { @@ -5588,7 +5763,11 @@ static CharPtr FormatFeatureBlockEx ( } } qvp [FTQUAL_anticodon].slp = trna->anticodon; - qvp [FTQUAL_trna_codons].trp = trna; + if (ajp->flags.codonRecognizedToNote) { + qvp [FTQUAL_trna_codons_note].trp = trna; + } else { + qvp [FTQUAL_trna_codons].trp = trna; + } } } } else { @@ -5673,7 +5852,7 @@ static CharPtr FormatFeatureBlockEx ( qvp [FTQUAL_go_function].ufp = NULL; } - if (fcontext->featdeftype == FEATDEF_repeat_region) { + if (featdeftype == FEATDEF_repeat_region) { pseudo = FALSE; } @@ -5683,19 +5862,39 @@ static CharPtr FormatFeatureBlockEx ( sap = fcontext->sap; if (sap != NULL) { + annotDescCommentToComment = FALSE; for (adp = sap->desc; adp != NULL; adp = adp->next) { if (adp->choice == Annot_descr_comment) { if (StringDoesHaveText ((CharPtr) adp->data.ptrvalue)) { qvp [FTQUAL_seqannot_note].str = (CharPtr) adp->data.ptrvalue; } + } else if (adp->choice == Annot_descr_user) { + uop = (UserObjectPtr) adp->data.ptrvalue; + if (uop == NULL) continue; + oip = uop->type; + if (oip == NULL) continue; + if (StringCmp (oip->str, "AnnotDescCommentPolicy") == 0) { + for (ufp = uop->data; ufp != NULL; ufp = ufp->next) { + oip = ufp->label; + if (oip == NULL || ufp->data.ptrvalue == NULL) continue; + if (StringCmp (oip->str, "Policy") == 0) { + if (StringICmp ((CharPtr) ufp->data.ptrvalue, "ShowInComment") == 0) { + annotDescCommentToComment = TRUE; + } + } + } + } } } + if (annotDescCommentToComment) { + qvp [FTQUAL_seqannot_note].str = NULL; + } } /* if RELEASE_MODE, check list of features that can have /pseudo */ if (ajp->flags.dropIllegalQuals && pseudo && - (fcontext->seqfeattype == SEQFEAT_RNA || fcontext->seqfeattype == SEQFEAT_IMP) ) { + (seqfeattype == SEQFEAT_RNA || seqfeattype == SEQFEAT_IMP) ) { switch (featdeftype) { case FEATDEF_allele: @@ -5993,6 +6192,12 @@ static CharPtr FormatFeatureBlockEx ( qvp [FTQUAL_selenocysteine_note].str = NULL; } + /* suppress pyrrolysine note if already in comment */ + 
+ if (StringStr (sfp->comment, "pyrrolysine") != NULL) { + qvp [FTQUAL_pyrrolysine_note].str = NULL; + } + /* if /allele inherited from gene, suppress allele gbqual on feature */ if (qvp [FTQUAL_gene_allele].str != NULL) { diff --git a/api/asn2gnb5.c b/api/asn2gnb5.c index 8ec9b4b7..7abce569 100644 --- a/api/asn2gnb5.c +++ b/api/asn2gnb5.c @@ -30,7 +30,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 1.48 $ +* $Revision: 1.54 $ * * File Description: New GenBank flatfile generator - work in progress * @@ -74,6 +74,9 @@ NLM_EXTERN Char link_featc [MAX_WWWBUF]; NLM_EXTERN Char link_seq [MAX_WWWBUF]; #define DEF_LINK_SEQ "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?" +NLM_EXTERN Char link_projid [MAX_WWWBUF]; +#define DEF_LINK_PROJID "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" + NLM_EXTERN Char link_wgs [MAX_WWWBUF]; #define DEF_LINK_WGS "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?" @@ -204,7 +207,7 @@ static Char link_gabi [MAX_WWWBUF]; #define DEF_LINK_GABI "https://gabi.rzpd.de/cgi-bin-protected/GreenCards.pl.cgi?Mode=ShowBioObject&BioObjectName=" static Char link_fantom [MAX_WWWBUF]; -#define DEF_LINK_FANTOM "http://fantom.gsc.riken.go.jp/db/view/main.cgi?masterid=" +#define DEF_LINK_FANTOM "http://fantom.gsc.riken.jp/db/annotate/main.cgi?masterid=" static Char link_interpro [MAX_WWWBUF]; #define DEF_LINK_INTERPRO "http://www.ebi.ac.uk/interpro/ISearch?mode=ipr&query=" @@ -213,7 +216,7 @@ static Char link_genedb [MAX_WWWBUF]; #define DEF_LINK_GENEDB "http://www.genedb.org/genedb/Dispatcher?formType=navBar&submit=Search+for&organism=All%3Apombe%3Acerevisiae%3Adicty%3Aasp%3Atryp%3Aleish%3Amalaria%3Astyphi%3Aglossina&desc=yes&ohmr=%2F&name=" static Char link_geneid [MAX_WWWBUF]; -#define DEF_LINK_GENEID "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=retrieve&dopt=graphics&list_uids=" +#define DEF_LINK_GENEID 
"http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=full_report&list_uids=" static Char link_zfin [MAX_WWWBUF]; #define DEF_LINK_ZFIN "http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=" @@ -296,6 +299,7 @@ NLM_EXTERN void InitWWW (IntAsn2gbJobPtr ajp) GetAppParam ("NCBI", "WWWENTREZ", "LINK_FEAT", DEF_LINK_FEAT, link_feat, MAX_WWWBUF); GetAppParam ("NCBI", "WWWENTREZ", "LINK_FEATC", DEF_LINK_FEATC, link_featc, MAX_WWWBUF); GetAppParam ("NCBI", "WWWENTREZ", "LINK_SEQ", DEF_LINK_SEQ, link_seq, MAX_WWWBUF); + GetAppParam ("NCBI", "WWWENTREZ", "LINK_PROJID", DEF_LINK_PROJID, link_projid, MAX_WWWBUF); GetAppParam ("NCBI", "WWWENTREZ", "LINK_WGS", DEF_LINK_WGS, link_wgs, MAX_WWWBUF); GetAppParam ("NCBI", "WWWENTREZ", "LINK_OMIM", DEF_LINK_OMIM, link_omim, MAX_WWWBUF); GetAppParam ("NCBI", "WWWENTREZ", "LINK_REF", DEF_LINK_REF, ref_link, MAX_WWWBUF); @@ -2504,18 +2508,21 @@ static CharPtr FormatCitArt ( } static CharPtr FormatCitPat ( - FmtType format, + FmtType format, + ModType mode, CitPatPtr cpp, - SeqIdPtr seqidp, + SeqIdPtr seqidp, IntAsn2gbJobPtr ajp ) { AffilPtr afp; AuthListPtr alp; + IdPatPtr cit; CharPtr consortium = NULL; Char date [40]; ValNodePtr head = NULL; + Boolean is_us_pre_grant = FALSE; CharPtr prefix = NULL; CharPtr rsult = NULL; SeqIdPtr sip; @@ -2527,9 +2534,30 @@ static CharPtr FormatCitPat ( if (cpp == NULL) return NULL; + if (StringHasNoText (cpp->number) && + StringDoesHaveText (cpp->app_number) && + StringCmp (cpp->country, "US") == 0 && + mode != RELEASE_MODE) { + for (sip = seqidp; sip != NULL; sip = sip->next) { + if (sip->choice != SEQID_PATENT) continue; + psip = (PatentSeqIdPtr) sip->data.ptrvalue; + if (psip == NULL) continue; + cit = psip->cit; + if (cit == NULL) continue; + if (StringDoesHaveText (cit->app_number)) { + is_us_pre_grant = TRUE; + } + } + } + if (format == GENBANK_FMT || format == GENPEPT_FMT) { - ValNodeCopyStr (&head, 0, "Patent: "); - suffix = " "; + if (is_us_pre_grant) { + 
ValNodeCopyStr (&head, 0, "Pre-Grant Patent: "); + suffix = " "; + } else { + ValNodeCopyStr (&head, 0, "Patent: "); + suffix = " "; + } } else if (format == EMBL_FMT || format == EMBLPEPT_FMT) { ValNodeCopyStr (&head, 0, "Patent number "); } @@ -2550,7 +2578,11 @@ static CharPtr FormatCitPat ( ValNodeCopyStr (&head, 0, cpp->number); } } else if (! StringHasNoText (cpp->app_number)) { - AddValNodeString (&head, "(", cpp->app_number, ")"); + if (is_us_pre_grant) { + AddValNodeString (&head, NULL, cpp->app_number, NULL); + } else { + AddValNodeString (&head, "(", cpp->app_number, ")"); + } } if (! StringHasNoText (cpp->doc_type)) { @@ -2922,12 +2954,13 @@ static CharPtr FormatCitSub ( static CharPtr GetPubJournal ( FmtType format, + ModType mode, Boolean dropBadCitGens, Boolean noAffilOnUnpub, Boolean citArtIsoJta, PubdescPtr pdp, CitSubPtr csp, - SeqIdPtr seqidp, + SeqIdPtr seqidp, IndxPtr index, IntAsn2gbJobPtr ajp ) @@ -2996,7 +3029,7 @@ static CharPtr GetPubJournal ( case PUB_Patent : cpp = (CitPatPtr) vnp->data.ptrvalue; if (cpp != NULL) { - journal = FormatCitPat (format, cpp, seqidp, ajp); + journal = FormatCitPat (format, mode, cpp, seqidp, ajp); } break; default : @@ -3730,9 +3763,9 @@ NLM_EXTERN CharPtr FormatReferenceBlock ( citArtIsoJta = FALSE; } - str = GetPubJournal (afp->format, ajp->flags.dropBadCitGens, - ajp->flags.noAffilOnUnpub, citArtIsoJta, - pdp, csp, bsp->id, index, ajp); + str = GetPubJournal (afp->format, ajp->mode, ajp->flags.dropBadCitGens, + ajp->flags.noAffilOnUnpub, citArtIsoJta, pdp, csp, + bsp->id, index, ajp); if (str == NULL) { str = StringSave ("Unpublished"); } @@ -3815,7 +3848,6 @@ NLM_EXTERN CharPtr FormatReferenceBlock ( if (gbseq != NULL) { if (gbref != NULL) { - gbref->medline = muid; gbref->pubmed = pmid; } } diff --git a/api/asn2gnb6.c b/api/asn2gnb6.c index 773e03f0..698d7587 100644 --- a/api/asn2gnb6.c +++ b/api/asn2gnb6.c @@ -30,7 +30,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 1.63 $ +* $Revision: 1.69 $ * 
* File Description: New GenBank flatfile generator - work in progress * @@ -990,6 +990,34 @@ static CharPtr FindUrlEnding(CharPtr str) { return ptr; } +static Boolean CommentHasSuspiciousHtml ( + IntAsn2gbJobPtr ajp, + CharPtr searchString +) + +{ + Char ch; + CharPtr ptr; + Int2 state; + ValNodePtr matches; + + if (StringHasNoText (searchString)) return FALSE; + + state = 0; + ptr = searchString; + ch = *ptr; + + while (ch != '\0') { + matches = NULL; + state = TextFsaNext (ajp->bad_html_fsa, state, ch, &matches); + if (matches != NULL) return TRUE; + ptr++; + ch = *ptr; + } + + return FALSE; +} + NLM_EXTERN void AddCommentWithURLlinks ( IntAsn2gbJobPtr ajp, StringItemPtr ffstring, @@ -1002,6 +1030,17 @@ NLM_EXTERN void AddCommentWithURLlinks ( Char ch; CharPtr ptr; + if (GetWWW (ajp) && CommentHasSuspiciousHtml (ajp, str)) { + if (prefix != NULL) { + FFAddOneString(ffstring, prefix, FALSE, FALSE, TILDE_IGNORE); + } + AddCommentStringWithTildes (ffstring, str); + if (suffix != NULL) { + FFAddOneString(ffstring, suffix, FALSE, FALSE, TILDE_IGNORE); + } + return; + } + while (! 
StringHasNoText (str)) { ptr = StringStr (str, "http://"); if (ptr == NULL) { @@ -4111,7 +4150,11 @@ static Int2 ProcessGapSpecialFormat ( FixGapAtEnd (buf, ' '); ajp->seqGapCurrLen += endgap; } else if (endgap > 0) { + /* FixGapAtEnd (buf, pad); + */ + FixGapAtEnd (buf, ' '); + ajp->seqGapCurrLen += endgap; } FixRemainingGaps (buf, pad); @@ -4119,6 +4162,24 @@ static Int2 ProcessGapSpecialFormat ( return startgapgap; } +static void ChangeoTox (CharPtr str) + +{ + Char ch; + + if (str == NULL) return; + ch = *str; + while (ch != '\0') { + if (ch == 'O') { + *str = 'X'; + } else if (ch == 'o') { + *str = 'x'; + } + str++; + ch = *str; + } +} + NLM_EXTERN CharPtr FormatSequenceBlock ( Asn2gbFormatPtr afp, BaseBlockPtr bbp @@ -4181,6 +4242,11 @@ NLM_EXTERN CharPtr FormatSequenceBlock ( } else { SeqPortStream (bsp, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL, (Pointer) &tmp, SaveGBSeqSequence); } + if (ISA_aa (bsp->mol) && StringDoesHaveText (str)) { + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + ChangeoTox (str); + } + } gbseq->sequence = StringSave (str); tmp = gbseq->sequence; @@ -4239,6 +4305,11 @@ NLM_EXTERN CharPtr FormatSequenceBlock ( } else { SeqPortStreamInt (bsp, start, extend - 1, Seq_strand_plus, flags, (Pointer) str, NULL); } + if (ISA_aa (bsp->mol) && StringDoesHaveText (str)) { + if (ajp->mode == RELEASE_MODE || ajp->mode == ENTREZ_MODE) { + ChangeoTox (str); + } + } sbp->bases = str; } } @@ -4306,6 +4377,7 @@ NLM_EXTERN CharPtr FormatSequenceBlock ( return str; } +/* static CharPtr insd_strd [4] = { NULL, "single", "double", "mixed" }; @@ -4317,6 +4389,7 @@ static CharPtr insd_mol [10] = { static CharPtr insd_top [3] = { NULL, "linear", "circular" }; +*/ NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip)); @@ -4326,14 +4399,16 @@ NLM_EXTERN CharPtr FormatSlashBlock ( ) { - IntAsn2gbJobPtr ajp; - Asn2gbSectPtr asp; - GBFeaturePtr currf, headf, nextf; - GBReferencePtr currr, headr, nextr; - GBSeqPtr gbseq, gbtmp; - IndxPtr index; - 
INSDSeq is; - Int2 moltype, strandedness, topology; + IntAsn2gbJobPtr ajp; + Asn2gbSectPtr asp; + GBFeaturePtr currf, headf, nextf; + GBReferencePtr currr, headr, nextr; + GBSeqPtr gbseq, gbtmp; + IndxPtr index; + INSDSeq is; + /* + Int2 moltype, strandedness, topology; + */ if (afp == NULL || bbp == NULL) return NULL; ajp = afp->ajp; @@ -4405,21 +4480,26 @@ NLM_EXTERN CharPtr FormatSlashBlock ( is.OBbits__ = gbseq->OBbits__; is.locus = gbseq->locus; is.length = gbseq->length; + is.strandedness = gbseq->strandedness; + is.moltype = gbseq->moltype; + is.topology = gbseq->topology; + /* strandedness = (Int2) gbseq->strandedness; if (strandedness < 0 || strandedness > 3) { strandedness = 0; } - is.strandedness = insd_strd [strandedness]; + is.strandedness = StringSave (insd_strd [strandedness]); moltype = (Int2) gbseq->moltype; if (moltype < 0 || moltype > 9) { moltype = 0; } - is.moltype = insd_mol [moltype]; + is.moltype = StringSave (insd_mol [moltype]); topology = (Int2) gbseq->topology; if (topology < 0 || topology > 2) { topology = 0; } - is.topology = insd_top [topology]; + is.topology = StringSave (insd_top [topology]); + */ is.division = gbseq->division; is.update_date = gbseq->update_date; is.create_date = gbseq->create_date; diff --git a/api/asn2gnbi.h b/api/asn2gnbi.h index 72d69a81..1c4e4ebd 100644 --- a/api/asn2gnbi.h +++ b/api/asn2gnbi.h @@ -29,7 +29,7 @@ * * Version Creation Date: 12/30/03 * -* $Revision: 1.55 $ +* $Revision: 1.61 $ * * File Description: New GenBank flatfile generator, internal header * @@ -95,7 +95,9 @@ typedef struct asn2gbflags { Boolean geneSynsToNote; Boolean refSeqQualsToNote; Boolean selenocysteineToNote; + Boolean pyrrolysineToNote; Boolean extraProductsToNote; + Boolean codonRecognizedToNote; Boolean forGbRelease; } Asn2gbFlags, PNTR Asn2gbFlagsPtr; @@ -153,6 +155,7 @@ typedef struct int_asn2gb_job { Int4 seqGapCurrLen; ValNodePtr gihead; ValNodePtr gitail; + TextFsaPtr bad_html_fsa; } IntAsn2gbJob, PNTR IntAsn2gbJobPtr; /* 
array for assigning biosource and feature data fields to qualifiers */ @@ -262,6 +265,7 @@ typedef struct asn2gbwork { Boolean hideGeneFeats; Boolean newLocusLine; Boolean showBaseCount; + Boolean forcePrimaryBlock; Boolean hideImpFeats; Boolean hideRemImpFeats; @@ -300,6 +304,7 @@ typedef struct asn2gbwork { Boolean firstfeat; Boolean featseen; Boolean featjustseen; + Int4 localFeatCount; ValNodePtr wgsaccnlist; Boolean has_mat_peptide; @@ -640,6 +645,8 @@ typedef enum { FTQUAL_prot_names, FTQUAL_protein_id, FTQUAL_pseudo, + FTQUAL_pyrrolysine, + FTQUAL_pyrrolysine_note, FTQUAL_region, FTQUAL_region_name, FTQUAL_replace, @@ -668,6 +675,7 @@ typedef enum { FTQUAL_trans_splicing, FTQUAL_trna_aa, FTQUAL_trna_codons, + FTQUAL_trna_codons_note, FTQUAL_usedin, FTQUAL_xtra_prod_quals, ASN2GNBK_TOTAL_FEATUR @@ -678,6 +686,7 @@ typedef enum { NLM_EXTERN Char link_feat [MAX_WWWBUF]; NLM_EXTERN Char link_featc [MAX_WWWBUF]; NLM_EXTERN Char link_seq [MAX_WWWBUF]; +NLM_EXTERN Char link_projid [MAX_WWWBUF]; NLM_EXTERN Char link_wgs [MAX_WWWBUF]; NLM_EXTERN Char link_omim [MAX_WWWBUF]; NLM_EXTERN Char ref_link [MAX_WWWBUF]; diff --git a/api/asn2gnbk.h b/api/asn2gnbk.h index a49171d5..eb181c3c 100644 --- a/api/asn2gnbk.h +++ b/api/asn2gnbk.h @@ -29,7 +29,7 @@ * * Version Creation Date: 10/21/98 * -* $Revision: 6.68 $ +* $Revision: 6.69 $ * * File Description: New GenBank flatfile generator * @@ -118,6 +118,8 @@ typedef unsigned long FlgType; #define SPECIAL_GAP_DISPLAY 65536 +#define FORCE_PRIMARY_BLOCK 131072 + /* locking behavior for system performance */ typedef unsigned long LckType; diff --git a/api/edutil.c b/api/edutil.c index e11edefe..4a288a79 100644 --- a/api/edutil.c +++ b/api/edutil.c @@ -29,7 +29,7 @@ * * Version Creation Date: 2/4/94 * -* $Revision: 6.52 $ +* $Revision: 6.54 $ * * File Description: Sequence editing utilities * @@ -39,6 +39,14 @@ * ------- ---------- ----------------------------------------------------- * * $Log: edutil.c,v $ +* Revision 6.54 
2006/02/07 13:41:29 bollin +* added function AdjustFeatureForGapChange, which changes a feature to accommodate +* a change in the length of a gap +* +* Revision 6.53 2005/12/12 14:12:54 bollin +* BioseqCopyEx was not correctly handling copying the data contents of a +* delta sequence +* * Revision 6.52 2005/09/22 19:21:34 bollin * In the sequence editor, if the user inserts Ns into a gap of known length, * the gap length will be increased instead of creating two gaps on either side @@ -1829,6 +1837,29 @@ NLM_EXTERN Boolean LIBCALL SeqEntryDelFeat (SeqEntryPtr sep, SeqIdPtr sip, Int4 * *****************************************************************************/ +static DeltaSeqPtr CopyDeltaSeqPtrChain (DeltaSeqPtr dsp) +{ + DeltaSeqPtr new_chain = NULL; + SeqLocPtr slp_orig, slp_new; + SeqLitPtr slip_orig, slip_new; + + while (dsp != NULL) { + if (dsp->choice == 1) { + slp_orig = (SeqLocPtr) dsp->data.ptrvalue; + slp_new = AsnIoMemCopy (slp_orig, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite); + ValNodeAddPointer (&new_chain, 1, slp_new); + } + else if (dsp->choice ==2) + { + slip_orig = (SeqLitPtr) dsp->data.ptrvalue; + slip_new = AsnIoMemCopy(slip_orig, (AsnReadFunc) SeqLitAsnRead, (AsnWriteFunc) SeqLitAsnWrite); + ValNodeAddPointer (&new_chain, 2, slip_new); + } + dsp = dsp->next; + } + + return new_chain; +} /***************************************************************************** * @@ -1947,9 +1978,8 @@ NLM_EXTERN BioseqPtr LIBCALL BioseqCopyEx (SeqIdPtr newid, BioseqPtr oldbsp, Int else if (newbsp->repr == Seq_repr_delta) { dsp = (DeltaSeqPtr)(oldbsp->seq_ext); /* real data is here */ - the_segs = DeltaSeqsToSeqLocs(dsp); - head = SeqLocCopyPart (the_segs, from, to, strand, FALSE, NULL, NULL); - SeqLocFree (the_segs); + + head = CopyDeltaSeqPtrChain (dsp); } newbsp->seq_ext = (Pointer)head; @@ -6839,6 +6869,36 @@ NLM_EXTERN void SeqEdFeatureAdjust } +NLM_EXTERN void +AdjustFeatureForGapChange +(SeqFeatPtr sfp, + BioseqPtr bsp, + Int4 
offset, + Int4 len_diff) +{ + if (sfp == NULL || bsp == NULL || offset < 0 || len_diff == 0) + { + return; + } + + if (len_diff > 0) + { + SeqEdSeqFeatDelete (sfp, bsp, offset, offset + len_diff - 1, TRUE); + } + else + { + sfp->location = SeqEdSeqLocInsert (sfp->location, bsp, offset, -len_diff, FALSE, NULL); + if (sfp->data.choice == SEQFEAT_CDREGION) + { + SeqEdInsertAdjustCdRgn (sfp, bsp, offset, -len_diff, FALSE); + } + else if (sfp->data.choice == SEQFEAT_RNA) + { + SeqEdInsertAdjustRNA (sfp, bsp, offset, -len_diff, FALSE); + } + } +} + diff --git a/api/edutil.h b/api/edutil.h index 58158b83..8a7f0059 100644 --- a/api/edutil.h +++ b/api/edutil.h @@ -29,7 +29,7 @@ * * Version Creation Date: 2/2/94 * -* $Revision: 6.16 $ +* $Revision: 6.17 $ * * File Description: Sequence editing utilities * @@ -39,6 +39,10 @@ * ------- ---------- ----------------------------------------------------- * * $Log: edutil.h,v $ +* Revision 6.17 2006/02/07 13:41:29 bollin +* added function AdjustFeatureForGapChange, which changes a feature to accommodate +* a change in the length of a gap +* * Revision 6.16 2005/05/02 14:21:15 bollin * removed function prototypes for PlayJournal and UnplayJournal, since these * functions live in desktop/seqpanel.c and are only used there @@ -729,6 +733,12 @@ NLM_EXTERN void SeqEdReindexAffectedFeatures (Int4 shift_start, Int4 shift_amt, NLM_EXTERN void SeqEdReindexFeature (SeqFeatPtr sfp, BioseqPtr bsp); NLM_EXTERN Boolean SeqEdDeleteFromBsp (SeqEdJournalPtr sejp, BoolPtr pfeats_deleted); +NLM_EXTERN void +AdjustFeatureForGapChange +(SeqFeatPtr sfp, + BioseqPtr bsp, + Int4 offset, + Int4 len_diff); #ifdef __cplusplus } diff --git a/api/explore.h b/api/explore.h index beea3747..eee7aa9b 100644 --- a/api/explore.h +++ b/api/explore.h @@ -29,7 +29,7 @@ * * Version Creation Date: 6/30/98 * -* $Revision: 6.53 $ +* $Revision: 6.54 $ * * File Description: Reengineered and optimized exploration functions * to be used for future code @@ -122,6 +122,8 @@ 
typedef struct seqmgrfeatcontext { Boolean partialL; Boolean partialR; Boolean farloc; + Boolean bad_order; + Boolean mixed_strand; Uint1 strand; Uint1 seqfeattype; Uint1 featdeftype; diff --git a/api/findrepl.c b/api/findrepl.c index d555165d..e91104ea 100644 --- a/api/findrepl.c +++ b/api/findrepl.c @@ -44,6 +44,28 @@ * RCS Modification History: * ------------------------- * $Log: findrepl.c,v $ +* Revision 6.21 2006/01/17 17:50:01 bollin +* allow FindReplaceInEntity to search for a string made up of whitespace, as +* long as whole_word is not specified +* +* Revision 6.20 2006/01/10 18:13:56 kans +* FindReplAligns does not have case for SAS_DISC, since visit function recursively presents these components separately +* +* Revision 6.19 2006/01/09 21:15:03 bollin +* allow punctuation to terminate a word in find replace +* +* Revision 6.18 2006/01/04 21:26:57 kans +* FSA hit does not need code from validator unstructured source test, cleaned up variable names +* +* Revision 6.17 2006/01/04 20:39:41 kans +* added FindStringsInEntity using finite state machine, general cleanup of code +* +* Revision 6.16 2005/12/29 21:42:06 kans +* only call callback if text was found or replaced +* +* Revision 6.15 2005/12/29 20:54:41 kans +* FindReplaceInEntity takes callback and userdata +* * Revision 6.14 2005/09/21 14:39:09 bollin * fixed bug in FindReplace where if the whole-word flag was specified but * the substring was found in a not-whole-word context earlier in the string @@ -83,7 +105,9 @@ * using NUM_SEQID, added TPA ids to arrays * * Revision 6.2 2000/11/03 20:36:00 kans -* FindReplaceInEntity replaces FindInEntity and FindInEntityX - complete redesign, no longer using AsnExpOptExplore because of the difficulty of replacing with a larger string (TF + JK) +* FindReplaceInEntity replaces FindInEntity and FindInEntityX - complete redesign, +* no longer using AsnExpOptExplore because of the difficulty of replacing with a +* larger string (TF + JK) * * Revision 6.1 
1999/03/05 23:31:07 kans * FindInEntityX was not initializing flen, replen @@ -98,31 +122,31 @@ * added whole_word parameter to FindInEntity and FindInEntityX, and protected * against multiple ObjMgrAlsoSelects on a single itemID * - * Revision 5.1 1996/09/06 20:20:41 kans - * keeps going even if ObjMgrTypeFind returns NULL (e.g., on OBJ_BIOSEQ_SEG), - * and adds a case_counts parameter for case sensitive/insensitive searches. - * - * Revision 5.0 1996/05/28 13:23:23 ostell - * Set to revision 5.0 - * - * Revision 1.7 1996/02/28 04:53:06 ostell - * fix to prevernt recursion on substring replaces - * - * Revision 1.6 1996/02/26 20:24:05 kans - * replace needs MemCopy instead of StringMove (JO), and set dirty flag - * - * Revision 1.5 1996/01/03 23:06:32 ostell - * support for longer replaces, controlled updating - * - * Revision 1.3 1996/01/02 18:40:07 ostell - * simplified code. - * - * Revision 1.2 1996/01/01 00:05:14 kans - * replaced StringStr with StringISearch to ignore case - * - * Revision 1.1 1995/12/31 18:13:14 kans - * Initial revision - * +* Revision 5.1 1996/09/06 20:20:41 kans +* keeps going even if ObjMgrTypeFind returns NULL (e.g., on OBJ_BIOSEQ_SEG), +* and adds a case_counts parameter for case sensitive/insensitive searches. +* +* Revision 5.0 1996/05/28 13:23:23 ostell +* Set to revision 5.0 +* +* Revision 1.7 1996/02/28 04:53:06 ostell +* fix to prevernt recursion on substring replaces +* +* Revision 1.6 1996/02/26 20:24:05 kans +* replace needs MemCopy instead of StringMove (JO), and set dirty flag +* +* Revision 1.5 1996/01/03 23:06:32 ostell +* support for longer replaces, controlled updating +* +* Revision 1.3 1996/01/02 18:40:07 ostell +* simplified code. 
+* +* Revision 1.2 1996/01/01 00:05:14 kans +* replaced StringStr with StringISearch to ignore case +* +* Revision 1.1 1995/12/31 18:13:14 kans +* Initial revision +* * Revision 1.1.1.1 1995/10/19 18:42:10 sad * Initial version * @@ -137,47 +161,43 @@ #include <subutil.h> #include <findrepl.h> -/* internal structure passed to callbacks */ +/* callback type for search/replace functions */ -typedef struct findstruct { - Uint2 entityID; - CharPtr find_string; - CharPtr replace_string; - Boolean case_counts; - Boolean whole_word; - Boolean do_replace; - Boolean select_item; - Int2 send_update; - Boolean did_find; - Boolean did_replace; - Boolean dirty; - Boolean descFilter [SEQDESCR_MAX]; - Boolean featFilter [FEATDEF_MAX]; - Boolean seqidFilter [NUM_SEQID]; - int d [256]; - size_t subLen; -} FindStruct, PNTR FindStructPtr; +typedef void (*FindReplFunc) (CharPtr PNTR strp, Pointer fspdata); -#define PID_NOTSET 0 -#define PID_DBTAG 1 -#define PID_NAME 2 -#define PID_ML 3 -#define PID_STR 4 +/* internal data structure */ -#define NAMESTD_LAST 0 -#define NAMESTD_FIRST 1 -#define NAMESTD_MIDDLE 2 -#define NAMESTD_FULL 3 -#define NAMESTD_INITIALS 4 -#define NAMESTD_SUFFIX 5 -#define NAMESTD_TITLE 6 +typedef struct findstruct { + Uint2 entityID; + FindReplFunc func; + FindReplProc callback; + Pointer userdata; + + CharPtr find_string; + CharPtr replace_string; + Boolean case_counts; + Boolean whole_word; + Int4 findLen; + Int4 replaceLen; + + Boolean select_item; + Int2 send_update; + Boolean did_find; + Boolean did_replace; + Boolean dirty; + + Boolean descFilter [SEQDESCR_MAX]; + Boolean featFilter [FEATDEF_MAX]; + Boolean seqidFilter [NUM_SEQID]; + + int d [256]; + TextFsaPtr fsa; +} FindStruct, PNTR FindStructPtr; -#define AUTHLIST_STRUCTURED 1 -#define AUTHLIST_ML 2 -#define AUTHLIST_STRING 3 - #define FINDREPL_BUFFER_MAX 1000000 +/* BOYER-MOORE SEARCH FUNCTIONS */ + /* StringSearch and StringISearch use the Boyer-Moore algorithm, as described in Niklaus Wirth, 
Algorithms and Data Structures, Prentice- Hall, Inc., Englewood Cliffs, NJ., 1986, p. 69. The original had an error, where @@ -226,8 +246,8 @@ static CharPtr FindSubString ( return NULL; } -/* passed subLen and d array to avoid repeated initialization of the Boyer-Moore - displacement table */ +/* passed subLen and d array to avoid repeated initialization + of the Boyer-Moore displacement table */ static CharPtr SearchForString ( CharPtr str, @@ -251,86 +271,70 @@ static CharPtr SearchForString ( ptr = FindSubString (str, sub, case_counts, strLen, subLen, d); if (ptr == NULL) return NULL; - if (whole_word) { - while (keep_looking && ptr != NULL) - { - keep_looking = FALSE; - if (ptr > str) { - tmp = ptr - 1; - if (! IS_WHITESP (*tmp)) - { - keep_looking = TRUE; - } - } - if (!keep_looking) - { - tmp = ptr + StringLen (sub); - if (*tmp != '\0' && (! IS_WHITESP (*tmp))) - { - keep_looking = TRUE; - } + if (! whole_word) return ptr; + + while (keep_looking && ptr != NULL) { + keep_looking = FALSE; + if (ptr > str) { + tmp = ptr - 1; + if (! IS_WHITESP (*tmp)) { + keep_looking = TRUE; } - if (keep_looking) - { - ptr = FindSubString (ptr + subLen, sub, case_counts, strLen, subLen, d); + } + if (! keep_looking) { + tmp = ptr + StringLen (sub); + if (*tmp != '\0' && (! IS_WHITESP (*tmp)) && (! ispunct (*tmp))) { + keep_looking = TRUE; } } + if (keep_looking) { + ptr = FindSubString (ptr + subLen, sub, case_counts, strLen, subLen, d); + } } return ptr; } -/*=======================================================================*/ -/* */ -/* FindReplString () - Does a search and replace in a given string. */ -/* */ -/* Main Parameters: */ -/* */ -/* strp : The string to operate on. Passed as a pointer to */ -/* a string so that it can be replaced by the */ -/* resulting string. */ -/* */ -/* fsp->find_string : The substring that is being replaced */ -/* in strp. */ -/* */ -/* fsp->replace_string : The substring that is replacing */ -/* find_string in strp. 
*/ -/* */ -/*=======================================================================*/ - -static Boolean FindReplString ( +static void BoyerMooreFindString ( CharPtr PNTR strp, - FindStructPtr fsp + Pointer userdata ) { - Boolean wasChanged; - Int4 replaceLen; - Int4 findLen; - Int4 searchLen; - Int4 buffSize; - CharPtr workingBuffer; - CharPtr searchString; - CharPtr substringPtr; - - if (strp == NULL || fsp == NULL) return FALSE; + FindStructPtr fsp; + CharPtr searchString; - replaceLen = StringLen (fsp->replace_string); - findLen = StringLen (fsp->find_string); - searchLen = StringLen (*strp); + if (strp == NULL || userdata == NULL) return; + fsp = (FindStructPtr) userdata; searchString = *strp; - wasChanged = FALSE; + if (SearchForString (searchString, fsp->find_string, fsp->case_counts, + fsp->whole_word, fsp->findLen, fsp->d) != NULL) { + fsp->did_find = TRUE; + } +} - if (! fsp->do_replace) { - if (SearchForString (searchString, fsp->find_string, - fsp->case_counts, fsp->whole_word, - findLen, fsp->d) != NULL) { +static void BoyerMooreReplaceString ( + CharPtr PNTR strp, + Pointer userdata +) - fsp->did_find = TRUE; - } - return TRUE; - } +{ + Int4 buffSize; + FindStructPtr fsp; + Int4 searchLen; + CharPtr searchString; + CharPtr substringPtr; + Boolean wasChanged; + CharPtr workingBuffer; + + if (strp == NULL || userdata == NULL) return; + fsp = (FindStructPtr) userdata; + + searchString = *strp; + searchLen = StringLen (searchString); + + wasChanged = FALSE; /*------------------------------------------------*/ /* Make a guess of how big a working buffer we'll */ @@ -347,18 +351,17 @@ static Boolean FindReplString ( /* */ /*------------------------------------------------*/ - if (replaceLen > findLen) - { - buffSize = searchLen + ((searchLen/findLen) * (replaceLen - findLen)); - if (buffSize > FINDREPL_BUFFER_MAX) + if (fsp->replaceLen > fsp->findLen) { + buffSize = searchLen + ((searchLen/fsp->findLen) * (fsp->replaceLen - fsp->findLen)); + if (buffSize > 
FINDREPL_BUFFER_MAX) { buffSize = FINDREPL_BUFFER_MAX; - } - else + } + } else { buffSize = searchLen; + } workingBuffer = (CharPtr) MemNew (buffSize + 2); - if (workingBuffer == NULL) - return FALSE; + if (workingBuffer == NULL) return; workingBuffer[0] = '\0'; @@ -369,45 +372,104 @@ static Boolean FindReplString ( /*----------------------------------------*/ while ((substringPtr = SearchForString (searchString, fsp->find_string, - fsp->case_counts, fsp->whole_word, - findLen, fsp->d)) != NULL) - { - wasChanged = TRUE; - substringPtr[0] = '\0'; + fsp->case_counts, fsp->whole_word, fsp->findLen, fsp->d)) != NULL) { + wasChanged = TRUE; + substringPtr [0] = '\0'; - if (StringLen (workingBuffer) + StringLen (searchString) > buffSize) - return FALSE; + if (StringLen (workingBuffer) + StringLen (searchString) > buffSize) return; - StringCat (workingBuffer, searchString); - StringCat (workingBuffer, fsp->replace_string); - substringPtr[0] = 'x'; - searchString = substringPtr + findLen; - } + StringCat (workingBuffer, searchString); + StringCat (workingBuffer, fsp->replace_string); + substringPtr [0] = 'x'; + searchString = substringPtr + fsp->findLen; + } - if (searchString != NULL) + if (searchString != NULL) { StringCat (workingBuffer, searchString); + } /*-------------------------------------*/ /* If any replacements were made, then */ /* swap in the new string for the old. 
*/ /*-------------------------------------*/ - if (wasChanged) - { - MemFree (*strp); - (*strp) = workingBuffer; + if (wasChanged) { + MemFree (*strp); + (*strp) = workingBuffer; - fsp->did_replace = TRUE; - fsp->dirty = TRUE; - } - else + fsp->did_replace = TRUE; + fsp->dirty = TRUE; + } else { MemFree (workingBuffer); + } +} + +/* FINITE-STATE AUTOMATON SEARCH FUNCTION */ + +static void FSAFindStrings ( + CharPtr PNTR strp, + Pointer userdata +) + +{ + Char ch; + FindStructPtr fsp; + CharPtr ptr; + CharPtr searchString; + Int2 state; + ValNodePtr matches; + + if (strp == NULL || userdata == NULL) return; + fsp = (FindStructPtr) userdata; + + searchString = *strp; + if (searchString == NULL) return; - /*---------------------*/ - /* Return successfully */ - /*---------------------*/ + state = 0; + ptr = searchString; + ch = *ptr; - return TRUE; + while (ch != '\0') { + matches = NULL; + state = TextFsaNext (fsp->fsa, state, ch, &matches); + if (matches != NULL) { + fsp->did_find = TRUE; + return; + } + ptr++; + ch = *ptr; + } +} + +/* MASTER SEARCH FUNCTION CALLS DESIGNATED FUNC CALLBACK */ + +/*=======================================================================*/ +/* */ +/* FindReplString () - Does a search and replace in a given string. */ +/* */ +/* Main Parameters: */ +/* */ +/* strp : The string to operate on. Passed as a pointer to */ +/* a string so that it can be replaced by the */ +/* resulting string. */ +/* */ +/* fsp->find_string : The substring that is being replaced */ +/* in strp. */ +/* */ +/* fsp->replace_string : The substring that is replacing */ +/* find_string in strp. 
*/ +/* */ +/*=======================================================================*/ + +static void FindReplString ( + CharPtr PNTR strp, + FindStructPtr fsp +) + +{ + if (strp == NULL || fsp == NULL || fsp->func == NULL) return; + + fsp->func (strp, (Pointer) fsp); } /*=======================================================================*/ @@ -464,26 +526,25 @@ static void FindReplDbxrefs ( static void FindReplAffil ( AffilPtr pAffil, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pAffil == NULL) - return; + if (pAffil == NULL) return; if (pAffil->choice == 1) { - FindReplString (&(pAffil->affil) , pFindStruct); + FindReplString (&(pAffil->affil) , fsp); } else { - FindReplString (&(pAffil->affil) , pFindStruct); - FindReplString (&(pAffil->div) , pFindStruct); - FindReplString (&(pAffil->city) , pFindStruct); - FindReplString (&(pAffil->sub) , pFindStruct); - FindReplString (&(pAffil->country) , pFindStruct); - FindReplString (&(pAffil->street) , pFindStruct); - FindReplString (&(pAffil->email) , pFindStruct); - FindReplString (&(pAffil->fax) , pFindStruct); - FindReplString (&(pAffil->phone) , pFindStruct); - FindReplString (&(pAffil->postal_code), pFindStruct); + FindReplString (&(pAffil->affil) , fsp); + FindReplString (&(pAffil->div) , fsp); + FindReplString (&(pAffil->city) , fsp); + FindReplString (&(pAffil->sub) , fsp); + FindReplString (&(pAffil->country) , fsp); + FindReplString (&(pAffil->street) , fsp); + FindReplString (&(pAffil->email) , fsp); + FindReplString (&(pAffil->fax) , fsp); + FindReplString (&(pAffil->phone) , fsp); + FindReplString (&(pAffil->postal_code), fsp); } } @@ -493,9 +554,23 @@ static void FindReplAffil ( /* */ /*=======================================================================*/ +#define NAMESTD_LAST 0 +#define NAMESTD_FIRST 1 +#define NAMESTD_MIDDLE 2 +#define NAMESTD_FULL 3 +#define NAMESTD_INITIALS 4 +#define NAMESTD_SUFFIX 5 +#define NAMESTD_TITLE 6 + +#define PID_NOTSET 0 +#define PID_DBTAG 1 +#define 
PID_NAME 2 +#define PID_ML 3 +#define PID_STR 4 + static void FindReplAuthor ( AuthorPtr pAuthor, - FindStructPtr pFindStruct + FindStructPtr fsp ) { @@ -503,36 +578,33 @@ static void FindReplAuthor ( CharPtr pNameStr; ValNodePtr pDbxref; - if (pAuthor == NULL) - return; + if (pAuthor == NULL) return; - FindReplAffil (pAuthor->affil, pFindStruct); + FindReplAffil (pAuthor->affil, fsp); - switch (pAuthor->name->choice) - { + switch (pAuthor->name->choice) { case PID_NOTSET : break; case PID_DBTAG : pDbxref = pAuthor->name->data; - FindReplDbxrefs (pDbxref, pFindStruct); + FindReplDbxrefs (pDbxref, fsp); break; case PID_NAME : pNameStandard = pAuthor->name->data; - if (pNameStandard != NULL) - { - FindReplString (&(pNameStandard->names [NAMESTD_LAST]) , pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_FIRST]) , pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_MIDDLE]) , pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_FULL]) , pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_INITIALS]), pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_SUFFIX]) , pFindStruct); - FindReplString (&(pNameStandard->names [NAMESTD_TITLE]) , pFindStruct); + if (pNameStandard != NULL) { + FindReplString (&(pNameStandard->names [NAMESTD_LAST]) , fsp); + FindReplString (&(pNameStandard->names [NAMESTD_FIRST]) , fsp); + FindReplString (&(pNameStandard->names [NAMESTD_MIDDLE]) , fsp); + FindReplString (&(pNameStandard->names [NAMESTD_FULL]) , fsp); + FindReplString (&(pNameStandard->names [NAMESTD_INITIALS]), fsp); + FindReplString (&(pNameStandard->names [NAMESTD_SUFFIX]) , fsp); + FindReplString (&(pNameStandard->names [NAMESTD_TITLE]) , fsp); } break; case PID_ML : case PID_STR : pNameStr = pAuthor->name->data; - FindReplString (&pNameStr, pFindStruct); + FindReplString (&pNameStr, fsp); break; default: break; @@ -545,6 +617,10 @@ static void FindReplAuthor ( /* */ 
/*=======================================================================*/ +#define AUTHLIST_STRUCTURED 1 +#define AUTHLIST_ML 2 +#define AUTHLIST_STRING 3 + static void FindReplAuthlist ( AuthListPtr alp, FindStructPtr fsp @@ -555,30 +631,25 @@ static void FindReplAuthlist ( CharPtr szAuthor; AuthorPtr pAuthor; - if (alp == NULL) - return; + if (alp == NULL) return; FindReplAffil (alp->affil, fsp); vnpNames = alp->names; - while (vnpNames != NULL) - { - if (alp->choice == AUTHLIST_STRUCTURED) - { - pAuthor = (AuthorPtr) vnpNames->data.ptrvalue; - if (pAuthor != NULL) - FindReplAuthor (pAuthor, fsp); + while (vnpNames != NULL) { + if (alp->choice == AUTHLIST_STRUCTURED) { + pAuthor = (AuthorPtr) vnpNames->data.ptrvalue; + if (pAuthor != NULL) { + FindReplAuthor (pAuthor, fsp); } - else - { - szAuthor = (CharPtr) vnpNames->data.ptrvalue; - if (szAuthor != NULL) - { - FindReplString (&szAuthor, fsp); - vnpNames->data.ptrvalue = szAuthor; - } + } else { + szAuthor = (CharPtr) vnpNames->data.ptrvalue; + if (szAuthor != NULL) { + FindReplString (&szAuthor, fsp); + vnpNames->data.ptrvalue = szAuthor; } - vnpNames = vnpNames->next; } + vnpNames = vnpNames->next; + } } /*=======================================================================*/ @@ -589,14 +660,13 @@ static void FindReplAuthlist ( static void FindReplCitRetract ( CitRetractPtr pCitRetract, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pCitRetract == NULL) - return; + if (pCitRetract == NULL) return; - FindReplString (&(pCitRetract->exp), pFindStruct); + FindReplString (&(pCitRetract->exp), fsp); } /*=======================================================================*/ @@ -607,32 +677,22 @@ static void FindReplCitRetract ( static void FindReplImprint ( ImprintPtr pImprint, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - - /*------------------*/ - /* Check parameters */ - /*------------------*/ - - if (pImprint == NULL) - return; - - /*-------------------------*/ - /* Do the find and 
replace */ - /*-------------------------*/ - - FindReplString (&(pImprint->volume) , pFindStruct); - FindReplString (&(pImprint->issue) , pFindStruct); - FindReplString (&(pImprint->pages) , pFindStruct); - FindReplString (&(pImprint->section) , pFindStruct); - FindReplString (&(pImprint->part_sup) , pFindStruct); - FindReplString (&(pImprint->language) , pFindStruct); - FindReplString (&(pImprint->part_supi), pFindStruct); - - FindReplAffil (pImprint->pub, pFindStruct); - FindReplCitRetract (pImprint->retract, pFindStruct); + if (pImprint == NULL) return; + + FindReplString (&(pImprint->volume) , fsp); + FindReplString (&(pImprint->issue) , fsp); + FindReplString (&(pImprint->pages) , fsp); + FindReplString (&(pImprint->section) , fsp); + FindReplString (&(pImprint->part_sup) , fsp); + FindReplString (&(pImprint->language) , fsp); + FindReplString (&(pImprint->part_supi), fsp); + + FindReplAffil (pImprint->pub, fsp); + FindReplCitRetract (pImprint->retract, fsp); } /*=======================================================================*/ @@ -643,31 +703,31 @@ static void FindReplImprint ( static void FindReplCitBook ( CitBookPtr pCitBook, - FindStructPtr pFindStruct + FindStructPtr fsp ) { AffilPtr afp; - ValNodePtr vnp; CharPtr tmpStr; + ValNodePtr vnp; if (pCitBook == NULL) return; - FindReplStringList (pCitBook->title, pFindStruct); - FindReplImprint (pCitBook->imp, pFindStruct); - FindReplAuthlist (pCitBook->authors, pFindStruct); - FindReplStringList (pCitBook->title, pFindStruct); - FindReplStringList (pCitBook->coll, pFindStruct); + FindReplStringList (pCitBook->title, fsp); + FindReplImprint (pCitBook->imp, fsp); + FindReplAuthlist (pCitBook->authors, fsp); + FindReplStringList (pCitBook->title, fsp); + FindReplStringList (pCitBook->coll, fsp); if (pCitBook->othertype == 1) { for (vnp = (ValNodePtr) pCitBook->otherdata; vnp != NULL; vnp = vnp->next) { switch (vnp->choice) { case 1 : - FindReplString ((CharPtr PNTR) &(vnp->data.ptrvalue), pFindStruct); + 
FindReplString ((CharPtr PNTR) &(vnp->data.ptrvalue), fsp); break; case 3 : afp = (AffilPtr) vnp->data.ptrvalue; - FindReplAffil (afp, pFindStruct); + FindReplAffil (afp, fsp); break; default : break; @@ -675,35 +735,34 @@ static void FindReplCitBook ( } } else if (pCitBook->othertype == 2) { tmpStr = (CharPtr) pCitBook->otherdata; - FindReplString (&tmpStr, pFindStruct); + FindReplString (&tmpStr, fsp); pCitBook->otherdata = tmpStr; } } static void FindReplCitArt ( CitArtPtr pCitArt, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - CitJourPtr pCitJournal; - CitBookPtr pCitBook; + CitBookPtr pCitBook; + CitJourPtr pCitJournal; - if (pCitArt == NULL) - return; + if (pCitArt == NULL) return; - FindReplAuthlist (pCitArt->authors, pFindStruct); + FindReplAuthlist (pCitArt->authors, fsp); if (pCitArt->fromptr != NULL) { switch (pCitArt->from) { case 1 : pCitJournal = (CitJourPtr) pCitArt->fromptr; - FindReplStringList (pCitArt->title, pFindStruct); - FindReplImprint (pCitJournal->imp, pFindStruct); + FindReplStringList (pCitArt->title, fsp); + FindReplImprint (pCitJournal->imp, fsp); break; case 2 : case 3 : pCitBook = (CitBookPtr) pCitArt->fromptr; - FindReplCitBook (pCitBook, pFindStruct); + FindReplCitBook (pCitBook, fsp); break; default : break; @@ -719,7 +778,7 @@ static void FindReplCitArt ( static void FindReplMedlineEntry ( MedlineEntryPtr pMedlineEntry, - FindStructPtr pFindStruct + FindStructPtr fsp ) { @@ -728,61 +787,53 @@ static void FindReplMedlineEntry ( MedlineRnPtr pRn; CharPtr tmpStr; - if (pMedlineEntry == NULL) - return; + if (pMedlineEntry == NULL) return; - FindReplCitArt(pMedlineEntry->cit, pFindStruct); - FindReplString (&(pMedlineEntry->abstract), pFindStruct); + FindReplCitArt(pMedlineEntry->cit, fsp); + FindReplString (&(pMedlineEntry->abstract), fsp); pRn = pMedlineEntry->substance; - while (pRn != NULL) - { - FindReplString (&(pRn->cit), pFindStruct); - FindReplString (&(pRn->name), pFindStruct); - pRn = pRn->next; - } + while (pRn != 
NULL) { + FindReplString (&(pRn->cit), fsp); + FindReplString (&(pRn->name), fsp); + pRn = pRn->next; + } pMesh = pMedlineEntry->mesh; - while (pMesh != NULL) - { - FindReplString (&(pMesh->term), pFindStruct); - pMesh = pMesh->next; - } + while (pMesh != NULL) { + FindReplString (&(pMesh->term), fsp); + pMesh = pMesh->next; + } - if (pMedlineEntry->xref != NULL) - { - tmpStr = (CharPtr) pMedlineEntry->xref->data.ptrvalue; - FindReplString (&tmpStr, pFindStruct); - pMedlineEntry->xref->data.ptrvalue = tmpStr; - } + if (pMedlineEntry->xref != NULL) { + tmpStr = (CharPtr) pMedlineEntry->xref->data.ptrvalue; + FindReplString (&tmpStr, fsp); + pMedlineEntry->xref->data.ptrvalue = tmpStr; + } - if (pMedlineEntry->idnum != NULL) - { - tmpStr = (CharPtr) pMedlineEntry->idnum->data.ptrvalue; - FindReplString (&tmpStr, pFindStruct); - pMedlineEntry->idnum->data.ptrvalue = tmpStr; - } + if (pMedlineEntry->idnum != NULL) { + tmpStr = (CharPtr) pMedlineEntry->idnum->data.ptrvalue; + FindReplString (&tmpStr, fsp); + pMedlineEntry->idnum->data.ptrvalue = tmpStr; + } - if (pMedlineEntry->pub_type != NULL) - { - tmpStr = (CharPtr) pMedlineEntry->pub_type->data.ptrvalue; - FindReplString (&tmpStr, pFindStruct); - pMedlineEntry->pub_type->data.ptrvalue = tmpStr; - } + if (pMedlineEntry->pub_type != NULL) { + tmpStr = (CharPtr) pMedlineEntry->pub_type->data.ptrvalue; + FindReplString (&tmpStr, fsp); + pMedlineEntry->pub_type->data.ptrvalue = tmpStr; + } - if (pMedlineEntry->gene != NULL) - { - tmpStr = (CharPtr) pMedlineEntry->gene->data.ptrvalue; - FindReplString (&tmpStr, pFindStruct); - pMedlineEntry->gene->data.ptrvalue = tmpStr; - } + if (pMedlineEntry->gene != NULL) { + tmpStr = (CharPtr) pMedlineEntry->gene->data.ptrvalue; + FindReplString (&tmpStr, fsp); + pMedlineEntry->gene->data.ptrvalue = tmpStr; + } pField = pMedlineEntry->mlfield; - while (pField != NULL) - { - FindReplString (&(pField->str), pFindStruct); - pField = pField->next; - } + while (pField != NULL) { + 
FindReplString (&(pField->str), fsp); + pField = pField->next; + } } /*=======================================================================*/ @@ -822,8 +873,7 @@ static void FindReplPub ( } if (vnp->data.ptrvalue == NULL) return; - switch (vnp->choice) - { + switch (vnp->choice) { case PUB_Gen : cgp = (CitGenPtr) vnp->data.ptrvalue; FindReplAuthlist (cgp->authors, fsp); @@ -832,8 +882,7 @@ static void FindReplPub ( FindReplString (&(cgp->issue), fsp); FindReplString (&(cgp->pages), fsp); FindReplString (&(cgp->title), fsp); - if (cgp->journal != NULL) - { + if (cgp->journal != NULL) { tmpStr = (CharPtr) cgp->journal->data.ptrvalue; FindReplString (&tmpStr, fsp); cgp->journal->data.ptrvalue = tmpStr; @@ -854,8 +903,7 @@ static void FindReplPub ( break; case PUB_Journal : cjp = (CitJourPtr) vnp->data.ptrvalue; - if (cjp->title != NULL) - { + if (cjp->title != NULL) { tmpStr = (CharPtr) cjp->title->data.ptrvalue; FindReplString (&tmpStr, fsp); cjp->title->data.ptrvalue = tmpStr; @@ -869,16 +917,14 @@ static void FindReplPub ( case PUB_Proc : cbp = (CitBookPtr) vnp->data.ptrvalue; cpvnp = cbp->otherdata; - while (cpvnp != NULL) - { - if (cpvnp->choice == 1) - { - tmpStr = (CharPtr) cpvnp->data.ptrvalue; - FindReplString (&tmpStr, fsp); - cpvnp->data.ptrvalue = tmpStr; - } - else if (cpvnp->choice == 3) + while (cpvnp != NULL) { + if (cpvnp->choice == 1) { + tmpStr = (CharPtr) cpvnp->data.ptrvalue; + FindReplString (&tmpStr, fsp); + cpvnp->data.ptrvalue = tmpStr; + } else if (cpvnp->choice == 3) { FindReplAffil((AffilPtr) cpvnp->data.ptrvalue, fsp); + } cpvnp = cpvnp->next; } break; @@ -1021,20 +1067,17 @@ static void FindReplBioSource ( static void FindReplPatentSeqId ( PatentSeqIdPtr pPatentSeqId, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pPatentSeqId == NULL) - return; - - if (pPatentSeqId->cit == NULL) - return; + if (pPatentSeqId == NULL) return; + if (pPatentSeqId->cit == NULL) return; - FindReplString (&(pPatentSeqId->cit->country), 
pFindStruct); - FindReplString (&(pPatentSeqId->cit->number), pFindStruct); - FindReplString (&(pPatentSeqId->cit->app_number), pFindStruct); - FindReplString (&(pPatentSeqId->cit->doc_type), pFindStruct); + FindReplString (&(pPatentSeqId->cit->country), fsp); + FindReplString (&(pPatentSeqId->cit->number), fsp); + FindReplString (&(pPatentSeqId->cit->app_number), fsp); + FindReplString (&(pPatentSeqId->cit->doc_type), fsp); } /*=======================================================================*/ @@ -1045,16 +1088,15 @@ static void FindReplPatentSeqId ( static void FindReplTextSeqId ( TextSeqIdPtr pTextSeqId, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pTextSeqId == NULL) - return; + if (pTextSeqId == NULL) return; - FindReplString (&(pTextSeqId->name), pFindStruct); - FindReplString (&(pTextSeqId->accession), pFindStruct); - FindReplString (&(pTextSeqId->release), pFindStruct); + FindReplString (&(pTextSeqId->name), fsp); + FindReplString (&(pTextSeqId->accession), fsp); + FindReplString (&(pTextSeqId->release), fsp); } /*=======================================================================*/ @@ -1065,15 +1107,14 @@ static void FindReplTextSeqId ( static void FindReplGiim ( GiimPtr pGiim, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pGiim == NULL) - return; + if (pGiim == NULL) return; - FindReplString (&(pGiim->db), pFindStruct); - FindReplString (&(pGiim->release), pFindStruct); + FindReplString (&(pGiim->db), fsp); + FindReplString (&(pGiim->release), fsp); } /*=======================================================================*/ @@ -1084,14 +1125,13 @@ static void FindReplGiim ( static void FindReplPDBSeqId ( PDBSeqIdPtr pPDBSeqId, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pPDBSeqId == NULL) - return; + if (pPDBSeqId == NULL) return; - FindReplString (&(pPDBSeqId->mol), pFindStruct); + FindReplString (&(pPDBSeqId->mol), fsp); } /*=======================================================================*/ @@ 
-1102,14 +1142,13 @@ static void FindReplPDBSeqId ( static void FindReplObjectId ( ObjectIdPtr pObjectId, - FindStructPtr pFindStruct + FindStructPtr fsp ) { - if (pObjectId == NULL) - return; + if (pObjectId == NULL) return; - FindReplString (&(pObjectId->str), pFindStruct); + FindReplString (&(pObjectId->str), fsp); } /*=======================================================================*/ @@ -1127,35 +1166,16 @@ static void FindReplSeqId ( FindStructPtr fsp; Uint1 subtype; - /*------------------*/ - /* Check parameters */ - /*------------------*/ - - if (sip == NULL) - return; - + if (sip == NULL) return; fsp = (FindStructPtr) userdata; - if (fsp == NULL) - return; - /*-----------------------------------*/ - /* Check to see if we're supposed to */ - /* process this subtype or not. */ - /*-----------------------------------*/ + /* check subtype against filter */ subtype = sip->choice; - if (subtype >= NUM_SEQID) - return; - if (! fsp->seqidFilter [subtype]) - return; - - /*------------------------------*/ - /* Do search/replace on all the */ - /* different SeqId types. */ - /*------------------------------*/ + if (subtype >= NUM_SEQID) return; + if (! 
fsp->seqidFilter [subtype]) return; - switch (subtype) - { + switch (subtype) { case SEQID_NOT_SET : break; case SEQID_LOCAL : @@ -1216,6 +1236,9 @@ static void FindReplSendMessages ( if (fsp->select_item && (fsp->did_find || fsp->did_replace)) { ObjMgrAlsoSelect (fsp->entityID, itemID, itemtype, 0, NULL); } + if (fsp->callback != NULL && (fsp->did_find || fsp->did_replace)) { + fsp->callback (fsp->entityID, itemID, itemtype, fsp->userdata); + } } /*=======================================================================*/ @@ -1233,6 +1256,8 @@ static void FindReplBioseqs ( FindStructPtr fsp; SeqIdPtr sip; + if (bsp == NULL) return; + fsp = (FindStructPtr) userdata; fsp->did_find = FALSE; fsp->did_replace = FALSE; @@ -1241,7 +1266,9 @@ static void FindReplBioseqs ( FindReplSeqId (sip, userdata); } - SeqMgrReplaceInBioseqIndex(bsp); + if (fsp->did_replace) { + SeqMgrReplaceInBioseqIndex (bsp); + } FindReplSendMessages (fsp, bsp->idx.itemID, bsp->idx.itemtype); } @@ -1265,6 +1292,8 @@ static void FindReplAligns ( SeqLocPtr slp; StdSegPtr ssp; + if (sap == NULL) return; + fsp = (FindStructPtr) userdata; fsp->did_find = FALSE; fsp->did_replace = FALSE; @@ -1274,6 +1303,8 @@ static void FindReplAligns ( if (sap->segs == NULL) return; + /* SAS_DISC recursively presented by visit function, so removed here */ + switch (sap->segtype) { case SAS_DENDIAG : ddp = (DenseDiagPtr) sap->segs; @@ -1297,12 +1328,6 @@ static void FindReplAligns ( } } break; - case SAS_DISC : - /* recursive */ - for (sap = (SeqAlignPtr) sap->segs; sap != NULL; sap = sap->next) { - FindReplAligns (sap, userdata); - } - break; default : break; } @@ -1324,6 +1349,8 @@ static void FindReplGraphs ( { FindStructPtr fsp; + if (sgp == NULL) return; + fsp = (FindStructPtr) userdata; fsp->did_find = FALSE; fsp->did_replace = FALSE; @@ -1359,6 +1386,8 @@ static void FindReplFeats ( Uint1 subtype; tRNAPtr trp; + if (sfp == NULL) return; + fsp = (FindStructPtr) userdata; fsp->did_find = FALSE; fsp->did_replace = 
FALSE; @@ -1498,6 +1527,8 @@ static void FindReplDescs ( PubdescPtr pdp; Uint1 subtype; + if (sdp == NULL) return; + fsp = (FindStructPtr) userdata; fsp->did_find = FALSE; fsp->did_replace = FALSE; @@ -1613,6 +1644,7 @@ static void FindReplSubmitBlock ( if (ssp == NULL) return; sub = ssp->sub; if (sub == NULL) return; + fsp->did_find = FALSE; fsp->did_replace = FALSE; @@ -1644,6 +1676,8 @@ static void FindReplSubmitBlock ( FindReplSendMessages (fsp, ssp->idx.itemID, ssp->idx.itemtype); } +/* EXTERNAL FIND-REPLACE FUNCTIONS */ + /*=======================================================================*/ /* */ /* FindReplaceInEntity() - New find/replace function. */ @@ -1662,7 +1696,9 @@ NLM_EXTERN void FindReplaceInEntity ( BoolPtr descFilter, BoolPtr featFilter, BoolPtr seqidFilter, - Boolean do_seqid_local + Boolean do_seqid_local, + FindReplProc callback, + Pointer userdata ) { @@ -1672,8 +1708,10 @@ NLM_EXTERN void FindReplaceInEntity ( ObjMgrDataPtr omdp; SeqEntryPtr sep = NULL; SeqSubmitPtr ssp = NULL; + size_t subLen; - if (entityID == 0 || StringHasNoText (find_string)) return; + if (entityID == 0 || find_string == NULL + || (whole_word && StringHasNoText (find_string))) return; omdp = ObjMgrGetData (entityID); if (omdp != NULL) { @@ -1698,29 +1736,38 @@ NLM_EXTERN void FindReplaceInEntity ( MemSet ((Pointer) &fs, 0, sizeof (FindStruct)); fs.entityID = entityID; + if (do_replace) { + fs.func = BoyerMooreReplaceString; + } else { + fs.func = BoyerMooreFindString; + } + fs.callback = callback; + fs.userdata = userdata; + fs.find_string = find_string; fs.replace_string = replace_string; fs.case_counts = case_counts; fs.whole_word = whole_word; - fs.do_replace = do_replace; + fs.findLen = StringLen (find_string); + fs.replaceLen = StringLen (replace_string); + fs.select_item = select_item; fs.send_update = send_update; - fs.did_find = FALSE; fs.did_replace = FALSE; fs.dirty = FALSE; /* build Boyer-Moore displacement array in advance */ - fs.subLen = StringLen 
(find_string); + subLen = StringLen (find_string); for (ch = 0; ch < 256; ch++) { - fs.d [ch] = fs.subLen; + fs.d [ch] = subLen; } - for (j = 0; j < (int) (fs.subLen - 1); j++) { + for (j = 0; j < (int) (subLen - 1); j++) { ch = (int) (case_counts ? find_string [j] : TO_UPPER (find_string [j])); if (ch >= 0 && ch <= 255) { - fs.d [ch] = fs.subLen - j - 1; + fs.d [ch] = subLen - j - 1; } } @@ -1779,6 +1826,141 @@ NLM_EXTERN void FindReplaceInEntity ( /*=======================================================================*/ /* */ +/* FindStringsInEntity() - Multi-string find function. */ +/* */ +/*=======================================================================*/ + +NLM_EXTERN void FindStringsInEntity ( + Uint2 entityID, + CharPtr PNTR find_strings, + Boolean case_counts, + Boolean whole_word, + Boolean select_item, + Int2 send_update, + BoolPtr descFilter, + BoolPtr featFilter, + BoolPtr seqidFilter, + Boolean do_seqid_local, + FindReplProc callback, + Pointer userdata +) + +{ + FindStruct fs; + int j; + ObjMgrDataPtr omdp; + SeqEntryPtr sep = NULL; + SeqSubmitPtr ssp = NULL; + + if (entityID == 0 || find_strings == NULL) return; + + omdp = ObjMgrGetData (entityID); + if (omdp != NULL) { + switch (omdp->datatype) { + case OBJ_SEQSUB : + ssp = (SeqSubmitPtr) omdp->dataptr; + if (ssp != NULL && ssp->datatype == 1) { + sep = (SeqEntryPtr) ssp->data; + } + break; + case OBJ_BIOSEQ : + sep = (SeqEntryPtr) omdp->choice; + case OBJ_BIOSEQSET : + sep = (SeqEntryPtr) omdp->choice; + default : + break; + } + } + /* sep = GetTopSeqEntryForEntityID (entityID); */ + if (sep == NULL) return; + + MemSet ((Pointer) &fs, 0, sizeof (FindStruct)); + + fs.entityID = entityID; + fs.func = FSAFindStrings; + fs.callback = callback; + fs.userdata = userdata; + + fs.find_string = NULL; + fs.replace_string = NULL; + fs.case_counts = case_counts; + fs.whole_word = whole_word; + fs.findLen = 0; + fs.replaceLen = 0; + + fs.select_item = select_item; + fs.send_update = send_update; + 
fs.did_find = FALSE; + fs.did_replace = FALSE; + fs.dirty = FALSE; + + /* build finite state machine in advance */ + + fs.fsa = TextFsaNew (); + + for (j = 0; find_strings [j] != NULL; j++) { + TextFsaAdd (fs.fsa, find_strings [j]); + } + + /* if desc or feat filter arrays not supplied, default to all TRUE */ + + if (descFilter != NULL) { + MemCopy ((Pointer) &fs.descFilter, (Pointer) descFilter, sizeof (fs.descFilter)); + } else { + MemSet ((Pointer) &fs.descFilter, (int) TRUE, sizeof (fs.descFilter)); + } + + if (featFilter != NULL) { + MemCopy ((Pointer) &fs.featFilter, (Pointer) featFilter, sizeof (fs.featFilter)); + } else { + MemSet ((Pointer) &fs.featFilter, (int) TRUE, sizeof (fs.featFilter)); + } + + /* if seqid filter array not supplied, default to all FALSE */ + + if (seqidFilter != NULL) { + MemCopy ((Pointer) &fs.seqidFilter, (Pointer) seqidFilter, sizeof (fs.seqidFilter)); + } else if (do_seqid_local) { + MemSet ((Pointer) &fs.seqidFilter, (int) FALSE, sizeof (fs.seqidFilter)); + fs.seqidFilter [SEQID_LOCAL] = TRUE; + } else { + MemSet ((Pointer) &fs.seqidFilter, (int) FALSE, sizeof (fs.seqidFilter)); + } + + /* ensure feature subtype is set in sfp->idx block */ + + AssignIDsInEntity (entityID, 0, NULL); + + /* visit callbacks that find/replace specific fields */ + + VisitBioseqsInSep (sep, (Pointer) &fs, FindReplBioseqs); + + VisitFeaturesInSep (sep, (Pointer) &fs, FindReplFeats); + + VisitAlignmentsInSep (sep, (Pointer) &fs, FindReplAligns); + + VisitGraphsInSep (sep, (Pointer) &fs, FindReplGraphs); + + VisitDescriptorsInSep (sep, (Pointer) &fs, FindReplDescs); + + if (ssp != NULL) { + FindReplSubmitBlock (ssp, &fs); + } + + /* clean up finite state machine */ + + TextFsaFree (fs.fsa); + + /* send select message, if applicable */ + + if (fs.send_update == UPDATE_ONCE && fs.dirty) { + ObjMgrSetDirtyFlag (entityID, TRUE); + ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0); + } +} + 
+/*=======================================================================*/ +/* */ /* FindReplaceString() - find/replace just one string. */ /* */ /*=======================================================================*/ @@ -1795,35 +1977,41 @@ NLM_EXTERN void FindReplaceString ( int ch; FindStruct fs; int j; + size_t subLen; if (strp == NULL || StringHasNoText (find_string)) return; MemSet ((Pointer) &fs, 0, sizeof (FindStruct)); fs.entityID = 0; + fs.func = BoyerMooreReplaceString; + fs.callback = NULL; + fs.userdata = NULL; + fs.find_string = find_string; fs.replace_string = replace_string; fs.case_counts = case_counts; fs.whole_word = whole_word; - fs.do_replace = TRUE; - fs.select_item = FALSE; - fs.send_update = FALSE; + fs.findLen = StringLen (find_string); + fs.replaceLen = StringLen (replace_string); + fs.select_item = FALSE; + fs.send_update = UPDATE_NEVER; fs.did_find = FALSE; fs.did_replace = FALSE; fs.dirty = FALSE; /* build Boyer-Moore displacement array in advance */ - fs.subLen = StringLen (find_string); + subLen = StringLen (find_string); for (ch = 0; ch < 256; ch++) { - fs.d [ch] = fs.subLen; + fs.d [ch] = subLen; } - for (j = 0; j < (int) (fs.subLen - 1); j++) { + for (j = 0; j < (int) (subLen - 1); j++) { ch = (int) (case_counts ? 
find_string [j] : TO_UPPER (find_string [j])); if (ch >= 0 && ch <= 255) { - fs.d [ch] = fs.subLen - j - 1; + fs.d [ch] = subLen - j - 1; } } diff --git a/api/findrepl.h b/api/findrepl.h index 20df8b12..a23ffad4 100644 --- a/api/findrepl.h +++ b/api/findrepl.h @@ -44,6 +44,12 @@ * RCS Modification History: * ------------------------- * $Log: findrepl.h,v $ +* Revision 6.5 2006/01/04 20:39:41 kans +* added FindStringsInEntity using finite state machine, general cleanup of code +* +* Revision 6.4 2005/12/29 20:54:41 kans +* FindReplaceInEntity takes callback and userdata +* * Revision 6.3 2003/07/31 20:54:54 kans * FindReplaceString does not need do_replace argument * @@ -103,6 +109,8 @@ extern "C" { #define UPDATE_EACH 1 /* send it on each replace */ #define UPDATE_ONCE 2 /* send once for whole entityID, if any replacements occur */ +typedef void (*FindReplProc) (Uint2 entityID, Uint2 itemID, Uint2 itemtype, Pointer userdata); + NLM_EXTERN void FindReplaceInEntity ( Uint2 entityID, CharPtr find_string, @@ -115,7 +123,9 @@ NLM_EXTERN void FindReplaceInEntity ( BoolPtr descFilter, BoolPtr featFilter, BoolPtr seqidFilter, - Boolean do_seqid_local + Boolean do_seqid_local, + FindReplProc callback, + Pointer userdata ); NLM_EXTERN void FindReplaceString ( @@ -126,6 +136,21 @@ NLM_EXTERN void FindReplaceString ( Boolean whole_word ); +NLM_EXTERN void FindStringsInEntity ( + Uint2 entityID, + CharPtr PNTR find_strings, + Boolean case_counts, + Boolean whole_word, + Boolean select_item, + Int2 send_update, + BoolPtr descFilter, + BoolPtr featFilter, + BoolPtr seqidFilter, + Boolean do_seqid_local, + FindReplProc callback, + Pointer userdata +); + #ifdef __cplusplus extern "C" } diff --git a/api/salsap.c b/api/salsap.c index 1be3b1b7..30f5dcd5 100644 --- a/api/salsap.c +++ b/api/salsap.c @@ -29,7 +29,7 @@ * * Version Creation Date: 1/27/96 * -* $Revision: 6.11 $ +* $Revision: 6.13 $ * * File Description: * @@ -2250,6 +2250,650 @@ NLM_EXTERN SeqAlignPtr LIBCALL 
SeqAlignDeleteByLoc (SeqLocPtr slp, SeqAlignPtr s return salp; } +static Boolean AreDenseSegSegmentsValid (DenseSegPtr dsp, Int4 start, Int4 num) +{ + Int4 k, seg_num, next_pos; + + if (dsp == NULL || start < 0 || num < 1) + { + return FALSE; + } + + for (k = 0; k < dsp->dim; k++) + { + if (dsp->strands == NULL || dsp->strands[k] == Seq_strand_plus) + { + if(dsp->starts [dsp->dim * start + k] > -1) + { + next_pos = dsp->starts [dsp->dim * start + k] + dsp->lens[start]; + } + else + { + next_pos = -1; + } + for (seg_num = start + 1; seg_num - start < num; seg_num++) + { + if (dsp->starts[dsp->dim * seg_num + k] == -1) + { + continue; + } + if (next_pos != -1) + { + if (dsp->starts[dsp->dim * seg_num + k] != next_pos) + { + return FALSE; + } + } + next_pos = dsp->starts[dsp->dim * seg_num + k] + dsp->lens[seg_num]; + } + } + else + { + if (dsp->starts [dsp->dim * (start + num - 1) + k] > -1) + { + next_pos = dsp->starts [dsp->dim * (start + num - 1) + k] + dsp->lens [start + num - 1]; + } + else + { + next_pos = -1; + } + for (seg_num = start + num - 2; seg_num >= start; seg_num--) + { + if (dsp->starts [dsp->dim * seg_num + k] == -1) + { + continue; + } + if (next_pos != -1) + { + if (dsp->starts[dsp->dim * seg_num + k] != next_pos) + { + return FALSE; + } + } + next_pos = dsp->starts[dsp->dim * seg_num + k] + dsp->lens[seg_num]; + } + } + + } + + return TRUE; +} + + +static void +FillInPlusStrandInsertionSegmentA +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 first_len, + Int4 second_len, + Int4 orig_segment, + Int4Ptr this_seg) +{ + Int4 k; + + if (dsp_orig == NULL || dsp_new == NULL + || insert_start < 0 || insert_len < 0 + || first_len < 0 + || second_len < 0 + || orig_segment < 0 || orig_segment >= dsp_orig->numseg + || this_seg == NULL + || *this_seg < 0 || *this_seg >= dsp_new->numseg) + { + return; + } + + if (first_len == 0) + { + return; + } + + for (k = 0; k < dsp_orig->dim; k++) + { + 
dsp_new->starts[(*this_seg) * dsp_new->dim + k] + = dsp_orig->starts[orig_segment * dsp_orig->dim + k]; + if (dsp_orig->strands != NULL) + { + dsp_new->strands[(*this_seg) * dsp_new->dim + k] + = dsp_orig->strands[orig_segment * dsp_orig->dim + k]; + if (dsp_orig->strands[orig_segment * dsp_orig->dim + k] == Seq_strand_minus + && dsp_new->starts[(*this_seg) * dsp_new->dim + k] > -1) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] += second_len; + } + } + } + + dsp_new->lens[*this_seg] = first_len; + (*this_seg)++; +} + + +static void +FillInInsertionSegmentB +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 first_len, + Int4 second_len, + Int4 orig_segment, + Int4Ptr this_seg) +{ + Int4 k; + + if (dsp_orig == NULL || dsp_new == NULL + || insert_start < 0 || insert_len < 0 + || first_len < 0 + || second_len < 0 + || orig_segment < 0 || orig_segment >= dsp_orig->numseg + || this_seg == NULL + || *this_seg < 0 || *this_seg >= dsp_new->numseg) + { + return; + } + + if (insert_len == 0) + { + return; + } + + for (k = 0; k < dsp_orig->dim; k++) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] = -1; + if (dsp_orig->strands != NULL) + { + dsp_new->strands[(*this_seg) * dsp_new->dim + k] + = dsp_orig->strands[orig_segment * dsp_orig->dim + k]; + } + } + dsp_new->starts[(*this_seg) * dsp_new->dim + insert_row] = insert_start; + + dsp_new->lens[*this_seg] = insert_len; + (*this_seg)++; +} + +static void FillInPlusStrandInsertionSegmentC +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 first_len, + Int4 second_len, + Int4 orig_segment, + Int4Ptr this_seg) +{ + Int4 k; + + if (dsp_orig == NULL || dsp_new == NULL + || insert_start < 0 || insert_len < 0 + || first_len < 0 + || second_len < 0 + || orig_segment < 0 || orig_segment >= dsp_orig->numseg + || this_seg == NULL + || *this_seg < 0 || *this_seg >= dsp_new->numseg) + { + return; + } + + 
if (second_len == 0) + { + return; + } + + for (k = 0; k < dsp_orig->dim; k++) + { + if ((dsp_orig->strands == NULL + || dsp_orig->strands[orig_segment * dsp_new->dim + k] != Seq_strand_minus) + && dsp_new->starts[(*this_seg) * dsp_new->dim + k] > -1) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] = + dsp_orig->starts[orig_segment * dsp_orig->dim + k] + first_len; + } + else + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] = + dsp_orig->starts[orig_segment * dsp_orig->dim + k]; + } + + if (dsp_orig->strands != NULL) + { + dsp_new->strands[(*this_seg) * dsp_new->dim + k] + = dsp_orig->strands[orig_segment * dsp_orig->dim + k]; + } + } + dsp_new->starts[(*this_seg) * dsp_new->dim + insert_row] += insert_len; + + dsp_new->lens[*this_seg] = second_len; + (*this_seg)++; +} + + +static void +FillInMinusStrandInsertionSegmentA +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 first_len, + Int4 second_len, + Int4 orig_segment, + Int4Ptr this_seg) +{ + Int4 k; + + if (dsp_orig == NULL || dsp_new == NULL + || insert_start < 0 || insert_len < 0 + || first_len < 0 + || second_len < 0 + || orig_segment < 0 || orig_segment >= dsp_orig->numseg + || this_seg == NULL + || *this_seg < 0 || *this_seg >= dsp_new->numseg) + { + return; + } + + if (first_len == 0) + { + return; + } + + for (k = 0; k < dsp_orig->dim; k++) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] + = dsp_orig->starts[orig_segment * dsp_orig->dim + k]; + if (dsp_orig->strands != NULL) + { + dsp_new->strands[(*this_seg) * dsp_new->dim + k] + = dsp_orig->strands[orig_segment * dsp_orig->dim + k]; + if (dsp_orig->strands[orig_segment * dsp_orig->dim + k] == Seq_strand_minus + && dsp_new->starts[(*this_seg) * dsp_new->dim + k] != -1) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] += second_len; + } + } + } + + dsp_new->starts[(*this_seg) * dsp_new->dim + insert_row] += insert_len; + + dsp_new->lens[*this_seg] = first_len; + 
(*this_seg)++; +} + +static void FillInMinusStrandInsertionSegmentC +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 first_len, + Int4 second_len, + Int4 orig_segment, + Int4Ptr this_seg) +{ + Int4 k; + + if (dsp_orig == NULL || dsp_new == NULL + || insert_start < 0 || insert_len < 0 + || first_len < 0 + || second_len < 0 + || orig_segment < 0 || orig_segment >= dsp_orig->numseg + || this_seg == NULL + || *this_seg < 0 || *this_seg >= dsp_new->numseg) + { + return; + } + + if (second_len == 0) + { + return; + } + + for (k = 0; k < dsp_orig->dim; k++) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] = + dsp_orig->starts[orig_segment * dsp_orig->dim + k]; + + if ((dsp_orig->strands == NULL + || dsp_orig->strands[orig_segment * dsp_orig->dim + k] != Seq_strand_minus) + && dsp_new->starts[(*this_seg) * dsp_new->dim + k] != -1) + { + dsp_new->starts[(*this_seg) * dsp_new->dim + k] += first_len; + } + + if (dsp_orig->strands != NULL) + { + dsp_new->strands[(*this_seg) * dsp_new->dim + k] + = dsp_orig->strands[orig_segment * dsp_orig->dim + k]; + } + } + + dsp_new->lens[*this_seg] = second_len; + (*this_seg)++; +} + +static void +InsertInSegment +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 insert_start, + Int4 insert_len, + Int4 insert_row, + Int4 orig_segment, + Int4Ptr this_segment) +{ + /* The original segment needs to be replaced by either one or two segments + * in the new alignment. + * Call segment O the segment that contains insert_start. + * If insert_start == the start of segment O, only one additional segment + * will be needed, otherwise allocate space for two extra segments. + * If insert_row is a plus row: + * If insert_start == the start of segment O, + * the gap segment will be inserted immediately before + * segment O. For insert_row, all starts + * for segment O and beyond will be increased by insert_len. 
+ * Otherwise, segment O will be replaced by segment (A) (a truncated + * version of segment O), a gap segment (B) will be + * inserted after A, and a third segment (C) will be inserted + * after segment B. + * Call first_len = insert_start - start of segment O on insert_row + * Call second_len = length of segment O on insert_row - first_len + * The length of segment A will be first_len. + * The start of segment A for all plus strand rows will be + * the start of segment O. + * The start of segment A for all minus strand rows will be + * the start of segment O + second_len. + * The start of segment B for insert_row will be insert_start. + * The start of segment B for all other rows in the gap segment will be -1. + * The length of segment B will be insert_len. + * For insert_row, the start of segment C will be + * the start of segment O + first_len + insert_len. + * For all remaining plus strand rows, the start of segment C + * will be the start of segment O + first_len. + * For all minus strand rows, the start of segment C will be + * the start of segment O. + * The length of segment C will be second_len. + * If insert_row is a minus row: + * If insert_start == the start of segment O, + * the gap segment will be inserted immediately after + * segment O and all of the starts for insert_row + * before segment O will be increased by insert_len. + * Otherwise, segment O will be replaced by segment A, a gap segment (B), + * and segment C. + * Call first_len = start of segment O on insert_row + length of segment O on insert_row - insert_start + * Call second_len = insert_start - start of segment O on insert_row + * The length of segment A will be first_len. + * For every plus strand row, the start of segment A will be the start of + * segment O. + * For insert_row, the start of segment A will be the start of segment O + second_len + insert_len. + * For every other minus strand row, the start of segment A will be the + * start of segment O + second_len. 
+ * The length of segment B will be insert_len. + * For insert_row, the start of segment B will be insert_start. + * For every other row, the start of segment B will be -1. + * The length of segment C will be second_len. + * For every minus row, the start of segment C will be the start of segment O. + * For every plus row, the start of segment C will be the start of segment O + first_len. + * For insert_row, the start of every segment prior to segment O will be increased + * by insert_len. + */ + + Int4 first_len, second_len; + + if (dsp_orig->strands != NULL && dsp_orig->strands[insert_row] == Seq_strand_minus) + { + first_len = dsp_orig->starts [dsp_orig->dim * orig_segment + insert_row] + + dsp_orig->lens [orig_segment] - insert_start; + second_len = insert_start - dsp_orig->starts [dsp_orig->dim * orig_segment + insert_row]; + FillInMinusStrandInsertionSegmentA(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + FillInInsertionSegmentB(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + FillInMinusStrandInsertionSegmentC(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + } + else + { + first_len = insert_start - dsp_orig->starts [dsp_orig->dim * orig_segment + insert_row]; + second_len = dsp_orig->lens[orig_segment] - first_len; + FillInPlusStrandInsertionSegmentA(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + FillInInsertionSegmentB(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + FillInPlusStrandInsertionSegmentC(dsp_orig, dsp_new, insert_start, insert_len, + insert_row, first_len, second_len, + orig_segment, this_segment); + } +} + +static Int4 +FindSegmentForInsertPoint +(DenseSegPtr dsp, + Int4 insert_start, + Int4 insert_row, + Uint1 
insert_strand) +{ + Int4 insert_segment = -1, k = 0; + + if (dsp == NULL || insert_start < 0 + || insert_row < 0 || insert_row >= dsp->dim) + { + return -1; + } + + if (insert_strand == Seq_strand_minus) + { + while (k < dsp->numseg && insert_segment == -1) + { + if (dsp->starts [k * dsp->dim + insert_row] != -1 + && dsp->starts [k * dsp->dim + insert_row] <= insert_start + && dsp->starts [k * dsp->dim + insert_row] + dsp->lens[k] > insert_start) + { + insert_segment = k; + } + k++; + } + } + else + { + while (k < dsp->numseg && insert_segment == -1) + { + if (dsp->starts [k * dsp->dim + insert_row] != -1 + && dsp->starts [dsp->dim * k + insert_row] <= insert_start + && dsp->starts [dsp->dim * k + insert_row] + dsp->lens [k] > insert_start) + { + insert_segment = k; + } + k++; + } + } + return insert_segment; +} + +static void +CopyDensegSegments +(DenseSegPtr dsp_orig, + DenseSegPtr dsp_new, + Int4 start_seg, + Int4 copy_seg, + Int4 num_to_copy) +{ + Int4 num_copied = 0, k; + + if (dsp_orig == NULL || dsp_new == NULL) + { + return; + } + + while (start_seg < dsp_orig->numseg && copy_seg < dsp_new->numseg + && num_copied < num_to_copy) + { + if (start_seg >= 0 && copy_seg >= 0) + { + for (k = 0; k < dsp_orig->dim && k < dsp_new->dim; k++) + { + dsp_new->starts [copy_seg * dsp_new->dim + k] + = dsp_orig->starts[start_seg * dsp_orig->dim + k]; + if (dsp_orig->strands != NULL && dsp_new->strands != NULL) + { + dsp_new->strands[copy_seg * dsp_new->dim + k] + = dsp_orig->strands[start_seg * dsp_orig->dim + k]; + } + } + dsp_new->lens [copy_seg] = dsp_orig->lens[start_seg]; + num_copied++; + } + start_seg ++; + copy_seg ++; + } +} + + +/************************************************** +*** +***************************************************/ +NLM_EXTERN SeqAlignPtr LIBCALL SeqAlignInsertByLoc (SeqLocPtr slp, SeqAlignPtr salp) +{ + SeqIdPtr sip; + DenseSegPtr dsp, dsp_new; + Int4 from, start; + Int2 j; + Int2 index; + Int4 insert_len; + Int4 extra_segs; + Uint1 
insert_strand; + Int4 insert_seg; + Int4 orig_segment; + + if (salp == NULL || salp->segtype != SAS_DENSEG) + return salp; + sip = SeqLocId(slp); + insert_len = SeqLocLen (slp); + dsp = (DenseSegPtr) salp->segs; + if (dsp == NULL) { + return salp; + } + + index = SeqIdOrderInBioseqIdList (sip, dsp->ids); + if (index == 0) { + /* bioseq not in alignment */ + return salp; + } + index -= 1; + insert_strand = SeqAlignStrand (salp, index); + + if (insert_strand == Seq_strand_minus) + { + from = SeqAlignStop (salp, index); + } + else + { + from = SeqAlignStart(salp, index); + } + start = SeqLocStart (slp); + if (start <= from) + { + /* just adjust the starts */ + for (j = 0; j < dsp->numseg; j++) + { + if (dsp->starts [dsp->dim * j + index] > -1) + { + dsp->starts [dsp->dim * j + index] += insert_len; + } + } + } + else + { + /* need to insert gap of length insert_len at start */ + /* first, find affected segment */ + insert_seg = FindSegmentForInsertPoint (dsp, start, index, insert_strand); + if (insert_seg < 0 || insert_seg > dsp->numseg) + { + return salp; + } + + if (dsp->starts[dsp->dim * insert_seg + index] == start) + { + extra_segs = 1; + } + else + { + extra_segs = 2; + } + + dsp_new = (DenseSegPtr) MemNew (sizeof (DenseSeg)); + dsp_new->dim = dsp->dim; + dsp_new->numseg = dsp->numseg + extra_segs; + dsp_new->starts = (Int4Ptr) MemNew (dsp->dim * (dsp->numseg + extra_segs) * sizeof (Int4)); + if (dsp->strands != NULL) + { + dsp_new->strands = (Uint1Ptr) MemNew (dsp->dim * (dsp->numseg + extra_segs) * sizeof (Uint1)); + } + dsp_new->lens = (Int4Ptr) MemNew ((dsp->numseg + extra_segs) * sizeof (Int4)); + + /* copy alignment up to point of insertion */ + CopyDensegSegments (dsp, dsp_new, 0, 0, insert_seg); + + /* adjust starts in insert_row before insert_seg if insert_row on minus strand */ + if (insert_strand == Seq_strand_minus) + { + for (j = 0; j < insert_seg; j++) + { + if (dsp_new->starts[dsp_new->dim * j + index] != -1) + { + dsp_new->starts[dsp_new->dim * j 
+ index] += insert_len; + } + } + } + + /* create gap */ + orig_segment = insert_seg; + InsertInSegment (dsp, dsp_new, start, insert_len, index, orig_segment, &insert_seg); + + /* Copy after insertion point */ + CopyDensegSegments (dsp, dsp_new, orig_segment + 1, insert_seg, dsp->numseg - orig_segment); + + /* Adjust starts in insert row after insert_seg if insert_row on plus strand */ + if (insert_strand == Seq_strand_plus) + { + while (insert_seg < dsp_new->numseg) + { + if (dsp_new->starts[dsp_new->dim * insert_seg + index] != -1) + { + dsp_new->starts[dsp_new->dim * insert_seg + index] += insert_len; + } + insert_seg++; + } + } + + /* replace in old DenseSeg */ + dsp->starts = MemFree (dsp->starts); + dsp->starts = dsp_new->starts; + dsp_new->starts = NULL; + dsp->strands = MemFree (dsp->strands); + dsp->strands = dsp_new->strands; + dsp_new->strands = NULL; + dsp->lens = MemFree (dsp->lens); + dsp->lens = dsp_new->lens; + dsp_new->lens = NULL; + dsp->numseg = dsp_new->numseg; + + } + + return salp; +} + + /******************************************* *** *** DeleteRegion diff --git a/api/salsap.h b/api/salsap.h index c48112f4..f06d8f0e 100644 --- a/api/salsap.h +++ b/api/salsap.h @@ -28,13 +28,18 @@ * * Version Creation Date: 1/27/96 * -* $Revision: 6.3 $ +* $Revision: 6.4 $ * * File Description: * * Modifications: * -------------------------------------------------------------------------- * $Log: salsap.h,v $ +* Revision 6.4 2006/01/10 22:27:06 bollin +* added function SeqAlignInsertByLoc, to be used when nucleotides are inserted +* into a sequence that is in an alignment. An extra segment, where all functions +* except the one with the insertion are gapped, is created. 
+* * Revision 6.3 1999/11/24 21:24:28 vakatov * Fixed for the C++ and/or MSVC DLL compilation * @@ -156,6 +161,7 @@ NLM_EXTERN SeqAlignPtr LIBCALL SeqAlignEndExtend (SeqAlignPtr sap, Int4 start1, Int4 stop1, Int4 stop2, Int4 x1, Int4 y1, Int4 x2, Int4 y2, Uint1 strand1, Uint1 strand2); +NLM_EXTERN SeqAlignPtr LIBCALL SeqAlignInsertByLoc (SeqLocPtr slp, SeqAlignPtr salp); /**Delete, Truncate**/ NLM_EXTERN SeqAlignPtr LIBCALL SeqAlignDeleteByLoc (SeqLocPtr slp, SeqAlignPtr salp); diff --git a/api/seqmgr.c b/api/seqmgr.c index fe80abac..ecefefa8 100644 --- a/api/seqmgr.c +++ b/api/seqmgr.c @@ -29,7 +29,7 @@ * * Version Creation Date: 9/94 * -* $Revision: 6.252 $ +* $Revision: 6.259 $ * * File Description: Manager for Bioseqs and BioseqSets * @@ -39,6 +39,27 @@ * ------- ---------- ----------------------------------------------------- * * $Log: seqmgr.c,v $ +* Revision 6.259 2006/02/17 19:05:05 kans +* special case coded_by only for CDS feature on isolated protein bioseq +* +* Revision 6.258 2006/02/17 18:46:20 kans +* get gene overlapping coded_by CDS on isolated protein bioseq within flatfile generator, not feature indexer +* +* Revision 6.257 2006/02/17 17:24:24 kans +* changes to index CDS feature (with ignore flag) on isolated protein bioseq, xref gene feature +* +* Revision 6.256 2006/02/16 22:00:55 kans +* always pass FALSE for circular to CheckForTransSplice for more stringency +* +* Revision 6.255 2006/02/16 21:09:20 kans +* SeqMgrGetBestOverlappingFeat takes new parameter from get best gene by overlap, uses LOCATION_SUBSET if gene candidate is bad_order or mixed_strand +* +* Revision 6.254 2006/02/16 20:24:32 kans +* added bad_order and mixed_strand fields to feature index - to be used for get best gene overlap function in cases of trans-splicing +* +* Revision 6.253 2006/01/20 20:12:21 kans +* in LockAllSegments, bail if BioseqLockById returns NULL +* * Revision 6.252 2005/09/21 19:20:45 kans * SeqMgrGetNextAnnotDesc sets context->index properly * @@ -5373,6 
+5394,8 @@ NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetDesiredFeature (Uint2 entityID, BioseqPtr context->partialL = item->partialL; context->partialR = item->partialR; context->farloc = item->farloc; + context->bad_order = item->bad_order; + context->mixed_strand = item->mixed_strand; context->strand = item->strand; if (curr != NULL) { context->seqfeattype = curr->data.choice; @@ -5665,6 +5688,7 @@ NLM_EXTERN AnnotDescPtr LIBCALL SeqMgrGetDesiredAnnotDesc ( *****************************************************************************/ typedef struct extraindex { + SeqEntryPtr topsep; BioseqPtr lastbsp; SeqAnnotPtr lastsap; BioseqSetPtr lastbssp; @@ -5764,6 +5788,7 @@ NLM_EXTERN void LIBCALL SeqMgrIndexAlignments (Uint2 entityID) /* count alignments */ + exind.topsep = NULL; exind.lastbsp = NULL; exind.lastsap = NULL; exind.lastbssp = NULL; @@ -6004,7 +6029,8 @@ static void ProcessFeatureProducts (SeqFeatPtr sfp, Uint2 itemID, GatherObjectPt static void RecordOneFeature (BioseqExtraPtr bspextra, ObjMgrDataPtr omdp, BioseqPtr bsp, ExtraIndexPtr exindx, SeqFeatPtr sfp, Int4 left, Int4 right, Uint4 itemID, Uint2 subtype, - Boolean farloc, Boolean ignore) + Boolean farloc, Boolean bad_order, Boolean mixed_strand, + Boolean ignore) { Char buf [129]; @@ -6084,6 +6110,8 @@ static void RecordOneFeature (BioseqExtraPtr bspextra, ObjMgrDataPtr omdp, item->dnaStop = -1; CheckSeqLocForPartial (sfp->location, &(item->partialL), &(item->partialR)); item->farloc = farloc; + item->bad_order = bad_order; + item->mixed_strand = mixed_strand; /* item->strand = SeqLocStrand (sfp->location); if (exindx->flip) { @@ -6159,6 +6187,105 @@ static void RecordOneFeature (BioseqExtraPtr bspextra, ObjMgrDataPtr omdp, } } + +static void CheckForTransSplice ( + SeqFeatPtr sfp, + BoolPtr bad_orderP, + BoolPtr mixed_strandP, + Boolean circular +) + +{ + Boolean mixed_strand = FALSE, ordered = TRUE; + SeqIdPtr id1, id2; + SeqLocPtr prev, tmp; + SeqIntPtr sip1, sip2, prevsip; + Uint1 strand1, strand2; + + 
if (sfp == NULL || sfp->location == NULL) return; + + tmp = NULL; + prev = NULL; + sip1 = NULL; + id1 = NULL; + prevsip = NULL; + strand1 = Seq_strand_other; + + while ((tmp = SeqLocFindNext (sfp->location, tmp)) != NULL) { + + /* just check seqloc_interval */ + + if (tmp->choice == SEQLOC_INT) { + sip1 = prevsip; + sip2 = (SeqIntPtr) (tmp->data.ptrvalue); + strand2 = sip2->strand; + id2 = sip2->id; + if ((sip1 != NULL) && (ordered) && (! circular)) { + if (SeqIdForSameBioseq (sip1->id, sip2->id)) { + if (strand2 == Seq_strand_minus) { + if (sip1->to < sip2->to) { + ordered = FALSE; + } + } else { + if (sip1->to > sip2->to) { + ordered = FALSE; + } + } + } + } + prevsip = sip2; + if ((strand1 != Seq_strand_other) && (strand2 != Seq_strand_other)) { + if (SeqIdForSameBioseq (id1, id2)) { + if (strand1 != strand2) { + if (strand1 == Seq_strand_plus && strand2 == Seq_strand_unknown) { + /* unmarked_strand = TRUE; */ + } else if (strand1 == Seq_strand_unknown && strand2 == Seq_strand_plus) { + /* unmarked_strand = TRUE; */ + } else { + mixed_strand = TRUE; + } + } + } + } + + strand1 = strand2; + id1 = id2; + } + } + + /* Publication intervals ordering does not matter */ + + if (sfp->idx.subtype == FEATDEF_PUB) { + ordered = TRUE; + } + + /* ignore ordering of heterogen bonds */ + + if (sfp->data.choice == SEQFEAT_HET) { + ordered = TRUE; + } + + /* misc_recomb intervals SHOULD be in reverse order */ + + if (sfp->idx.subtype == FEATDEF_misc_recomb) { + ordered = TRUE; + } + + /* primer_bind intervals MAY be in on opposite strands */ + + if (sfp->idx.subtype == FEATDEF_primer_bind) { + mixed_strand = FALSE; + ordered = TRUE; + } + + if (! 
ordered) { + *bad_orderP = TRUE; + } + if (mixed_strand) { + *mixed_strandP = TRUE; + } +} + typedef struct adpbspdata { AnnotDescPtr adp; BioseqPtr bsp; @@ -6171,6 +6298,7 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) { AdpBspPtr abp; AnnotDescPtr adp = NULL; + Boolean bad_order; BioseqPtr bsp = NULL; BioseqExtraPtr bspextra; BioseqSetPtr bssp = NULL; @@ -6181,6 +6309,7 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) ExtraIndexPtr exindx; Int4 left; CharPtr loclbl; + Boolean mixed_strand; ObjMgrDataPtr omdp; ProtRefPtr prp; Int4 right; @@ -6189,6 +6318,7 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) SeqFeatPtr sfp = NULL; SeqAlignPtr sal = NULL; SeqLocPtr slp; + Boolean special_case = FALSE; Int4 swap; SeqFeatPtr tmp; Boolean usingLocalBsp = FALSE; @@ -6376,8 +6506,18 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) } MemFree (ctmp); - if (bsp == NULL) return TRUE; - usingLocalBsp = TRUE; + if (bsp == NULL && sfp->product != NULL && + sfp->data.choice == SEQFEAT_CDREGION && + IS_Bioseq (exindx->topsep)) { + bsp = (BioseqPtr) exindx->topsep->data.ptrvalue; + if (bsp == NULL || (! 
ISA_aa (bsp->mol))) return TRUE; + special_case = TRUE; + bsp = FindAppropriateBioseq (sfp->product, exindx->lastbsp); + if (bsp == NULL) return TRUE; + } else { + if (bsp == NULL) return TRUE; + usingLocalBsp = TRUE; + } } /* assume subsequent features will be on this bioseq */ @@ -6403,7 +6543,11 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) /* slp = SeqLocMergeEx (bsp, sfp->location, NULL, TRUE, TRUE, FALSE, FALSE); */ - slp = sfp->location; + if (special_case) { + slp = sfp->product; + } else { + slp = sfp->location; + } left = GetOffsetInNearBioseq (slp, bsp, SEQLOC_LEFT_END); right = GetOffsetInNearBioseq (slp, bsp, SEQLOC_RIGHT_END); /* @@ -6517,8 +6661,13 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) right = swap; } + bad_order = FALSE; + mixed_strand = FALSE; + CheckForTransSplice (sfp, &bad_order, &mixed_strand, /* (Boolean) (bsp->topology == TOPOLOGY_CIRCULAR) */ FALSE); + RecordOneFeature (bspextra, omdp, bsp, exindx, sfp, left, - right, gop->itemID, gop->subtype, usingLocalBsp, FALSE); + right, gop->itemID, gop->subtype, usingLocalBsp, + bad_order, mixed_strand, special_case); /* record gene, publication, and biosource features twice if spanning the origin */ @@ -6529,7 +6678,8 @@ static Boolean RecordFeaturesInBioseqs (GatherObjectPtr gop) sfp->idx.subtype == FEATDEF_operon) { RecordOneFeature (bspextra, omdp, bsp, exindx, sfp, left + bsp->length, - right + bsp->length, gop->itemID, gop->subtype, usingLocalBsp, TRUE); + right + bsp->length, gop->itemID, gop->subtype, usingLocalBsp, + bad_order, mixed_strand, TRUE); } } @@ -8270,6 +8420,7 @@ NLM_EXTERN Uint2 LIBCALL SeqMgrIndexFeaturesExEx ( /* gather all segmented locations */ + exind.topsep = sep; exind.lastbsp = NULL; exind.lastsap = NULL; exind.lastbssp = NULL; @@ -8302,6 +8453,7 @@ NLM_EXTERN Uint2 LIBCALL SeqMgrIndexFeaturesExEx ( /* now gather to get descriptor itemID counts on each bioseq or bioseq set, and record features on the bioseq indicated by the 
feature location */ + exind.topsep = sep; exind.lastbsp = NULL; exind.lastsap = NULL; exind.lastbssp = NULL; @@ -8514,6 +8666,8 @@ static void SetContextForFeature (SeqFeatPtr sfp, SeqMgrFeatContext PNTR context context->partialL = best->partialL; context->partialR = best->partialR; context->farloc = best->farloc; + context->bad_order = best->bad_order; + context->mixed_strand = best->mixed_strand; context->strand = best->strand; if (bst != NULL) { context->seqfeattype = bst->data.choice; @@ -8826,6 +8980,8 @@ static void SeqMgrBestOverlapSetContext ( context->partialL = best->partialL; context->partialR = best->partialR; context->farloc = best->farloc; + context->bad_order = best->bad_order; + context->mixed_strand = best->mixed_strand; context->strand = best->strand; if (bst != NULL) { context->seqfeattype = bst->data.choice; @@ -8841,14 +8997,19 @@ static void SeqMgrBestOverlapSetContext ( } } -static SeqFeatPtr SeqMgrGetBestOverlappingFeat (SeqLocPtr slp, Uint2 subtype, - SMFeatItemPtr PNTR array, - Int4 num, Int4Ptr pos, - Int2 overlapType, - SeqMgrFeatContext PNTR context, - Int2Ptr count, - Pointer userdata, - SeqMgrFeatExploreProc userfunc) +static SeqFeatPtr SeqMgrGetBestOverlappingFeat ( + SeqLocPtr slp, + Uint2 subtype, + SMFeatItemPtr PNTR array, + Int4 num, + Int4Ptr pos, + Int2 overlapType, + SeqMgrFeatContext PNTR context, + Int2Ptr count, + Pointer userdata, + SeqMgrFeatExploreProc userfunc, + Boolean special +) { SMFeatItemPtr best = NULL; @@ -9028,7 +9189,11 @@ static SeqFeatPtr SeqMgrGetBestOverlappingFeat (SeqLocPtr slp, Uint2 subtype, /* requires feature to be contained within gene, etc. 
*/ - diff = TestForOverlap (feat, slp, left, right, overlapType, numivals, ivals); + if (special && (feat->bad_order || feat->mixed_strand)) { + diff = TestForOverlap (feat, slp, left, right, LOCATION_SUBSET, numivals, ivals); + } else { + diff = TestForOverlap (feat, slp, left, right, overlapType, numivals, ivals); + } if (diff >= 0) { if (StrandsMatch (feat->strand, strand)) { @@ -9063,7 +9228,11 @@ static SeqFeatPtr SeqMgrGetBestOverlappingFeat (SeqLocPtr slp, Uint2 subtype, feat = array [hier]; if (feat != NULL && ((! feat->ignore) || userfunc == NULL)) { - diff = TestForOverlap (feat, slp, left, right, overlapType, numivals, ivals); + if (special && (feat->bad_order || feat->mixed_strand)) { + diff = TestForOverlap (feat, slp, left, right, LOCATION_SUBSET, numivals, ivals); + } else { + diff = TestForOverlap (feat, slp, left, right, overlapType, numivals, ivals); + } if (diff >= 0) { if (StrandsMatch (feat->strand, strand)) { @@ -9134,43 +9303,43 @@ NLM_EXTERN Int4 TestFeatOverlap (SeqFeatPtr sfpA, SeqFeatPtr sfpB, Int2 overlapT NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingGene (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_GENE, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_GENE, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, TRUE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingmRNA (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_mRNA, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_mRNA, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetLocationSupersetmRNA (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_mRNA, NULL, 0, NULL, LOCATION_SUBSET, context, NULL, NULL, NULL); + return 
SeqMgrGetBestOverlappingFeat (slp, FEATDEF_mRNA, NULL, 0, NULL, LOCATION_SUBSET, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingCDS (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_CDS, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_CDS, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingPub (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_PUB, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_PUB, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingSource (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_BIOSRC, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_BIOSRC, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingOperon (SeqLocPtr slp, SeqMgrFeatContext PNTR context) { - return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_operon, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL); + return SeqMgrGetBestOverlappingFeat (slp, FEATDEF_operon, NULL, 0, NULL, CONTAINED_WITHIN, context, NULL, NULL, NULL, FALSE); } /***************************************************************************** @@ -9275,6 +9444,8 @@ static SeqFeatPtr LIBCALL SeqMgrGetFeatureByLabelEx (BioseqPtr bsp, CharPtr labe context->partialL = feat->partialL; context->partialR = feat->partialR; context->farloc = feat->farloc; + context->bad_order = feat->bad_order; + context->mixed_strand = feat->mixed_strand; context->strand = feat->strand; context->seqfeattype = seqfeattype; context->featdeftype = 
feat->subtype; @@ -9398,6 +9569,8 @@ NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetFeatureByFeatID ( context->partialL = feat->partialL; context->partialR = feat->partialR; context->farloc = feat->farloc; + context->bad_order = feat->bad_order; + context->mixed_strand = feat->mixed_strand; context->strand = feat->strand; context->seqfeattype = sfp->data.choice;; context->featdeftype = feat->subtype; @@ -9519,7 +9692,7 @@ NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetOverlappingFeature (SeqLocPtr slp, Uint2 { return SeqMgrGetBestOverlappingFeat (slp, subtype, (SMFeatItemPtr PNTR) featarray, - numfeats, position, overlapType, context, NULL, NULL, NULL); + numfeats, position, overlapType, context, NULL, NULL, NULL, FALSE); } NLM_EXTERN Int2 LIBCALL SeqMgrGetAllOverlappingFeatures (SeqLocPtr slp, Uint2 subtype, @@ -9535,7 +9708,7 @@ NLM_EXTERN Int2 LIBCALL SeqMgrGetAllOverlappingFeatures (SeqLocPtr slp, Uint2 su SeqMgrGetBestOverlappingFeat (slp, subtype, (SMFeatItemPtr PNTR) featarray, numfeats, NULL, overlapType, &context, &count, - userdata, userfunc); + userdata, userfunc, FALSE); return count; } @@ -9579,6 +9752,8 @@ NLM_EXTERN SeqFeatPtr LIBCALL SeqMgrGetFeatureInIndex (BioseqPtr bsp, VoidPtr fe context->partialL = item->partialL; context->partialR = item->partialR; context->farloc = item->farloc; + context->bad_order = item->bad_order; + context->mixed_strand = item->mixed_strand; context->strand = item->strand; if (curr != NULL) { context->seqfeattype = curr->data.choice; @@ -9781,6 +9956,8 @@ static SeqFeatPtr LIBCALL SeqMgrGetNextFeatureEx (BioseqPtr bsp, SeqFeatPtr curr context->partialL = item->partialL; context->partialR = item->partialR; context->farloc = item->farloc; + context->bad_order = item->bad_order; + context->mixed_strand = item->mixed_strand; context->strand = item->strand; context->seqfeattype = seqfeattype; context->featdeftype = item->subtype; @@ -10258,6 +10435,8 @@ static Int4 LIBCALL SeqMgrExploreFeaturesInt (BioseqPtr bsp, Pointer userdata, 
context.partialL = item->partialL; context.partialR = item->partialR; context.farloc = item->farloc; + context.bad_order = item->bad_order; + context.mixed_strand = item->mixed_strand; context.strand = item->strand; context.seqfeattype = seqfeattype; context.featdeftype = item->subtype; @@ -10421,6 +10600,8 @@ NLM_EXTERN Int2 LIBCALL SeqMgrVisitFeatures (Uint2 entityID, Pointer userdata, context.partialL = item->partialL; context.partialR = item->partialR; context.farloc = item->farloc; + context.bad_order = item->bad_order; + context.mixed_strand = item->mixed_strand; context.strand = item->strand; context.seqfeattype = seqfeattype; context.featdeftype = item->subtype; @@ -10812,6 +10993,7 @@ static void LockAllSegments (SeqLocPtr slp, ValNodePtr PNTR vnpp) } bsp = BioseqLockById (sip); + if (bsp == NULL) return; ValNodeAddPointer (vnpp, 0, (Pointer) bsp); /* now recurse if component is also far delta or seg */ diff --git a/api/seqmgr.h b/api/seqmgr.h index aa1b06d8..c80ab55e 100644 --- a/api/seqmgr.h +++ b/api/seqmgr.h @@ -29,7 +29,7 @@ * * Version Creation Date: 9/94 * -* $Revision: 6.58 $ +* $Revision: 6.59 $ * * File Description: Manager for Bioseqs and BioseqSets * @@ -40,6 +40,9 @@ * * * $Log: seqmgr.h,v $ +* Revision 6.59 2006/02/16 20:24:32 kans +* added bad_order and mixed_strand fields to feature index - to be used for get best gene overlap function in cases of trans-splicing +* * Revision 6.58 2005/08/18 21:02:34 kans * defined SMFidItemPtr structure and added featsByFeatID and numfids fields, in preparation for indexing by feature ID * @@ -884,24 +887,26 @@ NLM_EXTERN void FreeSeqIdGiCache (void); /* the following structures are not frequently used directly by applications */ typedef struct smfeatitem { - SeqFeatPtr sfp; /* freed when TL_CACHED, later will implement reassignment when reloaded */ - SeqAnnotPtr sap; /* SeqAnnot containing SeqFeat, same reap/reload criteria as above */ - BioseqPtr bsp; /* Bioseq on which this feature is indexed */ - 
CharPtr label; /* featdef content label */ - Int4 left; /* extreme left on bioseq (first copy spanning origin is < 1) */ - Int4 right; /* extreme right on bioseq (second copy spanning origin is > length) */ - Int4Ptr ivals; /* array of start/stop pairs */ - Int2 numivals; /* number of start/stop pairs in ivals array */ - Int4 dnaStop; /* last stop on protein mapped to DNA coordinate for flatfile */ - Boolean partialL; /* left end is partial */ - Boolean partialR; /* right end is partial */ - Boolean farloc; /* location has an accession not packaged in entity */ - Uint1 strand; /* strand (mapped to segmented bioseq if segmented) */ - Uint1 subtype; /* featdef subtype */ - Uint4 itemID; /* storing itemID so no need to gather again */ - Boolean ignore; /* ignore this second copy of a feature spanning the origin */ - Uint4 index; /* position index needed for SeqMgrGetDesiredFeature */ - Int4 overlap; /* for xxxByPos, index of leftmost candidate that overlaps this */ + SeqFeatPtr sfp; /* freed when TL_CACHED, later will implement reassignment when reloaded */ + SeqAnnotPtr sap; /* SeqAnnot containing SeqFeat, same reap/reload criteria as above */ + BioseqPtr bsp; /* Bioseq on which this feature is indexed */ + CharPtr label; /* featdef content label */ + Int4 left; /* extreme left on bioseq (first copy spanning origin is < 1) */ + Int4 right; /* extreme right on bioseq (second copy spanning origin is > length) */ + Int4Ptr ivals; /* array of start/stop pairs */ + Int2 numivals; /* number of start/stop pairs in ivals array */ + Int4 dnaStop; /* last stop on protein mapped to DNA coordinate for flatfile */ + Boolean partialL; /* left end is partial */ + Boolean partialR; /* right end is partial */ + Boolean farloc; /* location has an accession not packaged in entity */ + Boolean bad_order; /* location is out of order - possibly trans-spliced */ + Boolean mixed_strand; /* location has mixed strands - possibly trans-spliced */ + Uint1 strand; /* strand (mapped to segmented 
bioseq if segmented) */ + Uint1 subtype; /* featdef subtype */ + Uint4 itemID; /* storing itemID so no need to gather again */ + Boolean ignore; /* ignore this second copy of a feature spanning the origin */ + Uint4 index; /* position index needed for SeqMgrGetDesiredFeature */ + Int4 overlap; /* for xxxByPos, index of leftmost candidate that overlaps this */ } SMFeatItem, PNTR SMFeatItemPtr; typedef struct smfeatblock { diff --git a/api/seqport.c b/api/seqport.c index fcaf4bd6..89550a8a 100644 --- a/api/seqport.c +++ b/api/seqport.c @@ -29,7 +29,7 @@ * * Version Creation Date: 7/13/91 * -* $Revision: 6.144 $ +* $Revision: 6.147 $ * * File Description: Ports onto Bioseqs * @@ -39,6 +39,16 @@ * ------- ---------- ----------------------------------------------------- * * $Log: seqport.c,v $ +* Revision 6.147 2006/01/23 13:01:41 bollin +* when converting sequences from raw to delta, adjust any alignments that the +* sequence may be part of. +* +* Revision 6.146 2005/12/16 20:19:56 bollin +* only allow reverse for delta sequences when the delta sequence has no far locations +* +* Revision 6.145 2005/12/15 19:45:24 bollin +* added functions to reverse and complement delta sequences +* * Revision 6.144 2005/08/24 15:14:31 kans * modified MolWtForLoc to use StreamCache, added MolWtForBsp and MolWtForStr * @@ -597,6 +607,7 @@ static char *this_file = __FILE__; #include <subutil.h> #include <tofasta.h> /* for FastaSeqLineEx function */ #include <salutil.h> +#include <alignmgr2.h> /* for correcting alignments when converting to delta */ NLM_EXTERN Boolean LIBCALL SeqPortAdjustLength (SeqPortPtr spp); @@ -4989,45 +5000,19 @@ NLM_EXTERN Boolean LIBCALL BioseqRevComp (BioseqPtr bsp) return retval; } -/*-------------- BioseqComplement () ---------------------------*/ -/*********************************************************************** -* BioseqComplement: Takes the nucleic acid sequence from Bioseq -* Entry and gives the complement sequence in place -* Does not change 
features. -************************************************************************/ -NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp) +static Boolean ComplementSeqData (Uint1 seqtype, Int4 seqlen, ByteStorePtr bysp) { SeqCodeTablePtr sctp; - ByteStorePtr bysp; - long readbyte, bslen; - Int4 seqlen; - Uint1 seqtype, byte = 0, byte_to, newbyte = 0, residue; + long readbyte, bslen; + Uint1 byte = 0, byte_to, newbyte = 0, residue; Uint1 comp, bitctr, mask, lshift, rshift, bc; - if (bsp == NULL) - { - ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n"); - return FALSE; - } - - if (bsp->repr != Seq_repr_raw) - { - ErrPostEx(SEV_ERROR,0,0, "Error: not a raw sequence\n"); - return FALSE; - } - - if (bsp->seq_data == NULL) + if (bysp == NULL) { ErrPostEx(SEV_ERROR,0,0, "Error: no sequence data\n"); - return FALSE; + return FALSE; } - seqtype = bsp->seq_data_type; - if ( ISA_aa(bsp->mol)) { - ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa\n"); - return FALSE; - } - if ((sctp = SeqCodeTableFind (seqtype)) == NULL) { ErrPostEx(SEV_ERROR,0,0, "Can't open table\n"); @@ -5056,22 +5041,21 @@ NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp) lshift = 0; mask = 255; break; - case Seq_code_iupacaa: - case Seq_code_ncbi8aa: - case Seq_code_ncbieaa: - case Seq_code_ncbipaa: - case Seq_code_iupacaa3: - case Seq_code_ncbistdaa: /* ignore amino acid */ - ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa ; No ->mol flag on Bioseq\n"); - return FALSE; - case Seq_code_ncbipna: - ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to complement profile\n"); + case Seq_code_iupacaa: + case Seq_code_ncbi8aa: + case Seq_code_ncbieaa: + case Seq_code_ncbipaa: + case Seq_code_iupacaa3: + case Seq_code_ncbistdaa: /* ignore amino acid */ + ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa ; No ->mol flag on Bioseq\n"); + return FALSE; + case Seq_code_ncbipna: + ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to complement profile\n"); + return FALSE; 
default: return FALSE; } - seqlen = bsp->length; - bysp = bsp->seq_data; bslen = BSLen(bysp); bitctr = 0; readbyte = 0; @@ -5110,46 +5094,100 @@ together*/ } } return TRUE; + +} -} /* BioseqComplement */ - -/*-------------- BioseqReverse () ---------------------------*/ +static Boolean DeltaBioseqComplement (BioseqPtr bsp) +{ + DeltaSeqPtr dsp; + SeqLitPtr slip; + Boolean rval = FALSE; + + if (bsp == NULL || bsp->repr != Seq_repr_delta) + { + return rval; + } + + dsp = (DeltaSeqPtr) bsp->seq_ext; + while (dsp != NULL) + { + if (dsp->choice != 2) + { + ErrPostEx(SEV_ERROR,0,0, "Error: Can't complement delta sequences with far locs\n"); + return FALSE; + } + dsp = dsp->next; + } + rval = TRUE; + dsp = (DeltaSeqPtr) bsp->seq_ext; + while (dsp != NULL) + { + slip = (SeqLitPtr) dsp->data.ptrvalue; + /* complement data */ + if (slip->seq_data != NULL) + { + rval &= ComplementSeqData (slip->seq_data_type, slip->length, slip->seq_data); + } + dsp = dsp->next; + } + return rval; +} + + +/*-------------- BioseqComplement () ---------------------------*/ /*********************************************************************** -* BioseqReverse: Takes nucleic acid sequence from Bioseq Entry and -* reverses the whole sequence in place +* BioseqComplement: Takes the nucleic acid sequence from Bioseq +* Entry and gives the complement sequence in place * Does not change features. 
************************************************************************/ -NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp) +NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp) +{ + Boolean rval = FALSE; + + if (bsp == NULL) + { + ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n"); + rval = FALSE; + } + else if (ISA_aa(bsp->mol)) + { + ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa\n"); + rval = FALSE; + } + else if (bsp->repr == Seq_repr_delta) + { + rval = DeltaBioseqComplement (bsp); + } + else if (bsp->repr == Seq_repr_raw) + { + rval = ComplementSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data); + } + else + { + ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n"); + rval = FALSE; + } + return rval; + +} /* BioseqComplement */ + + +static Boolean LIBCALL ReverseSeqData (Uint1 seqtype, Int4 seqlen, ByteStorePtr bysp1) { - ByteStorePtr bysp1 = '\0'; ByteStorePtr bysp2 = '\0'; long readbyte, bslen = 0; - Int4 seqlen, count = 0; - Uint1 seqtype, byte = 0, byte2, byte_to = 0, byte_to2, newbyte = 0; + Int4 count = 0; + Uint1 byte = 0, byte2, byte_to = 0, byte_to2, newbyte = 0; Uint1 newbyte2, finalbyte, residue, residue2, bitctr, bc2 = 0; Uint1 bitctr2, mask, mask2, lshift, rshift, bc = 0, jagged; - if (bsp == NULL) - { - ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n"); - return FALSE; - } - - if (bsp->repr != Seq_repr_raw) - { - ErrPostEx(SEV_ERROR,0,0, "Error: not a raw sequence\n"); - return FALSE; - } - - if (bsp->seq_data == NULL) - { - ErrPostEx(SEV_ERROR,0,0, "Error: No sequence data\n"); - return FALSE; - } + if (bysp1 == NULL) + { + ErrPostEx(SEV_ERROR,0,0, "Error: No sequence data\n"); + return FALSE; + } - seqlen = bsp->length; - seqtype = bsp->seq_data_type; switch (seqtype){ case Seq_code_ncbi2na: /*bitshifts needed*/ mask = 192; @@ -5219,7 +5257,6 @@ NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp) default: /*ignores amino acid sequence*/ return FALSE; } - bysp1 = bsp->seq_data; bysp2 = 
BSDup(bysp1); bslen = BSLen (bysp1); bitctr = bitctr2 = 0; @@ -5314,6 +5351,84 @@ bytes*/ } BSFree(bysp2); return TRUE; +} /* ReverseSeqData */ + + +static Boolean DeltaBioseqReverse (BioseqPtr bsp) +{ + DeltaSeqPtr dsp, next_dsp, newchain = NULL; + SeqLitPtr slip; + Boolean rval = FALSE; + Boolean split = FALSE; + + if (bsp == NULL || bsp->repr != Seq_repr_delta) + { + return rval; + } + + dsp = (DeltaSeqPtr) bsp->seq_ext; + while (dsp != NULL) + { + if (dsp->choice != 2) + { + ErrPostEx(SEV_ERROR,0,0, "Error: Can't reverse delta sequences with far locs\n"); + return FALSE; + } + dsp = dsp->next; + } + + dsp = (DeltaSeqPtr) bsp->seq_ext; + rval = TRUE; + while (dsp != NULL) + { + slip = (SeqLitPtr) dsp->data.ptrvalue; + /* reverse data */ + if (slip->seq_data != NULL) + { + rval &= ReverseSeqData (slip->seq_data_type, slip->length, slip->seq_data); + } + + /* reverse the chain */ + next_dsp = dsp->next; + dsp->next = newchain; + newchain = dsp; + + dsp = next_dsp; + } + bsp->seq_ext = newchain; + return rval; +} + +/*-------------- BioseqReverse () ---------------------------*/ +/*********************************************************************** +* BioseqReverse: Takes nucleic acid sequence from Bioseq Entry and +* reverses the whole sequence in place +* Does not change features. 
+************************************************************************/ +NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp) +{ + Boolean rval; + + if (bsp == NULL) + { + ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n"); + rval = FALSE; + } + else if (bsp->repr == Seq_repr_delta) + { + rval = DeltaBioseqReverse (bsp); + } + else if (bsp->repr == Seq_repr_raw) + { + rval = ReverseSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data); + } + else + { + ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n"); + rval = FALSE; + } + + return rval; } /* BioseqReverse */ #define SPC_BUFF_CHUNK 1024 @@ -7817,12 +7932,17 @@ NLM_EXTERN CharPtr GetDNAbyAccessionDotVersion (CharPtr accession) } -static void FixGapLength (SeqIdPtr sip, Uint2 moltype, Int4 offset, Int4 diff) +static void FixGapLength (BioseqPtr bsp, Int4 offset, Int4 diff) { - CharPtr extra_ns; - SeqLocPtr slp; + CharPtr extra_ns; + SeqLocPtr slp; + ValNodePtr align_annot_list, vnp; + SeqAnnotPtr sanp; + + if (bsp == NULL || bsp->id == NULL || diff == 0) return; - if (sip == NULL || diff == 0) return; + align_annot_list = FindAlignSeqAnnotsForBioseq (bsp); + if (diff > 0) { extra_ns = (CharPtr)MemNew ((diff + 1) * sizeof (Char)); @@ -7830,13 +7950,33 @@ static void FixGapLength (SeqIdPtr sip, Uint2 moltype, Int4 offset, Int4 diff) { MemSet (extra_ns, 'N', diff); extra_ns [diff] = 0; - insertchar (extra_ns, offset, sip, moltype, FALSE); + insertchar (extra_ns, offset, bsp->id, bsp->mol, FALSE); } + slp = SeqLocIntNew (offset, offset + diff - 1, Seq_strand_plus, bsp->id); + for (vnp = align_annot_list; vnp != NULL; vnp = vnp->next) + { + sanp = vnp->data.ptrvalue; + if (sanp != NULL && sanp->type == 2) + { + sanp->data = SeqAlignInsertByLoc (slp, sanp->data); + } + } + SeqLocFree (slp); } else { - slp = SeqLocIntNew (offset, offset - diff - 1, Seq_strand_plus, sip); + slp = SeqLocIntNew (offset, offset - diff - 1, Seq_strand_plus, bsp->id); SeqDeleteByLoc (slp, TRUE, FALSE); + + for (vnp = 
align_annot_list; vnp != NULL; vnp = vnp->next) + { + sanp = vnp->data.ptrvalue; + if (sanp != NULL && sanp->type == 2) + { + sanp->data = SeqAlignDeleteByLoc (slp, sanp->data); + } + } + SeqLocFree (slp); } } @@ -7998,7 +8138,7 @@ NLM_EXTERN void ConvertNsToGaps ( slp->fuzz = ifp; if (slp->length != 100) { - FixGapLength (bsp->id, bsp->mol, len, 100 - slp->length); + FixGapLength (bsp, len, 100 - slp->length); slp->length = 100; } } diff --git a/api/sequtil.c b/api/sequtil.c index 5d459f8c..c7d8d53e 100644 --- a/api/sequtil.c +++ b/api/sequtil.c @@ -29,13 +29,36 @@ * * Version Creation Date: 4/1/91 * -* $Revision: 6.183 $ +* $Revision: 6.190 $ * * File Description: Sequence Utilities for objseq and objsset * * Modifications: * -------------------------------------------------------------------------- * $Log: sequtil.c,v $ +* Revision 6.190 2006/02/16 17:19:14 kans +* better handling of trans splicing in GetThePointForOffset, SeqLocStart (CB) +* +* Revision 6.189 2006/02/07 17:50:53 kans +* support for pgp instead of pat for pre-grant publication in SeqIdWrite and SeqIdParse +* +* Revision 6.188 2006/02/01 21:53:44 kans +* DZ and EA for ncbi patent in WHICH_db_accession +* +* Revision 6.187 2006/01/24 17:59:26 kans +* use DY for NCBI EST +* +* Revision 6.186 2006/01/05 14:11:56 bollin +* added SeqLocPrintUseBestID function, which prints out the sequence location +* but uses the "best" sequence ID instead of the one actually stored in the +* SeqLoc. 
+* +* Revision 6.185 2006/01/03 15:49:36 kans +* added DX as ncbi gss to WHICH_db_accession +* +* Revision 6.184 2005/12/09 19:43:43 kans +* added DW as NCBI EST +* * Revision 6.183 2005/09/20 21:11:34 kans * added DV as NCBI EST * @@ -3321,7 +3344,7 @@ NLM_EXTERN SeqIdPtr SeqIdSelect (SeqIdPtr sip, Uint1Ptr order, Int2 num) "emb", /* embl = emb|accession|locus */ "pir", /* pir = pir|accession|name */ "sp", /* swissprot = sp|accession|name */ - "pat", /* patent = pat|country|patent number (string)|seq number (integer) */ + "pat", /* patent = pat|country|patent number (string)|seq number (integer) - use pgp for pre-grant pub */ "ref", /* other = ref|accession|name|release - changed from oth to ref */ "gnl", /* general = gnl|database(string)|id (string or number) */ "gi", /* gi = gi|integer */ @@ -3465,8 +3488,10 @@ NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 b PDBSeqIdPtr psip; ObjectIdPtr oip; PatentSeqIdPtr patsip; + IdPatPtr ipp; Boolean got_gi = FALSE; Boolean got_tmsmart = FALSE; + Boolean is_us_pre_grant = FALSE; DbtagPtr dbt; Char chainbuf[3]; Char versionbuf[10]; @@ -3556,6 +3581,14 @@ NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 b if (dbt != NULL && StringICmp (dbt->db, "TMSMART") == 0) { got_tmsmart = TRUE; } + } else if (sip->choice == SEQID_PATENT) { + patsip = (PatentSeqIdPtr) sip->data.ptrvalue; + if (patsip != NULL) { + ipp = patsip->cit; + if (ipp != NULL && StringDoesHaveText (ipp->app_number)) { + is_us_pre_grant = TRUE; + } + } } } if (useGeneral) { @@ -3576,8 +3609,18 @@ NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 b } format = PRINTID_FASTA_SHORT; /* put on second (or only) SeqId in this format */ } - else + else { sip = isip; /* only one id processed */ + if (sip != NULL && sip->choice == SEQID_PATENT) { + patsip = (PatentSeqIdPtr) sip->data.ptrvalue; + if (patsip != NULL) { + ipp = patsip->cit; + if (ipp != NULL && StringDoesHaveText 
(ipp->app_number)) { + is_us_pre_grant = TRUE; + } + } + } + } /* deal with LOCUS and ACCESSION */ if ((format == PRINTID_TEXTID_ACCESSION) || (format == PRINTID_TEXTID_LOCUS) || @@ -3626,7 +3669,11 @@ NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 b if (format == PRINTID_FASTA_SHORT) { - Nlm_LabelCopyNext(&tmp, txtid[sip->choice], &buflen); + if (sip->choice == SEQID_PATENT && is_us_pre_grant) { + Nlm_LabelCopyNext(&tmp, "pgp", &buflen); + } else { + Nlm_LabelCopyNext(&tmp, txtid[sip->choice], &buflen); + } Nlm_LabelCopyNext(&tmp, ldelim, &buflen); } @@ -3694,7 +3741,11 @@ NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 b Nlm_LabelCopyNext(&tmp, patsip->cit->country, &buflen); if (format == PRINTID_FASTA_SHORT) Nlm_LabelCopyNext(&tmp, ldelim, &buflen); - Nlm_LabelCopyNext(&tmp, patsip->cit->number, &buflen); + if (is_us_pre_grant) { + Nlm_LabelCopyNext(&tmp, patsip->cit->app_number, &buflen); + } else { + Nlm_LabelCopyNext(&tmp, patsip->cit->number, &buflen); + } if (format == PRINTID_FASTA_SHORT) Nlm_LabelCopyNext(&tmp, ldelim, &buflen); else @@ -3884,7 +3935,7 @@ NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf) IdPatPtr ipp; PDBSeqIdPtr psip; GiimPtr gim; - Boolean done = FALSE; + Boolean done = FALSE, is_us_pre_grant = FALSE; static Uint1 expect_tokens[NUM_SEQID] = { /* number of tokens to expect */ 0, /* 0 = not set */ 1, /* 1 = local Object-id */ @@ -3941,6 +3992,12 @@ NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf) type = SEQID_OTHER; } + /* pgp is for pre-grant patent publications */ + if ((! type) && (! StringCmp(localbuf, "pgp"))) { + type = SEQID_PATENT; + is_us_pre_grant = TRUE; + } + if (! 
type) goto erret; /* copy and tokenize - token\0token\0\n */ @@ -4077,7 +4134,11 @@ NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf) ipp = IdPatNew(); patsip->cit = ipp; ipp->country = StringSave(tokens[0]); - ipp->number = StringSave(tokens[1]); + if (is_us_pre_grant) { + ipp->app_number = StringSave(tokens[1]); + } else { + ipp->number = StringSave(tokens[1]); + } sscanf(tokens[2], "%ld", &num); patsip->seqid = (Int2)num; break; @@ -4831,8 +4892,9 @@ NLM_EXTERN Int4 SeqLocStart (SeqLocPtr anp) /* seqloc */ { Int4 pos = -1L, tpos, numpnt; - SeqIdPtr sip; - SeqLocPtr slp; + SeqIdPtr sip; + SeqLocPtr slp; + SeqIntPtr sintp; if (anp == NULL) return pos; @@ -4869,7 +4931,8 @@ NLM_EXTERN Int4 SeqLocStart (SeqLocPtr anp) /* seqloc */ } break; case SEQLOC_INT: /* int */ - pos = ((SeqIntPtr)anp->data.ptrvalue)->from; + sintp = (SeqIntPtr) anp->data.ptrvalue; + pos = sintp->from; break; case SEQLOC_PNT: /* pnt */ pos = ((SeqPntPtr)anp->data.ptrvalue)->point; @@ -6558,6 +6621,61 @@ NLM_EXTERN Int4 CheckPointInBioseq (SeqPntPtr sp, BioseqPtr in) return retval; /* all failed */ } + +static SeqIdPtr GetEarlierSeqIdPtr (SeqIdPtr sip1, SeqIdPtr sip2) +{ + BioseqPtr bsp1, bsp2; + BioseqSetPtr bssp; + SeqEntryPtr sep; + + if (sip1 == NULL && sip2 != NULL) + { + return sip2; + } + else if (sip1 != NULL && sip2 == NULL) + { + return sip1; + } + else if (SeqIdComp(sip1, sip2) == SIC_YES) + { + return sip1; + } + + bsp1 = BioseqFind (sip1); + bsp2 = BioseqFind (sip2); + if (bsp1 == NULL && bsp2 == NULL) + { + return sip1; + } + else if (bsp1 == NULL) + { + return sip2; + } + else if (bsp2 == NULL) + { + return sip1; + } + + if (bsp1->idx.parenttype == OBJ_BIOSEQSET + && bsp2->idx.parenttype == OBJ_BIOSEQSET + && bsp1->idx.parentptr == bsp2->idx.parentptr) + { + bssp = (BioseqSetPtr) bsp1->idx.parentptr; + for (sep = bssp->seq_set; sep != NULL; sep = sep->next) + { + if (sep->data.ptrvalue == bsp1) + { + return sip1; + } + else if (sep->data.ptrvalue == bsp2) + { + return sip2; + } + } + } 
+ return sip1; +} + /***************************************************************************** * * Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end) @@ -6565,46 +6683,118 @@ NLM_EXTERN Int4 CheckPointInBioseq (SeqPntPtr sp, BioseqPtr in) *****************************************************************************/ Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end) { - SeqLocPtr tmp, pnt, first=NULL, last=NULL; - Uint1 ofstrand; - Boolean getstart; + SeqLocPtr pnt, first=NULL, last=NULL; + Uint1 first_strand, last_strand; + Boolean all_minus = TRUE; + Int4 lowest = -1, highest = 0, tmp; + SeqIdPtr low_sip = NULL, high_sip = NULL, first_sip = NULL, last_sip = NULL; + Boolean id_same; pnt = NULL; /* get first or last single span type in "of"*/ - tmp = NULL; while ((pnt = SeqLocFindNext(of, pnt)) != NULL) { + last_strand = SeqLocStrand (pnt); + last_sip = SeqLocId (pnt); + if (last_strand != Seq_strand_minus) + { + all_minus = FALSE; + } last = pnt; if (first == NULL) + { first = pnt; + first_strand = last_strand; + first_sip = last_sip; + lowest = SeqLocStart(pnt); + highest = SeqLocStop (pnt); + low_sip = last_sip; + high_sip = last_sip; + } + else + { + tmp = SeqLocStart (pnt); + if (SeqIdComp (last_sip, low_sip)) + { + id_same = TRUE; + } + else + { + id_same = FALSE; + } + if ((id_same && tmp < lowest) + || (!id_same && last_sip == GetEarlierSeqIdPtr (last_sip, low_sip))) + { + lowest = tmp; + low_sip = last_sip; + } + tmp = SeqLocStop (pnt); + + if (SeqIdComp (last_sip, high_sip)) + { + id_same = TRUE; + } + else + { + id_same = FALSE; + } + if ((id_same && tmp > highest) + || (!id_same && high_sip == GetEarlierSeqIdPtr (high_sip, last_sip))) + { + highest = tmp; + high_sip = last_sip; + } + } } /* otherwise, get last */ if (first == NULL) return FALSE; - ofstrand = SeqLocStrand(first); - getstart = TRUE; /* assume we are getting SeqLocStart() */ + switch (which_end) { case SEQLOC_LEFT_END: - if 
(ofstrand == Seq_strand_minus) - tmp = last; - else - tmp = first; + target->point = lowest; + target->id = low_sip; break; case SEQLOC_RIGHT_END: - if (ofstrand == Seq_strand_minus) - tmp = first; - else - tmp = last; - getstart = FALSE; + target->point = highest; + target->id = high_sip; break; case SEQLOC_START: - tmp = first; - if (ofstrand == Seq_strand_minus) - getstart = FALSE; + if (all_minus) + { + target->point = SeqLocStop (last); + target->id = last_sip; + } + else + { + if (first_strand == Seq_strand_minus) + { + target->point = SeqLocStop (first); + } + else + { + target->point = SeqLocStart (first); + } + target->id = first_sip; + } break; case SEQLOC_STOP: - tmp = last; - if (ofstrand != Seq_strand_minus) - getstart = FALSE; + if (all_minus) + { + target->point = SeqLocStart (first); + target->id = first_sip; + } + else + { + if (last_strand == Seq_strand_minus) + { + target->point = SeqLocStart (last); + } + else + { + target->point = SeqLocStop (last); + } + target->id = last_sip; + } break; default: return FALSE; /* error */ @@ -6612,12 +6802,6 @@ Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end) /* SeqLocStart returns 'from', and SeqLocStop returns 'to', regardless of strand! 
*/ - if (getstart) - target->point = SeqLocStart(tmp); - else - target->point = SeqLocStop(tmp); - target->id = SeqLocId(tmp); - if ((target->point < 0) || (target->id == NULL)) return FALSE; @@ -6792,15 +6976,11 @@ NLM_EXTERN Int2 SeqLocMol (SeqLocPtr seqloc) return the_mol; } -static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, SeqIdPtr lastid); +static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, SeqIdPtr lastid, Boolean use_best_id); static void BSstring(ByteStorePtr bsp, CharPtr str); -/***************************************************************************** -* -* SeqLocPrint(slp) -* -*****************************************************************************/ -NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp) + +static CharPtr SeqLocPrintEx (SeqLocPtr slp, Boolean use_best_id) { ByteStorePtr bsp; CharPtr str; @@ -6813,19 +6993,35 @@ NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp) tmp = slp->next; /* save possible chain */ slp->next = NULL; /* take out of possible chain */ - SeqLocPrintProc(slp, bsp, TRUE, NULL); + SeqLocPrintProc(slp, bsp, TRUE, NULL, use_best_id); slp->next = tmp; /* replace possible chain */ str = (CharPtr)BSMerge(bsp, NULL); BSFree(bsp); - return str; + return str; +} + +/***************************************************************************** +* +* SeqLocPrint(slp) +* +*****************************************************************************/ +NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp) +{ + return SeqLocPrintEx (slp, FALSE); +} + +NLM_EXTERN CharPtr SeqLocPrintUseBestID(SeqLocPtr slp) +{ + return SeqLocPrintEx (slp, TRUE); } NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen); NLM_EXTERN SeqIdPtr SeqPointPrint(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid); NLM_EXTERN void IntFuzzPrint(IntFuzzPtr ifp, Int4 pos, CharPtr buf, Boolean right); static char strandsymbol[5] = { '\0', '\0', 'c', 'b', 'r' }; +static SeqIdPtr SeqPointWriteEx 
(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen, Boolean use_best_id); /***************************************************************************** @@ -6835,7 +7031,13 @@ static char strandsymbol[5] = { '\0', '\0', 'c', 'b', 'r' }; * goes down slp chain * *****************************************************************************/ -static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, SeqIdPtr lastid) +static SeqIdPtr +SeqLocPrintProc +(SeqLocPtr slp, + ByteStorePtr bsp, + Boolean first, + SeqIdPtr lastid, + Boolean use_best_id) { Char buf[41]; SeqBondPtr sbp; @@ -6844,6 +7046,8 @@ static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, IntFuzzPtr ifp1, ifp2; Int4 from, to; Int2 delim, delim2; + BioseqPtr seq; + SeqIdPtr thisid; while (slp != NULL) { @@ -6861,7 +7065,7 @@ static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, sbp = (SeqBondPtr)(slp->data.ptrvalue); if (sbp->a != NULL) { - lastid = SeqPointWrite(sbp->a, buf, lastid, 40); + lastid = SeqPointWriteEx(sbp->a, buf, lastid, 40, use_best_id); BSstring(bsp, buf); } else @@ -6871,7 +7075,7 @@ static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, if (sbp->b != NULL) { - lastid = SeqPointWrite(sbp->b, buf, lastid, 40); + lastid = SeqPointWriteEx(sbp->b, buf, lastid, 40, use_best_id); BSstring(bsp, buf); } else @@ -6906,18 +7110,27 @@ static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, delim2 = ']'; } BSPutByte(bsp, delim); - lastid = SeqLocPrintProc((SeqLocPtr)(slp->data.ptrvalue), bsp, TRUE, lastid); + lastid = SeqLocPrintProc((SeqLocPtr)(slp->data.ptrvalue), bsp, TRUE, lastid, use_best_id); BSPutByte(bsp, delim2); break; case SEQLOC_INT: /* int */ sip = (SeqIntPtr)(slp->data.ptrvalue); + thisid = sip->id; + if (use_best_id) + { + seq = BioseqFind (thisid); + if (seq != NULL) + { + thisid = SeqIdFindBest (seq->id, SEQID_GENBANK); + } + } if (! 
SeqIdMatch(sip->id, lastid)) { - SeqIdWrite(sip->id, buf, PRINTID_FASTA_SHORT, 40); + SeqIdWrite(thisid, buf, PRINTID_FASTA_SHORT, 40); BSstring(bsp, buf); BSPutByte(bsp, ':'); } - lastid = sip->id; + lastid = thisid; if (strandsymbol[sip->strand]) BSPutByte(bsp, (Int2)strandsymbol[sip->strand]); if ((sip->strand == Seq_strand_minus) || @@ -6944,8 +7157,8 @@ static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, break; case SEQLOC_PNT: /* pnt */ - lastid = SeqPointWrite((SeqPntPtr)(slp->data.ptrvalue), - buf, lastid, 40); + lastid = SeqPointWriteEx((SeqPntPtr)(slp->data.ptrvalue), + buf, lastid, 40, use_best_id); BSstring(bsp, buf); break; case SEQLOC_PACKED_PNT: /* packed pnt */ @@ -7004,22 +7217,37 @@ NLM_EXTERN SeqIdPtr SeqPointPrint(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid) return spp->id; } -/***************************************************************************** -* -* SeqPointWrite(spp, buf, lastid, buflen) -* -*****************************************************************************/ -NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen) +static SeqIdPtr +SeqPointWriteEx +(SeqPntPtr spp, + CharPtr buf, + SeqIdPtr lastid, + Int2 buflen, + Boolean use_best_id) { - CharPtr tmp; + CharPtr tmp; + SeqIdPtr best_id, tmp_next; + BioseqPtr bsp; if ((spp == NULL) || (buf == NULL)) return NULL; tmp = buf; *tmp = '\0'; - if (! SeqIdMatch(spp->id, lastid)) + best_id = spp->id; + if (use_best_id) + { + bsp = BioseqFind (spp->id); + if (bsp != NULL) + { + best_id = SeqIdFindBest (bsp->id, SEQID_GENBANK); + } + } + tmp_next = best_id->next; + best_id->next = NULL; + + if (! 
SeqIdMatch(best_id, lastid)) { - SeqIdWrite(spp->id, tmp, PRINTID_FASTA_SHORT, buflen); + SeqIdWrite(best_id, tmp, PRINTID_FASTA_SHORT, buflen); while (*tmp != '\0') tmp++; *tmp = ':'; tmp++; *tmp = '\0'; @@ -7031,7 +7259,19 @@ NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, I } IntFuzzPrint(spp->fuzz, spp->point, tmp, TRUE); - return spp->id; + best_id->next = tmp_next; + + return best_id; +} + +/***************************************************************************** +* +* SeqPointWrite(spp, buf, lastid, buflen) +* +*****************************************************************************/ +NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen) +{ + return SeqPointWriteEx (spp, buf, lastid, buflen, FALSE); } /***************************************************************************** @@ -9083,7 +9323,9 @@ NLM_EXTERN Uint4 LIBCALL WHICH_db_accession (CharPtr s) (StringICmp(temp,"DN") == 0) || (StringICmp(temp,"DR") == 0) || (StringICmp(temp,"DT") == 0) || - (StringICmp(temp,"DV") == 0) ) { /* NCBI EST */ + (StringICmp(temp,"DV") == 0) || + (StringICmp(temp,"DW") == 0) || + (StringICmp(temp,"DY") == 0) ) { /* NCBI EST */ retcode = ACCN_NCBI_EST; } else if ((StringICmp(temp,"BV") == 0)) { /* NCBI STS */ retcode = ACCN_NCBI_STS; @@ -9118,9 +9360,12 @@ NLM_EXTERN Uint4 LIBCALL WHICH_db_accession (CharPtr s) (StringICmp(temp,"CL") == 0) || (StringICmp(temp,"CW") == 0) || (StringICmp(temp,"CZ") == 0) || - (StringICmp(temp,"DU") == 0) ) { /* NCBI GSS */ + (StringICmp(temp,"DU") == 0) || + (StringICmp(temp,"DX") == 0) ) { /* NCBI GSS */ retcode = ACCN_NCBI_GSS; - } else if ((StringICmp(temp,"AR") == 0)) { /* NCBI patent */ + } else if ((StringICmp(temp,"AR") == 0) || + (StringICmp(temp,"DZ") == 0) || + (StringICmp(temp,"EA") == 0)) { /* NCBI patent */ retcode = ACCN_NCBI_PATENT; } else if((StringICmp(temp,"BC")==0)) { /* NCBI long cDNA project : MGC */ retcode = ACCN_NCBI_cDNA; diff --git 
a/api/sequtil.h b/api/sequtil.h index c643448d..7e0213a5 100644 --- a/api/sequtil.h +++ b/api/sequtil.h @@ -29,13 +29,18 @@ * * Version Creation Date: 4/1/91 * -* $Revision: 6.48 $ +* $Revision: 6.49 $ * * File Description: Sequence Utilities for objseq and objsset * * Modifications: * -------------------------------------------------------------------------- * $Log: sequtil.h,v $ +* Revision 6.49 2006/01/05 14:11:56 bollin +* added SeqLocPrintUseBestID function, which prints out the sequence location +* but uses the "best" sequence ID instead of the one actually stored in the +* SeqLoc. +* * Revision 6.48 2005/08/03 18:28:36 kans * ValidateAccnDotVer returns -5 for missing version and -6 for bad version (not just digits) * @@ -844,7 +849,7 @@ NLM_EXTERN Int2 SeqLocOrder(SeqLocPtr a, SeqLocPtr b, BioseqPtr in); NLM_EXTERN Int2 SeqLocMol(SeqLocPtr seqloc); NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp); - +NLM_EXTERN CharPtr SeqLocPrintUseBestID(SeqLocPtr slp); /***************************************************************************** * diff --git a/api/sqnutil1.c b/api/sqnutil1.c index 9fc1b88e..e44797e5 100644 --- a/api/sqnutil1.c +++ b/api/sqnutil1.c @@ -29,7 +29,7 @@ * * Version Creation Date: 9/2/97 * -* $Revision: 6.361 $ +* $Revision: 6.369 $ * * File Description: * @@ -331,6 +331,98 @@ NLM_EXTERN Int2 SeqEntryToBioSource (SeqEntryPtr sep, BoolPtr mito, CharPtr taxn return SeqEntryOrEntityIDToGeneticCode (sep, 0, mito, taxname, maxsize, biopp); } +NLM_EXTERN Boolean BioseqToGeneticCode ( + BioseqPtr bsp, + Int2Ptr gencodep, + BoolPtr mitop, + BoolPtr plastidp, + CharPtr taxnamep, + size_t maxsize, + BioSourcePtr PNTR biopp +) + +{ + BioSourcePtr biop = NULL; + SeqMgrDescContext dcontext; + SeqMgrFeatContext fcontext; + Int2 gencode = 0; + Boolean mito = FALSE; + Int2 mitoCode = 0; + Int2 nuclCode = 0; + OrgNamePtr onp; + OrgRefPtr orp; + Boolean plastid = FALSE; + SeqDescrPtr sdp; + SeqFeatPtr sfp; + CharPtr taxname = NULL; + + if (bsp == NULL) return 
FALSE; + + sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext); + if (sdp != NULL) { + biop = (BioSourcePtr) sdp->data.ptrvalue; + } + + if (biop == NULL) { + sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext); + if (sfp != NULL) { + biop = (BioSourcePtr) sfp->data.value.ptrvalue; + } + } + + if (biop == NULL) return FALSE; + orp = biop->org; + if (orp == NULL) return FALSE; + + taxname = orp->taxname; + if (StringHasNoText (taxname)) return FALSE; + + onp = orp->orgname; + if (onp != NULL) { + nuclCode = onp->gcode; + mitoCode = onp->mgcode; + } + + mito = (Boolean) (biop->genome == GENOME_kinetoplast || + biop->genome == GENOME_mitochondrion || + biop->genome == GENOME_hydrogenosome); + + plastid = (Boolean) (biop->genome == GENOME_chloroplast || + biop->genome == GENOME_chromoplast || + biop->genome == GENOME_plastid || + biop->genome == GENOME_cyanelle || + biop->genome == GENOME_apicoplast || + biop->genome == GENOME_leucoplast || + biop->genome == GENOME_proplastid); + + if (plastid) { + gencode = 11; + } else if (mito) { + gencode = mitoCode; + } else { + gencode = nuclCode; + } + + if (gencodep != NULL) { + *gencodep = gencode; + } + if (mitop != NULL) { + *mitop = mito; + } + if (plastidp != NULL) { + *plastidp = plastid; + } + if (taxnamep != NULL && maxsize > 0) { + StringNCpy_0 (taxnamep, taxname, maxsize); + } + if (biopp != NULL) { + *biopp = biop; + } + + return TRUE; +} + + static Boolean FindBspItem (GatherContextPtr gcp) { @@ -2711,7 +2803,14 @@ extern Boolean ParseAnticodon (SeqFeatPtr sfp, CharPtr val, Int4 offset) if (StringHasNoText (val)) return FALSE; rrp = (RnaRefPtr) sfp->data.value.ptrvalue; - if (rrp == NULL || rrp->ext.choice != 2) return FALSE; + if (rrp == NULL) return FALSE; + + if (rrp->ext.choice == 0 && rrp->ext.value.ptrvalue == NULL) { + rrp->ext.choice = 2; + trp = (tRNAPtr) MemNew (sizeof (tRNA)); + rrp->ext.value.ptrvalue = (Pointer) trp; + } + if (rrp->ext.choice != 2) return FALSE; trp = 
(tRNAPtr) rrp->ext.value.ptrvalue; if (trp == NULL) return FALSE; @@ -4002,6 +4101,7 @@ static void CleanupFeatureGBQuals (SeqFeatPtr sfp, Boolean isEmblOrDdbj) if (StringICmp (gbq->qual, "partial") == 0) { sfp->partial = TRUE; } else if (StringICmp (gbq->qual, "evidence") == 0) { + /* if (StringICmp (gbq->val, "experimental") == 0) { if (sfp->exp_ev != 2) { sfp->exp_ev = 1; @@ -4009,6 +4109,7 @@ static void CleanupFeatureGBQuals (SeqFeatPtr sfp, Boolean isEmblOrDdbj) } else if (StringICmp (gbq->val, "not_experimental") == 0) { sfp->exp_ev = 2; } + */ } else if (StringICmp (gbq->qual, "exception") == 0) { sfp->excpt = TRUE; if (! HasNoText (gbq->val)) { @@ -4524,15 +4625,83 @@ static SubSourcePtr SortSubSourceList (SubSourcePtr list) return list; } +static CharPtr TrimParenthesesAndCommasAroundString (CharPtr str) + +{ + Uchar ch; /* to use 8bit characters in multibyte languages */ + CharPtr dst; + CharPtr ptr; + + if (str != NULL && str [0] != '\0') { + dst = str; + ptr = str; + ch = *ptr; + while (ch != '\0' && (ch < ' ' || ch == '(' || ch == ',')) { + ptr++; + ch = *ptr; + } + while (ch != '\0') { + *dst = ch; + dst++; + ptr++; + ch = *ptr; + } + *dst = '\0'; + dst = NULL; + ptr = str; + ch = *ptr; + while (ch != '\0') { + if (ch != ')' && ch != ',') { + dst = NULL; + } else if (dst == NULL) { + dst = ptr; + } + ptr++; + ch = *ptr; + } + if (dst != NULL) { + *dst = '\0'; + } + } + return str; +} + +static CharPtr CombineSplitQual (CharPtr origval, CharPtr newval) + +{ + size_t len; + CharPtr str = NULL; + + if (StringStr (origval, newval) != NULL) return origval; + len = StringLen (origval) + StringLen (newval) + 5; + str = MemNew (sizeof (Char) * len); + if (str == NULL) return origval; + TrimParenthesesAndCommasAroundString (origval); + TrimParenthesesAndCommasAroundString (newval); + StringCpy (str, "("); + StringCat (str, origval); + StringCat (str, ","); + StringCat (str, newval); + StringCat (str, ")"); + /* free original string, knowing return value will 
replace it */ + MemFree (origval); + return str; +} + static void CleanSubSourceList (SubSourcePtr PNTR sspp) { Char ch; + CharPtr dst; + SubSourcePtr fwd_seq = NULL, rev_seq = NULL; SubSourcePtr last = NULL; + size_t len; SubSourcePtr next; SubSourcePtr PNTR prev; CharPtr ptr; + CharPtr src; SubSourcePtr ssp; + CharPtr str; Boolean unlink; if (sspp == NULL) return; @@ -4550,6 +4719,7 @@ static void CleanSubSourceList (SubSourcePtr PNTR sspp) if (ssp->subtype == SUBSRC_fwd_primer_seq || ssp->subtype == SUBSRC_rev_primer_seq) { if (ssp->name != NULL) { + /* upper case sequence */ ptr = ssp->name; ch = *ptr; while (ch != '\0') { @@ -4559,6 +4729,35 @@ static void CleanSubSourceList (SubSourcePtr PNTR sspp) ptr++; ch = *ptr; } + /* remove any spaces in sequence */ + src = ssp->name; + dst = ssp->name; + ch = *src; + while (ch != '\0') { + if (ch != ' ') { + *dst = ch; + dst++; + } + src++; + ch = *src; + } + *dst = '\0'; + } + } + if (ssp->subtype == SUBSRC_fwd_primer_seq) { + if (fwd_seq == NULL) { + fwd_seq = ssp; + } else { + fwd_seq->name = CombineSplitQual (fwd_seq->name, ssp->name); + unlink = TRUE; + } + } + if (ssp->subtype == SUBSRC_rev_primer_seq) { + if (rev_seq == NULL) { + rev_seq = ssp; + } else { + rev_seq->name = CombineSplitQual (rev_seq->name, ssp->name); + unlink = TRUE; } } CleanVisString (&(ssp->attrib)); @@ -4574,9 +4773,9 @@ static void CleanSubSourceList (SubSourcePtr PNTR sspp) ssp->subtype == SUBSRC_rearranged || ssp->subtype == SUBSRC_transgenic || ssp->subtype == SUBSRC_environmental_sample || - StringICmp (last->name, ssp->name) == 0) || + StringICmp (last->name, ssp->name) == 0 || (last->subtype == SUBSRC_other && - StringStr (last->name, ssp->name) != NULL)) { + StringStr (last->name, ssp->name) != NULL))) { unlink = TRUE; } else if (last->subtype == ssp->subtype && last->subtype == SUBSRC_other && @@ -4599,6 +4798,40 @@ static void CleanSubSourceList (SubSourcePtr PNTR sspp) } ssp = next; } + if (fwd_seq != NULL) { + if (StringChr 
(fwd_seq->name, ',') != NULL) { + ptr = fwd_seq->name; + len = StringLen (ptr); + if (ptr [0] != '(' || ptr [len - 1] != ')') { + TrimParenthesesAndCommasAroundString (fwd_seq->name); + str = MemNew (sizeof (Char) * (len + 4)); + if (str != NULL) { + StringCpy (str, "("); + StringCat (str, fwd_seq->name); + StringCat (str, ")"); + fwd_seq->name = MemFree (fwd_seq->name); + fwd_seq->name = str; + } + } + } + } + if (rev_seq != NULL) { + if (StringChr (rev_seq->name, ',') != NULL) { + ptr = rev_seq->name; + len = StringLen (ptr); + if (ptr [0] != '(' || ptr [len - 1] != ')') { + TrimParenthesesAndCommasAroundString (rev_seq->name); + str = MemNew (sizeof (Char) * (len + 4)); + if (str != NULL) { + StringCpy (str, "("); + StringCat (str, rev_seq->name); + StringCat (str, ")"); + rev_seq->name = MemFree (rev_seq->name); + rev_seq->name = str; + } + } + } + } } /* if string starts with given prefix, return pointer to remaining text */ @@ -6277,7 +6510,8 @@ static void CleanUpExceptText (SeqFeatPtr sfp) if (StringStr (sfp->except_text, "ribosome slippage") == NULL && StringStr (sfp->except_text, "trans splicing") == NULL && StringStr (sfp->except_text, "alternate processing") == NULL && - StringStr (sfp->except_text, "non-consensus splice site") == NULL) return; + StringStr (sfp->except_text, "non-consensus splice site") == NULL && + StringStr (sfp->except_text, "adjusted for low quality genome") == NULL) return; head = NULL; str = sfp->except_text; @@ -6307,6 +6541,9 @@ static void CleanUpExceptText (SeqFeatPtr sfp) } else if (StringCmp (tmp, "non-consensus splice site") == 0) { vnp->data.ptrvalue = MemFree (tmp); vnp->data.ptrvalue = StringSave ("nonconsensus splice site"); + } else if (StringCmp (tmp, "adjusted for low quality genome") == 0) { + vnp->data.ptrvalue = MemFree (tmp); + vnp->data.ptrvalue = StringSave ("adjusted for low-quality genome"); } } @@ -6634,11 +6871,16 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP } } } +/* 
+ * This section has been commented out based on a request by DeAnne Cravaritis. + * If left in, this causes unexpected results when RNA comments are copied to + * the product name or vice versa. if (rrp->ext.choice == 1 && rrp->ext.value.ptrvalue != NULL) { if (StringICmp ((CharPtr) rrp->ext.value.ptrvalue, sfp->comment) == 0) { sfp->comment = MemFree (sfp->comment); } } +*/ if (rrp->type == 4) { name = (CharPtr) rrp->ext.value.ptrvalue; len = StringLen (name); @@ -6823,13 +7065,13 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP } if (rrp->type == 255 && rrp->ext.choice == 1) { name = (CharPtr) rrp->ext.value.ptrvalue; - if (StringICmp (name, "its1") == 0) { + if (StringICmp (name, "its1") == 0 || StringICmp (name, "its 1") == 0) { rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); rrp->ext.value.ptrvalue = StringSave ("internal transcribed spacer 1"); - } else if (StringICmp (name, "its2") == 0) { + } else if (StringICmp (name, "its2") == 0 || StringICmp (name, "its 2") == 0) { rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); rrp->ext.value.ptrvalue = StringSave ("internal transcribed spacer 2"); - } else if (StringICmp (name, "its3") == 0) { + } else if (StringICmp (name, "its3") == 0 || StringICmp (name, "its 3") == 0) { rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); rrp->ext.value.ptrvalue = StringSave ("internal transcribed spacer 3"); } @@ -6883,8 +7125,8 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP CleanVisStringList (&(orp->mod)); OrpModToSubSource (&(orp->mod), &(biop->subtype)); } - biop->subtype = SortSubSourceList (biop->subtype); CleanSubSourceList (&(biop->subtype)); + biop->subtype = SortSubSourceList (biop->subtype); break; default : break; @@ -7029,8 +7271,8 @@ static void CleanupDescriptorStrings (ValNodePtr sdp, Boolean stripSerial, ValNo CleanVisStringList (&(orp->mod)); OrpModToSubSource (&(orp->mod), &(biop->subtype)); } - 
biop->subtype = SortSubSourceList (biop->subtype); CleanSubSourceList (&(biop->subtype)); + biop->subtype = SortSubSourceList (biop->subtype); break; case Seq_descr_molinfo : break; diff --git a/api/sqnutil2.c b/api/sqnutil2.c index 784d5c39..f5d24d2b 100644 --- a/api/sqnutil2.c +++ b/api/sqnutil2.c @@ -29,7 +29,7 @@ * * Version Creation Date: 9/2/97 * -* $Revision: 6.240 $ +* $Revision: 6.244 $ * * File Description: * @@ -1852,6 +1852,10 @@ static CharPtr molinfo_tech_list [] = { "fli cDNA", "htgs 0", "htc", "wgs", "barcode", "composite-wgs-htgs", NULL }; +static CharPtr molinfo_completeness_list [] = { + "unknown", "complete", "partial", "no-left", "no-right", "no-ends", "has-left", "has-right", NULL +}; + NLM_EXTERN void ReadTechFromString (CharPtr str, MolInfoPtr mip) { Int4 i; @@ -1868,6 +1872,22 @@ NLM_EXTERN void ReadTechFromString (CharPtr str, MolInfoPtr mip) } } +NLM_EXTERN void ReadCompletenessFromString (CharPtr str, MolInfoPtr mip) +{ + Int4 i; + + if (mip == NULL || str == NULL) + { + return; + } + + for (i = 0; molinfo_completeness_list [i] != NULL; i++) { + if (StringsAreEquivalent (str, molinfo_completeness_list [i])) { + mip->completeness = (Uint1) i; + } + } +} + NLM_EXTERN MolInfoPtr ParseTitleIntoMolInfo ( SqnTagPtr stp, MolInfoPtr mip @@ -1903,11 +1923,7 @@ NLM_EXTERN MolInfoPtr ParseTitleIntoMolInfo ( ReadTechFromString (str, mip); str = SqnTagFind (stp, "completeness"); - if (str != NULL) { - if (StringICmp (str, "complete") == 0) { - mip->completeness = 1; - } - } + ReadCompletenessFromString (str, mip); return mip; } @@ -4002,6 +4018,8 @@ static CharPtr aaList [] = { "Z", "Glx", "Glu or Gln", "U", "Sec", "Selenocysteine", "*", "Ter", "Termination", + "O", "Pyl", "Pyrrolysine", + "J", "Xle", "Leu or Ile", NULL, NULL, NULL }; @@ -4966,6 +4984,9 @@ static void AddQualifierToFeatureEx (SeqFeatPtr sfp, CharPtr qual, CharPtr val, isLocusTag = TRUE; } } + if (qnum == GBQUAL_evidence) { + qnum = -1; /* no longer legal */ + } if (qnum <= -1) { bail 
= TRUE; if (sfp->data.choice == SEQFEAT_IMP) { @@ -5151,6 +5172,7 @@ static void AddQualifierToFeatureEx (SeqFeatPtr sfp, CharPtr qual, CharPtr val, } else if (qnum == GBQUAL_replace && StringCmp (val, "-") == 0) { val = ""; } else if (qnum == GBQUAL_evidence) { + /* if (StringICmp (val, "experimental") == 0) { sfp->exp_ev = 1; } else if (StringICmp (val, "not_experimental") == 0 || @@ -5159,6 +5181,7 @@ static void AddQualifierToFeatureEx (SeqFeatPtr sfp, CharPtr qual, CharPtr val, StringICmp (val, "non-experimental") == 0) { sfp->exp_ev = 2; } + */ return; } else if (qnum == GBQUAL_exception) { sfp->excpt = TRUE; @@ -8519,6 +8542,165 @@ NLM_EXTERN void PrintQualityScoresToBuffer (BioseqPtr bsp, Boolean gapIsZero, Po ValNodeFreeData (head); } + +NLM_EXTERN void TrimSeqGraph (SeqGraphPtr sgp, Int4 num_to_trim, Boolean from_left) +{ + FloatHiPtr new_flvalues = NULL, old_flvalues; + Int4Ptr new_intvalues = NULL, old_intvalues; + ByteStorePtr new_bytevalues = NULL, old_bytevalues; + Int4 new_len; + Int4 start_pos; + FloatHi fhmax = 0.0, fhmin = 0.0; + Int4 intmax = 0, intmin = 0; + Int2 bs_max = 0, bs_min = 0; + Int4 new_pos, old_pos; + Int2 val; + + if (sgp == NULL || num_to_trim < 1) + { + return; + } + + new_len = sgp->numval - num_to_trim; + if (from_left) + { + start_pos = num_to_trim; + } + else + { + start_pos = 0; + } + + if (sgp->flags[2] == 1) + { + new_flvalues = (FloatHiPtr) MemNew (new_len * sizeof (FloatHi)); + old_flvalues = (FloatHiPtr) sgp->values; + new_pos = 0; + old_pos = start_pos; + while (old_pos < sgp->numval) + { + new_flvalues [new_pos] = old_flvalues[start_pos]; + if (old_pos == start_pos) + { + fhmax = new_flvalues[new_pos]; + fhmin = new_flvalues[new_pos]; + } + else + { + if (fhmax < new_flvalues[new_pos]) + { + fhmax = new_flvalues[new_pos]; + } + + if (fhmin > new_flvalues[new_pos]) + { + fhmin = new_flvalues[new_pos]; + } + } + new_pos++; + old_pos++; + } + old_flvalues = MemFree (old_flvalues); + sgp->values = new_flvalues; + 
sgp->numval = new_len; + sgp->max.realvalue = fhmax; + sgp->min.realvalue = fhmin; + } + else if (sgp->flags[2] == 2) + { + new_intvalues = (Int4Ptr) MemNew (new_len * sizeof (FloatHi)); + old_intvalues = (Int4Ptr) sgp->values; + new_pos = 0; + old_pos = start_pos; + while (old_pos < sgp->numval) + { + new_intvalues [new_pos] = old_intvalues[start_pos]; + if (old_pos == start_pos) + { + intmax = new_intvalues[new_pos]; + intmin = new_intvalues[new_pos]; + } + else + { + if (intmax < new_intvalues[new_pos]) + { + intmax = new_intvalues[new_pos]; + } + + if (intmin > new_intvalues[new_pos]) + { + intmin = new_intvalues[new_pos]; + } + } + new_pos++; + old_pos++; + } + old_intvalues = MemFree (old_intvalues); + sgp->values = new_intvalues; + sgp->numval = new_len; + sgp->max.intvalue = intmax; + sgp->min.intvalue = intmin; + } + else if (sgp->flags[2] == 3) + { + new_bytevalues = BSNew(new_len + 1); + old_bytevalues = (ByteStorePtr) sgp->values; + new_pos = 0; + old_pos = start_pos; + while (old_pos < sgp->numval) + { + BSSeek (old_bytevalues, old_pos, SEEK_SET); + BSSeek (new_bytevalues, new_pos, SEEK_SET); + val = (Int2) BSGetByte (old_bytevalues); + BSPutByte (new_bytevalues, val); + + if (old_pos == start_pos) + { + bs_max = val; + bs_min = val; + } + else + { + if (bs_max < val) + { + bs_max = val; + } + + if (bs_min > val) + { + bs_min = val; + } + } + new_pos++; + old_pos++; + } + BSPutByte (new_bytevalues, EOF); + old_bytevalues = BSFree (old_bytevalues); + sgp->values = new_bytevalues; + sgp->numval = new_len; + sgp->max.intvalue = bs_max; + sgp->min.intvalue = bs_min; + } +} + + +NLM_EXTERN void TrimQualityScores (BioseqPtr bsp, Int4 num_to_trim, Boolean from_left) +{ + ValNodePtr qual_scores, vnp; + GphItemPtr gip; + + if (bsp == NULL) return; + qual_scores = GetSeqGraphsOnBioseq (bsp->idx.entityID, bsp); + for (vnp = qual_scores; vnp != NULL; vnp = vnp->next) + { + gip = (GphItemPtr) vnp->data.ptrvalue; + if (gip == NULL) continue; + TrimSeqGraph 
(gip->sgp, num_to_trim, from_left); + } + +} + + NLM_EXTERN BytePtr GetScoresbySeqId (SeqIdPtr sip, Int4Ptr bsplength) { diff --git a/api/sqnutil3.c b/api/sqnutil3.c index 2942e514..ef2a21bf 100644 --- a/api/sqnutil3.c +++ b/api/sqnutil3.c @@ -29,7 +29,7 @@ * * Version Creation Date: 2/7/00 * -* $Revision: 6.63 $ +* $Revision: 6.69 $ * * File Description: * @@ -4329,3 +4329,115 @@ extern Boolean RemoveSequenceFromAlignments (SeqEntryPtr sep, SeqIdPtr sip) VisitAnnotsInSep (sep, (Pointer) sip, RemoveSequenceFromAlignmentsCallback); return TRUE; } + + +static CharPtr inferencePrefix [] = { + "", + "similar to sequence", + "similar to AA sequence", + "similar to DNA sequence", + "similar to RNA sequence", + "similar to RNA sequence, mRNA", + "similar to RNA sequence, EST", + "similar to RNA sequence, other RNA", + "profile", + "nucleotide motif", + "protein motif", + "ab initio prediction", + NULL +}; + +NLM_EXTERN Int2 ValidateInferenceQualifier (CharPtr val, Boolean fetchAccn) + +{ + Int2 accnv, best, j, rsult; + Char ch; + Boolean has_fetch_function, same_species; + size_t len; + ObjMgrProcPtr ompp = NULL; + CharPtr rest, str, tmp; + ErrSev sev; + SeqIdPtr sip; + + if (StringHasNoText (val)) return EMPTY_INFERENCE_STRING; + + rest = NULL; + best = -1; + for (j = 0; inferencePrefix [j] != NULL; j++) { + len = StringLen (inferencePrefix [j]); + if (StringNICmp (val, inferencePrefix [j], len) != 0) continue; + rest = val + len; + best = j; + } + + if (best < 0 || inferencePrefix [best] == NULL) return BAD_INFERENCE_PREFIX; + + if (rest == NULL) return BAD_INFERENCE_BODY; + + same_species = FALSE; + ch = *rest; + while (IS_WHITESP (ch)) { + rest++; + ch = *rest; + } + if (StringNICmp (rest, "(same species)", 14) == 0) { + same_species = TRUE; + rest += 14; + } + ch = *rest; + while (IS_WHITESP (ch) || ch == ':') { + rest++; + ch = *rest; + } + + if (StringHasNoText (rest)) return BAD_INFERENCE_BODY; + + rsult = VALID_INFERENCE; + if (same_species && best > 7) { + rsult 
= SAME_SPECIES_MISUSED; + } + + str = StringSave (rest); + + tmp = StringChr (str, ':'); + if (tmp != NULL) { + *tmp = '\0'; + tmp++; + TrimSpacesAroundString (str); + TrimSpacesAroundString (tmp); + if (StringDoesHaveText (tmp)) { + if (StringICmp (str, "INSD") == 0 || StringICmp (str, "RefSeq") == 0) { + accnv = ValidateAccnDotVer (tmp); + if (accnv == -5 || accnv == -6) { + rsult = BAD_INFERENCE_ACC_VERSION; + } else if (accnv != 0) { + rsult = BAD_INFERENCE_ACCESSION; + } else if (fetchAccn) { + sip = SeqIdFromAccessionDotVersion (tmp); + has_fetch_function = FALSE; + while ((ompp = ObjMgrProcFindNext(NULL, OMPROC_FETCH, OBJ_SEQID, OBJ_SEQID, ompp)) != NULL) { + if ((ompp->subinputtype == 0) && (ompp->suboutputtype == SEQID_GI)) { + has_fetch_function = TRUE; + } + } + sev = ErrGetMessageLevel (); + ErrSetMessageLevel (SEV_ERROR); + if (has_fetch_function && GetGIForSeqId (sip) == 0) { + rsult = ACC_VERSION_NOT_PUBLIC; + } + ErrSetMessageLevel (sev); + SeqIdFree (sip); + } + } + } + if (StringChr (str, ' ') != NULL) rsult = SPACES_IN_INFERENCE; + if (StringChr (tmp, ' ') != NULL) rsult = SPACES_IN_INFERENCE; + } else { + rsult = SINGLE_INFERENCE_FIELD; + } + + MemFree (str); + + return rsult; +} + diff --git a/api/sqnutils.h b/api/sqnutils.h index 8496dfb2..279fa9c1 100644 --- a/api/sqnutils.h +++ b/api/sqnutils.h @@ -29,7 +29,7 @@ * * Version Creation Date: 9/2/97 * -* $Revision: 6.128 $ +* $Revision: 6.134 $ * * File Description: * @@ -94,6 +94,16 @@ NLM_EXTERN Int2 EntityIDToGeneticCode (Uint2 entityID, BoolPtr mito, CharPtr tax NLM_EXTERN Int2 SeqEntryToGeneticCode (SeqEntryPtr sep, BoolPtr mito, CharPtr taxname, size_t maxsize); NLM_EXTERN Int2 SeqEntryToBioSource (SeqEntryPtr sep, BoolPtr mito, CharPtr taxname, size_t maxsize, BioSourcePtr PNTR biopp); +NLM_EXTERN Boolean BioseqToGeneticCode ( + BioseqPtr bsp, + Int2Ptr gencodep, + BoolPtr mitop, + BoolPtr plastidp, + CharPtr taxnamep, + size_t maxsize, + BioSourcePtr PNTR biopp +); + NLM_EXTERN SeqLocPtr 
CreateWholeInterval (SeqEntryPtr sep); NLM_EXTERN SeqFeatPtr CreateNewFeature (SeqEntryPtr sep, SeqEntryPtr placeHere, Uint1 choice, SeqFeatPtr useThis); NLM_EXTERN ValNodePtr CreateNewDescriptor (SeqEntryPtr sep, Uint1 choice); @@ -252,6 +262,7 @@ NLM_EXTERN SqnTagPtr SqnTagFree (SqnTagPtr stp); NLM_EXTERN CharPtr SqnTagFind (SqnTagPtr stp, CharPtr tag); NLM_EXTERN void ReadTechFromString (CharPtr str, MolInfoPtr mip); +NLM_EXTERN void ReadCompletenessFromString (CharPtr str, MolInfoPtr mip); /* functions to extract BioSource, MolInfo, and Bioseq information from parsed titles */ @@ -356,6 +367,22 @@ NLM_EXTERN void KeyTagClear (KeyTag PNTR ktp); NLM_EXTERN Int2 KeyFromTag (KeyTag PNTR ktp, CharPtr tag); NLM_EXTERN CharPtr TagFromKey (KeyTag PNTR ktp, Int2 key); +/* inference qualifier utility */ + +#define VALID_INFERENCE 0 +#define EMPTY_INFERENCE_STRING 1 +#define BAD_INFERENCE_PREFIX 2 +#define BAD_INFERENCE_BODY 3 +#define SINGLE_INFERENCE_FIELD 4 +#define SPACES_IN_INFERENCE 5 +#define SAME_SPECIES_MISUSED 6 +#define BAD_INFERENCE_ACCESSION 7 +#define BAD_INFERENCE_ACC_VERSION 8 +#define ACC_VERSION_NOT_PUBLIC 9 + +NLM_EXTERN Int2 ValidateInferenceQualifier (CharPtr val, Boolean fetchAccn); + + /* from Colombe */ NLM_EXTERN SeqLocPtr StringSearchInBioseq (SeqIdPtr sip, CharPtr sub); @@ -391,6 +418,9 @@ NLM_EXTERN SeqEntryPtr SetPhrapContigOrder (SeqEntryPtr head, CharPtr contigs); NLM_EXTERN void PrintQualityScores (BioseqPtr bsp, FILE *fp); +NLM_EXTERN void TrimSeqGraph (SeqGraphPtr sgp, Int4 num_to_trim, Boolean from_left); +NLM_EXTERN void TrimQualityScores (BioseqPtr bsp, Int4 num_to_trim, Boolean from_left); + typedef void (*QualityWriteFunc) (CharPtr buf, Uint4 buflen, Pointer userdata); NLM_EXTERN void PrintQualityScoresToBuffer (BioseqPtr bsp, Boolean gapIsZero, Pointer userdata, QualityWriteFunc callback); diff --git a/api/subutil.c b/api/subutil.c index 801aafb2..07588e4d 100644 --- a/api/subutil.c +++ b/api/subutil.c @@ -29,7 +29,7 @@ * * Version 
Creation Date: 11/3/93 * -* $Revision: 6.57 $ +* $Revision: 6.62 $ * * File Description: Utilities for creating ASN.1 submissions * @@ -40,6 +40,21 @@ * * * $Log: subutil.c,v $ +* Revision 6.62 2006/02/06 19:00:15 kans +* added CreateFeatureFetchPolicyUserObject +* +* Revision 6.61 2006/01/23 20:57:04 kans +* cosmetic change +* +* Revision 6.60 2006/01/23 16:39:57 kans +* added CreateAnnotDescCommentPolicyUserObject +* +* Revision 6.59 2006/01/17 20:47:05 kans +* fixed AddIDsToGenomeProjectsDBUserObject +* +* Revision 6.58 2006/01/17 18:25:06 kans +* support for genomeprojectsdb user object +* * Revision 6.57 2005/10/26 21:30:46 kans * bug fix in AddSecondaryAccnToEntry provided by Joe Carlson * @@ -4941,7 +4956,7 @@ NLM_EXTERN UserObjectPtr CreateModelEvidenceUserObject ( ) { - UserFieldPtr curr; + UserFieldPtr curr; ObjectIdPtr oip; UserFieldPtr prev = NULL; UserObjectPtr uop; @@ -5261,3 +5276,126 @@ NLM_EXTERN void AddAccessionToTpaAssemblyUserObject (UserObjectPtr uop, CharPtr prev->next = ufp; } +NLM_EXTERN UserObjectPtr CreateGenomeProjectsDBUserObject ( + void +) + +{ + ObjectIdPtr oip; + UserObjectPtr uop; + + uop = UserObjectNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("GenomeProjectsDB"); + uop->type = oip; + + return uop; +} + +NLM_EXTERN UserObjectPtr AddIDsToGenomeProjectsDBUserObject ( + UserObjectPtr uop, + Int4 projectID, + Int4 parentID +) + +{ + UserFieldPtr curr; + UserFieldPtr prev = NULL; + UserFieldPtr last = NULL; + ObjectIdPtr oip; + UserFieldPtr ufp; + + if (uop == NULL) return; + oip = uop->type; + if (oip == NULL || StringICmp (oip->str, "GenomeProjectsDB") != 0) return; + + for (curr = uop->data; curr != NULL; curr = curr->next) { + prev = curr; + } + + ufp = UserFieldNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("ProjectID"); + ufp->label = oip; + ufp->choice = 2; /* integer */ + ufp->data.intvalue = projectID; + + if (prev != NULL) { + prev->next = ufp; + } else { + uop->data = ufp; + } + last = ufp; + + ufp = 
UserFieldNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("ParentID"); + ufp->label = oip; + ufp->choice = 2; /* integer */ + ufp->data.intvalue = parentID; + + last->next = ufp; + + return uop; +} + +/* annot desc comment policy user object */ + +NLM_EXTERN UserObjectPtr CreateAnnotDescCommentPolicyUserObject ( + Boolean showInCommentBlock +) + +{ + UserFieldPtr curr; + ObjectIdPtr oip; + UserObjectPtr uop; + + uop = UserObjectNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("AnnotDescCommentPolicy"); + uop->type = oip; + + curr = UserFieldNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("Policy"); + curr->label = oip; + curr->choice = 1; /* visible string */ + if (showInCommentBlock) { + curr->data.ptrvalue = (Pointer) StringSave ("ShowInComment"); + } else { + curr->data.ptrvalue = (Pointer) StringSave ("ShowInNote"); + } + + uop->data = curr; + return uop; +} + +/* feature fetch policy user object */ + +NLM_EXTERN UserObjectPtr CreateFeatureFetchPolicyUserObject ( + CharPtr policy +) + +{ + UserFieldPtr curr; + ObjectIdPtr oip; + UserObjectPtr uop; + + if (StringHasNoText (policy)) return NULL; + + uop = UserObjectNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("FeatureFetchPolicy"); + uop->type = oip; + + curr = UserFieldNew (); + oip = ObjectIdNew (); + oip->str = StringSave ("Policy"); + curr->label = oip; + curr->choice = 1; /* visible string */ + curr->data.ptrvalue = (Pointer) StringSave (policy); + + uop->data = curr; + return uop; +} + diff --git a/api/subutil.h b/api/subutil.h index 37cd14f3..eaba6f79 100644 --- a/api/subutil.h +++ b/api/subutil.h @@ -31,7 +31,7 @@ * * Version Creation Date: 11/3/93 * -* $Revision: 6.50 $ +* $Revision: 6.53 $ * * File Description: Utilities for creating ASN.1 submissions * @@ -42,6 +42,15 @@ * * * $Log: subutil.h,v $ +* Revision 6.53 2006/02/06 19:00:15 kans +* added CreateFeatureFetchPolicyUserObject +* +* Revision 6.52 2006/01/23 16:39:57 kans +* added 
CreateAnnotDescCommentPolicyUserObject +* +* Revision 6.51 2006/01/17 18:25:07 kans +* support for genomeprojectsdb user object +* * Revision 6.50 2005/06/10 14:06:16 kans * added GENOME_hydrogenosome define * @@ -1679,6 +1688,26 @@ NLM_EXTERN void AddAccessionToTpaAssemblyUserObject ( Int4 to ); +NLM_EXTERN UserObjectPtr CreateGenomeProjectsDBUserObject ( + void +); +NLM_EXTERN UserObjectPtr AddIDsToGenomeProjectsDBUserObject ( + UserObjectPtr uop, + Int4 projectID, + Int4 parentID +); + +/* annot desc comment policy user object */ +NLM_EXTERN UserObjectPtr CreateAnnotDescCommentPolicyUserObject ( + Boolean showInCommentBlock +); + +/* feature fetch policy user object */ + +NLM_EXTERN UserObjectPtr CreateFeatureFetchPolicyUserObject ( + CharPtr policy +); + #ifdef __cplusplus } @@ -1739,6 +1768,7 @@ NLM_EXTERN void AddAccessionToTpaAssemblyUserObject ( ( symbol "R", name "Arginine"), ( symbol "S", name "Serine"), ( symbol "T", name "Threoine"), + { symbol "U", name "Selenocysteine"}, ( symbol "V", name "Valine"), ( symbol "W", name "Tryptophan" ), ( symbol "X", name "Undetermined or atypical"), @@ -1753,38 +1783,56 @@ NLM_EXTERN void AddAccessionToTpaAssemblyUserObject ( * Genetic Code id's and names from /ncbi/data/gc.prt * gc.prt lists the legal start codons and genetic codes fully * - name "Standard" , - id 1 , + name "Standard" , + id 1 , + + name "Vertebrate Mitochondrial" , + id 2 , + + name "Yeast Mitochondrial" , + id 3 , + + name "Mold Mitochondrial and Mycoplasma" , + id 4 , + + name "Invertebrate Mitochondrial" , + id 5 , + + name "Ciliate Macronuclear and Daycladacean" , + id 6 , + + name "Echinoderm Mitochondrial" , + id 9 , - name "Vertebrate Mitochondrial" , - id 2 , + name "Euplotid Macronuclear" , + id 10 , - name "Yeast Mitochondrial" , - id 3 , + name "Bacterial and Plant Plastid" , + id 11 , - name "Mold Mitochondrial and Mycoplasma" , - id 4 , + name "Alternative Yeast Nuclear" , + id 12 , - name "Invertebrate Mitochondrial" , - id 5 , + name 
"Ascidian Mitochondrial" , + id 13 , - name "Ciliate Macronuclear and Daycladacean" , - id 6 , + name "Alternative Flatworm Mitochondrial" , + id 14 , - name "Protozoan Mitochondrial (and Kinetoplast)" , - id 7 , + name "Blepharisma Macronuclear" , + id 15 , - name "Plant Mitochondrial" , - id 8 , + name "Chlorophycean Mitochondrial" , + id 16 , - name "Echinoderm Mitochondrial" , - id 9 , + name "Trematode Mitochondrial" , + id 21 , - name "Euplotid Macronuclear" , - id 10 , + name "Scenedesmus obliquus Mitochondrial" , + id 22 , - name "Eubacterial" , - id 11 , + name "Thraustochytrium Mitochondrial" , + id 23 , * * diff --git a/api/tofasta.c b/api/tofasta.c index 2246549f..dcbbb7d3 100644 --- a/api/tofasta.c +++ b/api/tofasta.c @@ -29,7 +29,7 @@ * * Version Creation Date: 7/12/91 * -* $Revision: 6.148 $ +* $Revision: 6.150 $ * * File Description: various sequence objects to fasta output * @@ -39,6 +39,12 @@ * ------- ---------- ----------------------------------------------------- * * $Log: tofasta.c,v $ +* Revision 6.150 2006/01/10 22:19:29 kans +* CreateDefLine calls DoTpaPrefix to handle TPA_exp and TPA_inf +* +* Revision 6.149 2005/12/07 19:49:46 kans +* in BioseqFastaStreamInternal, bail if virtual Bioseq +* * Revision 6.148 2005/09/12 17:44:21 kans * in complete chromosome title, use virus instead of virion * @@ -1584,6 +1590,8 @@ static Int4 BioseqFastaStreamInternal ( if (bsp == NULL && slp == NULL) return 0; if (fp == NULL && bs == NULL) return 0; + if (bsp != NULL && bsp->repr == Seq_repr_virtual) return 0; + if (linelen > 128) { linelen = 128; } @@ -4215,6 +4223,46 @@ static Boolean NotSpecialTaxName (CharPtr taxname) return TRUE; } +static Boolean DoTpaPrefix ( + CharPtr title, + CharPtr PNTR ttl, + CharPtr PNTR pfx, + Boolean is_tpa, + Boolean tpa_exp, + Boolean tpa_inf +) + +{ + /* must be called with ttl and pfx pointing to stack variables */ + *ttl = title; + *pfx = NULL; + + if (title == NULL || *title == '\0') return FALSE; + + if (is_tpa) { + 
if (tpa_exp) { + if (StringNICmp (title, "TPA_exp: ", 9) == 0) return FALSE; + *pfx = "TPA_exp: "; + if (StringNICmp (title, "TPA: ", 5) == 0) { + *ttl = title + 5; + } + return TRUE; + } else if (tpa_inf) { + if (StringNICmp (title, "TPA_inf: ", 9) == 0) return FALSE; + *pfx = "TPA_inf: "; + if (StringNICmp (title, "TPA: ", 5) == 0) { + *ttl = title + 5; + } + return TRUE; + } else { + if (StringNICmp (title, "TPA: ", 5) == 0) return FALSE; + *pfx = "TPA: "; + return TRUE; + } + } + return FALSE; +} + /***************************************************************************** * * CreateDefLine(iip, bsp, buf, buflen, tech) @@ -4230,7 +4278,7 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu CharPtr accession, CharPtr organism, Boolean ignoreTitle, Boolean extProtTitle) { ValNodePtr vnp = NULL; - CharPtr tmp = NULL, title = NULL; + CharPtr tmp = NULL, title = NULL, ttl = NULL, pfx = NULL; PdbBlockPtr pbp; PatentSeqIdPtr psip; PDBSeqIdPtr pdbip; @@ -4245,7 +4293,8 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu "WORKING DRAFT SEQUENCE", "*** SEQUENCING IN PROGRESS ***" }; Boolean htg_tech = FALSE, htgs_draft = FALSE, htgs_cancelled = FALSE, - is_nc = FALSE, is_nm = FALSE, is_nr = FALSE, is_tpa = FALSE; + is_nc = FALSE, is_nm = FALSE, is_nr = FALSE, is_tpa = FALSE, + tpa_exp = FALSE, tpa_inf = FALSE; MolInfoPtr mip; GBBlockPtr gbp = NULL; EMBLBlockPtr ebp = NULL; @@ -4321,7 +4370,7 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu buf += diff; } diff = 0; - if (htg_tech) { + if (htg_tech || is_tpa) { vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_genbank,TRUE); if (vnp != NULL) { gbp = (GBBlockPtr) vnp->data.ptrvalue; @@ -4337,6 +4386,20 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu } } } + if (keywords != NULL) { + for (vnp = keywords; vnp != NULL; vnp = vnp->next) { + if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_DRAFT") 
== 0) { + htgs_draft = TRUE; + } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_CANCELLED") == 0) { + htgs_cancelled = TRUE; + } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) { + tpa_exp = TRUE; + } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) { + tpa_inf = TRUE; + } + } + } + if (! ignoreTitle) { vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_title,TRUE); @@ -4409,12 +4472,14 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu } /* some titles may have zero length */ if (title != NULL && *title != '\0') { - if (is_tpa && StringNICmp (title, "TPA: ", 5) != 0) { - diff = LabelCopy (buf, "TPA: ", buflen); + ttl = title; + pfx = NULL; + if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf)) { + diff = LabelCopy (buf, pfx, buflen); buflen -= diff; buf += diff; } - diff = LabelCopy(buf, title, buflen); + diff = LabelCopy (buf, ttl, buflen); /* remove trailing blanks and periods */ tmp = buf + diff - 1; /* point at last character */ while (tmp >= buf && ((*tmp <= ' ') || (*tmp == '.'))) { @@ -4495,12 +4560,14 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu diff = LabelCopy(buf, title, buflen); } */ - if (is_tpa && StringNICmp (title, "TPA: ", 5) != 0) { - diff = LabelCopy (buf, "TPA: ", buflen); - buflen -= diff; - buf += diff; - } - diff = LabelCopy(buf, title, buflen); + ttl = title; + pfx = NULL; + if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf)) { + diff = LabelCopy (buf, pfx, buflen); + buflen -= diff; + buf += diff; + } + diff = LabelCopy (buf, ttl, buflen); if (organism == NULL && taxname != NULL) { organism = taxname; iip = NULL; @@ -4512,15 +4579,17 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu if (title == NULL) { title = UseOrgMods(bsp, NULL); } - if (is_tpa && StringNICmp (title, "TPA: ", 5) != 0) { - diff = LabelCopy (buf, "TPA: ", buflen); - buflen -= diff; - buf 
+= diff; - } - if (title != NULL) { - diff = LabelCopy(buf, title, buflen); + ttl = title; + pfx = NULL; + if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf)) { + diff = LabelCopy (buf, pfx, buflen); + buflen -= diff; + buf += diff; + } + if (ttl != NULL) { + diff = LabelCopy (buf, ttl, buflen); } else { - diff = LabelCopy(buf, "No definition line found", buflen); + diff = LabelCopy (buf, "No definition line found", buflen); } } } @@ -4537,12 +4606,14 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu title = UseOrgMods(bsp, NULL); organism = NULL; if (title != NULL) { - if (is_tpa && StringNICmp (title, "TPA: ", 5) != 0) { - diff = LabelCopy (buf, "TPA: ", buflen); - buflen -= diff; - buf += diff; - } - diff = LabelCopy(buf, title, buflen); + ttl = title; + pfx = NULL; + if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf)) { + diff = LabelCopy (buf, pfx, buflen); + buflen -= diff; + buf += diff; + } + diff = LabelCopy (buf, ttl, buflen); buflen -= diff; buf += diff; } @@ -4564,15 +4635,6 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu i = 0; } } else { - if (keywords != NULL) { - for (vnp = keywords; vnp != NULL; vnp = vnp->next) { - if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_DRAFT") == 0) { - htgs_draft = TRUE; - } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_CANCELLED") == 0) { - htgs_cancelled = TRUE; - } - } - } if (htgs_draft) { if (StringStr(title, "WORKING DRAFT") == NULL) { doit = TRUE; @@ -4629,12 +4691,14 @@ NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr bu title = UseOrgMods(bsp, NULL); organism = NULL; if (title != NULL) { - if (is_tpa && StringNICmp (title, "TPA: ", 5) != 0) { - diff = LabelCopy (buf, "TPA: ", buflen); - buflen -= diff; - buf += diff; - } - diff = LabelCopy(buf, title, buflen); + ttl = title; + pfx = NULL; + if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf)) { + diff = LabelCopy 
(buf, pfx, buflen); + buflen -= diff; + buf += diff; + } + diff = LabelCopy (buf, ttl, buflen); buflen -= diff; buf += diff; } diff --git a/api/txalign.c b/api/txalign.c index cf366ced..8af8d57e 100644 --- a/api/txalign.c +++ b/api/txalign.c @@ -1,4 +1,4 @@ -/* $Id: txalign.c,v 6.91 2005/05/16 17:39:20 papadopo Exp $ +/* $Id: txalign.c,v 6.92 2006/01/24 18:37:08 papadopo Exp $ *************************************************************************** * * * COPYRIGHT NOTICE * @@ -27,13 +27,16 @@ * * File Name: txalign.c * -* $Revision: 6.91 $ +* $Revision: 6.92 $ * * File Description: Formating of text alignment for the BLAST output * * Modifications: * -------------------------------------------------------------------------- * $Log: txalign.c,v $ +* Revision 6.92 2006/01/24 18:37:08 papadopo +* from Mike Gertz: Use enumerated values, rather than #define'd constants, to specify the composition adjustment method +* * Revision 6.91 2005/05/16 17:39:20 papadopo * From Alejandro Schaffer: if matrix is adjusted due to composition in * blastpgp, then print the method for adjustment in the output alignments. 
@@ -547,6 +550,7 @@ #include <salpstat.h> #include <fdlKludge.h> #include <blastdef.h> +#include <algo/blast/composition_adjustment/composition_constants.h> #define BUFFER_LENGTH 2048 #define MIN_INS_SPACE 50 @@ -5662,7 +5666,7 @@ NLM_EXTERN int LIBCALLBACK FormatScoreFunc(AlignStatOptionPtr asop) Char fastaLongIdBuf[BUFFER_LENGTH+1]; SeqIdPtr firstSip=NULL; Int4 num_ident; - Int2 comp_adjustment_method = NO_COMP_ADJUSTMENT; + Int2 comp_adjustment_method = eNoCompositionBasedStats; sp = asop->sp; @@ -5881,10 +5885,10 @@ NLM_EXTERN int LIBCALLBACK FormatScoreFunc(AlignStatOptionPtr asop) else sprintf(buffer, "Expect(%ld+) = %s", (long) number, eval_buff_ptr); fprintf(asop->fp, "%s", buffer); - if (NO_COMP_ADJUSTMENT != comp_adjustment_method) { - if (COMP_BASED_STATISTICS == comp_adjustment_method) + if (eNoCompositionBasedStats != comp_adjustment_method) { + if (eCompositionBasedStats == comp_adjustment_method) sprintf(buffer,", Method: Composition-based stats."); - if (COMP_MATRIX_ADJUSTMENT == comp_adjustment_method) + if (eCompositionMatrixAdjust == comp_adjustment_method) sprintf(buffer,", Method: Compositional matrix adjust."); fprintf(asop->fp, "%s", buffer); } diff --git a/api/utilpars.c b/api/utilpars.c index 69d7070e..4e82173a 100644 --- a/api/utilpars.c +++ b/api/utilpars.c @@ -3,6 +3,9 @@ * -- all common routines for main programs in this directory * * $Log: utilpars.c,v $ +* Revision 6.3 2006/01/31 22:31:49 kans +* added O for pyrrolysine and J for leu or ile ambiguity +* * Revision 6.2 2001/12/06 17:00:41 kans * TextSave takes size_t, not Int2, otherwise titin protein tries to allocate negative number * @@ -113,21 +116,23 @@ ParFlat_AA1_to_AA3 "Gly", 'G', "His", 'H', "Ile", 'I', - "Lys", 'K', /* notice no 'J', breaks naive meaning of index -Karl */ + "Xle", 'J', /* was - notice no 'J', breaks naive meaning of index -Karl */ + "Lys", 'K', "Leu", 'L', "Met", 'M', "Asn", 'N', - "Pro", 'P', /* no 'O' */ + "Pyl", 'O', /* was - no 'O' */ + "Pro", 'P', "Gln", 
'Q', "Arg", 'R', "Ser", 'S', "Thr", 'T', "Val", 'V', "Trp", 'W', - "Xxx", 'X', /* no U */ + "Sec", 'U', /* was - not in iupacaa */ + "Xxx", 'X', "Tyr", 'Y', "Glx", 'Z', - "Sec", 'U', /* not in iupacaa */ "TERM", '*', /* not in iupacaa */ /*changed by Tatiana 06.07.95?`*/ "OTHER", 'X'}; diff --git a/api/utilpars.h b/api/utilpars.h index 9856b683..57551c01 100644 --- a/api/utilpars.h +++ b/api/utilpars.h @@ -2,6 +2,9 @@ * utilpars.h: * * $Log: utilpars.h,v $ +* Revision 6.2 2006/01/31 22:31:49 kans +* added O for pyrrolysine and J for leu or ile ambiguity +* * Revision 6.1 2001/12/06 17:00:41 kans * TextSave takes size_t, not Int2, otherwise titin protein tries to allocate negative number * @@ -37,7 +40,7 @@ #endif -#define ParFlat_TOTAL_AA 26 +#define ParFlat_TOTAL_AA 28 /*************************************************************************/ diff --git a/api/valid.c b/api/valid.c index 64f4f377..0a8ecae1 100644 --- a/api/valid.c +++ b/api/valid.c @@ -29,7 +29,7 @@ * * Version Creation Date: 1/1/94 * -* $Revision: 6.682 $ +* $Revision: 6.726 $ * * File Description: Sequence editing utilities * @@ -39,6 +39,138 @@ * ------- ---------- ----------------------------------------------------- * * $Log: valid.c,v $ +* Revision 6.726 2006/02/27 17:49:34 kans +* added adjusted for low-quality genome exception for RefSeq models +* +* Revision 6.725 2006/02/24 22:49:39 kans +* call BioseqToGeneticCode instead of much less efficient functions +* +* Revision 6.724 2006/02/23 23:05:53 kans +* added ERR_SEQ_FEAT_FeatureSeqIDCaseDifference +* +* Revision 6.723 2006/02/23 22:36:05 kans +* added ERR_SEQ_INST_CaseDifferenceInSeqID +* +* Revision 6.722 2006/02/17 20:12:06 kans +* fixed text of ITSdoesNotAbutRRNA for one overlap case +* +* Revision 6.721 2006/02/16 19:34:28 kans +* use vsp->is_smupd_in_sep to suppress ERR_SEQ_FEAT_FeatureRefersToAccession +* +* Revision 6.720 2006/02/15 17:08:55 kans +* made ITSdoesNotAbutRRNA more sophisticated, also handles tRNA inside small and 
large rRNA +* +* Revision 6.719 2006/02/10 18:26:50 kans +* added ERR_SEQ_FEAT_ITSdoesNotAbutRRNA +* +* Revision 6.718 2006/02/08 17:49:25 kans +* added ERR_SEQ_FEAT_SelfReferentialProduct +* +* Revision 6.717 2006/02/08 16:27:18 kans +* report ERR_SEQ_FEAT_TranslExcept even if protein is okay +* +* Revision 6.716 2006/02/08 14:34:56 kans +* [fwd/rev]-primer-[seq/name] changed to [fwd/rev]-pcr-primer-[seq/name] +* +* Revision 6.715 2006/02/07 20:36:37 kans +* ERR_SEQ_INST_InternalNsAdjacentToGap shows first position +* +* Revision 6.714 2006/02/07 20:29:59 kans +* added ERR_SEQ_INST_InternalNsAdjacentToGap +* +* Revision 6.713 2006/02/06 16:26:03 kans +* check for both TPA:experimental and TPA:inferential keywords +* +* Revision 6.712 2006/02/03 19:37:12 kans +* ERR_SEQ_INST_InternalNsInSeq[Lit/Raw] add one to zero-based position +* +* Revision 6.711 2006/02/02 22:24:38 kans +* warn if product gbqual on trna +* +* Revision 6.710 2006/01/31 22:31:49 kans +* added O for pyrrolysine and J for leu or ile ambiguity +* +* Revision 6.709 2006/01/26 19:54:26 kans +* added ERR_SEQ_FEAT_FeatureRefersToAccession to look for inconsistent use of gi and accession (with or without version) for sfp->location or sfp->product references in a single blob +* +* Revision 6.708 2006/01/25 20:09:33 kans +* BadDeltaSeq not done if MI_TECH_composite_wgs_htgs +* +* Revision 6.707 2006/01/24 20:17:12 kans +* ERR_SEQ_FEAT_InternalStop goes to SEV_REJECT if has GI and GenBank/EMBL/DDBJ and not RefSeq +* +* Revision 6.706 2006/01/24 19:06:39 kans +* added ERR_SEQ_DESCR_BadPCRPrimerSequence +* +* Revision 6.705 2006/01/24 15:46:08 kans +* added ERR_SEQ_FEAT_HpotheticalProteinMismatch +* +* Revision 6.704 2006/01/18 20:55:08 kans +* CheckTrnaCodons reports BadTrnaAA if aa is 0 or 255 - usually meaning it was not set +* +* Revision 6.703 2006/01/13 20:26:24 kans +* lower severity of duplicate feature error to warning if partial viral genes +* +* Revision 6.702 2006/01/10 18:22:18 kans +* find 
embedded html strings only if VALIDATE_ALL +* +* Revision 6.701 2006/01/05 20:23:00 kans +* set isCuratedFlybase flag even if GenBank record for lowering duplicate feature severity, suppressing if dicistronic gene +* +* Revision 6.700 2006/01/04 21:29:22 kans +* use FindStringsInEntity to find embedded script tags by finite state machine +* +* Revision 6.699 2006/01/03 19:48:39 kans +* added javascript: to findrepstrs +* +* Revision 6.698 2006/01/03 16:52:54 kans +* ValidateInferenceQualifier takes fetchAccn argument, added ACC_VERSION_NOT_PUBLIC reply type +* +* Revision 6.697 2006/01/03 14:31:39 kans +* LookForMultipleUnpubPubs relies on SetPubScratchData and ClearPubScratchData to make unique strings only once per pub +* +* Revision 6.696 2005/12/30 16:24:37 kans +* inference qualifier for INSD or RefSeq requires valid accession.version +* +* Revision 6.695 2005/12/29 22:24:02 kans +* added <applet and <form to list of strings to check for script injection attack +* +* Revision 6.694 2005/12/29 21:45:57 kans +* added ERR_GENERIC_EmbeddedScript, use FindReplaceInEntity with callback to find possible javascript injection attacks +* +* Revision 6.693 2005/12/29 19:20:28 kans +* InternalNsInSeqRaw printed for each run of Ns, not just for maximum length +* +* Revision 6.692 2005/12/23 20:16:32 kans +* added ERR_SEQ_FEAT_InvalidInferenceValue +* +* Revision 6.691 2005/12/23 18:34:18 kans +* modified cds/mrna/gene conditions on reporting partials +* +* Revision 6.690 2005/12/16 18:42:59 kans +* dicistronic gene exception turns off Duplicate Feature and SuspiciousGeneXref if curated Drosophila +* +* Revision 6.689 2005/12/15 14:22:01 kans +* ERR_SEQ_INST_InternalNsInSeqRaw triggered if >= 100, not > 100 +* +* Revision 6.688 2005/12/13 23:17:27 kans +* In Splice acceptor consensus (AG) not found before exon message, print sip if no bsp +* +* Revision 6.687 2005/12/13 23:05:22 kans +* added ERR_GENERIC_CollidingSerialNumbers +* +* Revision 6.686 2005/12/13 22:16:55 kans 
+* always initialize tbuf in SpliceCheckEx +* +* Revision 6.685 2005/12/08 19:50:30 kans +* FindSameCDS does not suppress if only one end is identical - also require dashes in collection_date +* +* Revision 6.684 2005/12/07 21:15:53 kans +* ERR_SEQ_FEAT_UTRdoesNotAbutCDS always sets UTR feature context, clears once at end +* +* Revision 6.683 2005/12/06 22:20:12 kans +* raised ERR_SEQ_DESCR_BadCountryCode to SEV_ERROR +* * Revision 6.682 2005/12/02 15:11:09 kans * in ValidateSeqFeat, comment out exception for cdregion same as mrna in partial not at start/stop and not consensus splice site * @@ -2258,6 +2390,7 @@ static char *this_file = __FILE__; #include <explore.h> #include <subutil.h> #include <tofasta.h> +#include <findrepl.h> /***************************************************************************** * @@ -2338,6 +2471,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp) TextFsaPtr sourceQualTags; Boolean is_htg_in_sep; Boolean is_refseq_in_sep; + Boolean is_smupd_in_sep; + Boolean feat_loc_has_gi; + Boolean feat_prod_has_gi; if (vsp == NULL) return; @@ -2369,6 +2505,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp) sourceQualTags = vsp->sourceQualTags; is_htg_in_sep = vsp->is_htg_in_sep; is_refseq_in_sep = vsp->is_refseq_in_sep; + is_smupd_in_sep = vsp->is_smupd_in_sep; + feat_loc_has_gi = vsp->feat_loc_has_gi; + feat_prod_has_gi = vsp->feat_prod_has_gi; MemSet ((VoidPtr) vsp, 0, sizeof (ValidStruct)); vsp->errbuf = errbuf; vsp->cutoff = cutoff; @@ -2397,6 +2536,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp) vsp->sourceQualTags = sourceQualTags; vsp->is_htg_in_sep = is_htg_in_sep; vsp->is_refseq_in_sep = is_refseq_in_sep; + vsp->is_smupd_in_sep = is_smupd_in_sep; + vsp->feat_loc_has_gi = feat_loc_has_gi; + vsp->feat_prod_has_gi = feat_prod_has_gi; return; } @@ -2666,7 +2808,9 @@ static CharPtr err1Label [] = { "TerminalGap", "OverlappingDeltaRange", "LeadingX", - "InternalNsInSeqRaw" + "InternalNsInSeqRaw", + 
"InternalNsAdjacentToGap", + "CaseDifferenceInSeqID" }; static CharPtr err2Label [] = { @@ -2705,7 +2849,8 @@ static CharPtr err2Label [] = { "BioSourceInconsistency", "FastaBracketTitle", "MissingText", - "BadCollectionDate" + "BadCollectionDate", + "BadPCRPrimerSequence" }; static CharPtr err3Label [] = { @@ -2718,7 +2863,9 @@ static CharPtr err3Label [] = { "BadPageNumbering", "MedlineEntryPub", "BadDate", - "StructuredCitGenCit" + "StructuredCitGenCit", + "CollidingSerialNumbers", + "EmbeddedScript" }; static CharPtr err4Label [] = { @@ -2852,7 +2999,13 @@ static CharPtr err5Label [] = { "ExceptionProblem", "PolyAsignalNotRange", "OldLocusTagMismtach", - "DuplicateGeneOntologyTerm" + "DuplicateGeneOntologyTerm", + "InvalidInferenceValue", + "HpotheticalProteinMismatch", + "FeatureRefersToAccession", + "SelfReferentialProduct", + "ITSdoesNotAbutRRNA", + "FeatureSeqIDCaseDifference" }; static CharPtr err6Label [] = { @@ -3441,6 +3594,10 @@ static Boolean Valid1GatherProc (GatherContextPtr gcp) ValNodePtr sdp; BioSourcePtr biop; PubdescPtr pdp; + BioseqPtr bsp; + SeqIdPtr sip; + Char buf [64]; + Char tmp [64]; SeqMgrFeatContext context; vsp = (ValidStructPtr) (gcp->userdata); @@ -3514,6 +3671,22 @@ static Boolean Valid1GatherProc (GatherContextPtr gcp) if (vsp->useSeqMgrIndexes) { if (SeqMgrGetDesiredFeature (gcp->entityID, NULL, 0, 0, sfp, &context) == NULL) { ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_UnindexedFeature, "Feature is not indexed"); + } else { + bsp = BioseqFindFromSeqLoc (sfp->location); + if (bsp != NULL) { + sip = SeqLocId (sfp->location); + if (sip != NULL && sip->choice != SEQID_GI && sip->choice != SEQID_GIBBSQ && sip->choice != SEQID_GIBBMT) { + SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1); + for (sip = bsp->id; sip != NULL; sip = sip->next) { + if (sip->choice == SEQID_GI || sip->choice == SEQID_GIBBSQ || sip->choice == SEQID_GIBBMT) continue; + SeqIdWrite (sip, tmp, PRINTID_FASTA_SHORT, sizeof (tmp) - 1); + if (StringICmp (buf, 
tmp) != 0) continue; + if (StringCmp (buf, tmp) == 0) continue; + ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_FeatureSeqIDCaseDifference, + "Sequence identifier in feature location differs in capitalization with identifier on Bioseq"); + } + } + } } } } @@ -3624,6 +3797,12 @@ typedef struct ftprob { Uint4 num_tpa_with_hist; Uint4 num_tpa_without_hist; Boolean has_gi; + Boolean loc_has_gi; + Boolean loc_has_just_accn; + Boolean loc_has_accn_ver; + Boolean prod_has_gi; + Boolean prod_has_just_accn; + Boolean prod_has_accn_ver; } FeatProb, PNTR FeatProbPtr; static void CheckFeatPacking (BioseqPtr bsp, SeqFeatPtr sfp, Uint4Ptr num_misplaced_features) @@ -3791,6 +3970,87 @@ static void CountGeneXrefs (SeqFeatPtr sfp, Pointer userdata) (fpp->num_gene_xrefs)++; } +static void CountSfpLocIdTypes (SeqIdPtr sip, Pointer userdata) + +{ + FeatProbPtr fpp; + TextSeqIdPtr tsip; + + if (sip == NULL || userdata == NULL) return; + fpp = (FeatProbPtr) userdata; + + switch (sip->choice) { + case SEQID_GI : + fpp->loc_has_gi = TRUE; + break; + case SEQID_GENBANK : + case SEQID_EMBL : + case SEQID_DDBJ : + case SEQID_TPG : + case SEQID_TPE : + case SEQID_TPD : + case SEQID_OTHER : + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip != NULL) { + if (StringDoesHaveText (tsip->accession)) { + if (tsip->version < 1) { + fpp->loc_has_just_accn = TRUE; + } else { + fpp->loc_has_accn_ver = TRUE; + } + } + } + break; + default : + break; + } +} + +static void CountSfpProdIdTypes (SeqIdPtr sip, Pointer userdata) + +{ + FeatProbPtr fpp; + TextSeqIdPtr tsip; + + if (sip == NULL || userdata == NULL) return; + fpp = (FeatProbPtr) userdata; + + switch (sip->choice) { + case SEQID_GI : + fpp->prod_has_gi = TRUE; + break; + case SEQID_GENBANK : + case SEQID_EMBL : + case SEQID_DDBJ : + case SEQID_TPG : + case SEQID_TPE : + case SEQID_TPD : + case SEQID_OTHER : + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip != NULL) { + if (StringDoesHaveText (tsip->accession)) { + if (tsip->version < 1) { + 
fpp->prod_has_just_accn = TRUE; + } else { + fpp->prod_has_accn_ver = TRUE; + } + } + } + break; + default : + break; + } +} + +static void CountFeatLocIdTypes (SeqFeatPtr sfp, Pointer userdata) + +{ + if (sfp == NULL || userdata == NULL) return; + + VisitSeqIdsInSeqLoc (sfp->location, userdata, CountSfpLocIdTypes); + VisitSeqIdsInSeqLoc (sfp->product, userdata, CountSfpProdIdTypes); +} + static Boolean HasTpaUserObject (BioseqPtr bsp) { @@ -3864,6 +4124,7 @@ typedef struct vfcdata { ValNodePtr uids; ValNodePtr unpub; ValNodePtr publshd; + ValNodePtr serial; ValidStructPtr vsp; } VfcData, PNTR VfcPtr; @@ -3886,6 +4147,7 @@ static void MakePubTags (PubdescPtr pdp, Pointer userdata) { Char buf [1024]; + CitGenPtr cgp; Int4 muid = 0, pmid = 0; VfcPtr vfp; ValNodePtr vnp; @@ -3898,6 +4160,16 @@ static void MakePubTags (PubdescPtr pdp, Pointer userdata) muid = vnp->data.intvalue; } else if (vnp->choice == PUB_PMid) { pmid = vnp->data.intvalue; + } else if (vnp->choice == PUB_Gen) { + cgp = (CitGenPtr) vnp->data.ptrvalue; + if (cgp != NULL && cgp->serial_number > 0) { + vnp = ValNodeNew (NULL); + if (vnp != NULL) { + vnp->data.intvalue = (Int4) cgp->serial_number; + vnp->next = vfp->serial; + vfp->serial = vnp; + } + } } } @@ -4018,6 +4290,43 @@ static void CheckFeatCits (SeqFeatPtr sfp, Pointer userdata) } } +static void CheckForCollidingSerials ( + ValidStructPtr vsp, + GatherContextPtr gcp, + ValNodePtr list +) + +{ + Int4 curr, last, max; + Uint2 olditemtype = 0; + Uint2 olditemid = 0; + ValNodePtr vnp; + + if (vsp == NULL || gcp == NULL || list == NULL) return; + + olditemid = gcp->itemID; + olditemtype = gcp->thistype; + gcp->itemID = 0; + gcp->thistype = 0; + + last = (Int4) list->data.intvalue; + max = last; + for (vnp = list->next; vnp != NULL; vnp = vnp->next) { + curr = (Int4) vnp->data.intvalue; + if (last == curr) { + if (curr > max) { + ValidErr (vsp, SEV_WARNING, ERR_GENERIC_CollidingSerialNumbers, + "Multiple publications have serial number %ld", (long) 
curr); + max = curr; + } + } + last = curr; + } + + gcp->itemID = olditemid; + gcp->thistype = olditemtype; +} + static void ValidateFeatCits (SeqEntryPtr sep, ValidStructPtr vsp) { @@ -4038,9 +4347,17 @@ static void ValidateFeatCits (SeqEntryPtr sep, ValidStructPtr vsp) VisitFeaturesInSep (sep, (Pointer) &vfd, CheckFeatCits); + vsp->bssp = NULL; + vsp->bsp = NULL; + vsp->sfp = NULL; + vsp->descr = NULL; + vfd.serial = ValNodeSort (vfd.serial, SortByIntvalue); + CheckForCollidingSerials (vsp, vsp->gcp, vfd.serial); + ValNodeFree (vfd.uids); ValNodeFreeData (vfd.unpub); ValNodeFreeData (vfd.publshd); + ValNodeFree (vfd.serial); } static void ValidateFeatIDs (Uint2 entityID, ValidStructPtr vsp) @@ -4097,6 +4414,111 @@ static void ValidateFeatIDs (Uint2 entityID, ValidStructPtr vsp) } } +typedef struct vsicdata { + ValidStructPtr vsp; + ValNodePtr headid; + ValNodePtr tailid; +} VsicData, PNTR VsicDataPtr; + +static void CaptureTextSeqIDs (BioseqPtr bsp, Pointer userdata) + +{ + Char buf [64]; + SeqIdPtr sip; + VsicDataPtr vdp; + ValNodePtr vnp; + + if (bsp == NULL || userdata == NULL) return; + vdp = (VsicDataPtr) userdata; + + for (sip = bsp->id; sip != NULL; sip = sip->next) { + if (sip->choice == SEQID_GI || sip->choice == SEQID_GIBBSQ || sip->choice == SEQID_GIBBMT) continue; + SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1); + vnp = ValNodeCopyStr (&(vdp->tailid), 0, buf); + if (vdp->headid == NULL) { + vdp->headid = vnp; + } + vdp->tailid = vnp; + } +} + +static ValNodePtr UniqueValNodeCaseSensitive (ValNodePtr list) + +{ + CharPtr last; + ValNodePtr next; + Pointer PNTR prev; + CharPtr str; + ValNodePtr vnp; + + if (list == NULL) return NULL; + last = (CharPtr) list->data.ptrvalue; + vnp = list->next; + prev = (Pointer PNTR) &(list->next); + while (vnp != NULL) { + next = vnp->next; + str = (CharPtr) vnp->data.ptrvalue; + if (StringCmp (last, str) == 0) { + vnp->next = NULL; + *prev = next; + ValNodeFreeData (vnp); + } else { + last = (CharPtr) 
vnp->data.ptrvalue; + prev = (Pointer PNTR) &(vnp->next); + } + vnp = next; + } + + return list; +} + +static void ValidateSeqIdCase (SeqEntryPtr sep, ValidStructPtr vsp) + +{ + CharPtr curr; + GatherContext gc; + GatherContextPtr gcp; + CharPtr prev; + VsicData vd; + ValNodePtr vnp; + + if (vsp == NULL || sep == NULL) return; + + MemSet ((Pointer) &gc, 0, sizeof (GatherContext)); + MemSet ((Pointer) &vd, 0, sizeof (VsicData)); + + gcp = &gc; + vsp->gcp = &gc; + vsp->bssp = NULL; + vsp->bsp = NULL; + vsp->sfp = NULL; + vsp->descr = NULL; + vd.vsp = vsp; + + VisitBioseqsInSep (sep, (Pointer) &vd, CaptureTextSeqIDs); + vd.headid = ValNodeSort (vd.headid, SortVnpByString); + vd.headid = UniqueValNodeCaseSensitive (vd.headid); + + curr = NULL; + prev = NULL; + for (vnp = vd.headid; vnp != NULL; vnp = vnp->next, prev = curr) { + curr = (CharPtr) vnp->data.ptrvalue; + if (StringHasNoText (curr)) continue; + if (StringHasNoText (prev)) continue; + if (StringICmp (curr, prev) != 0) continue; + if (StringCmp (curr, prev) == 0) continue; + ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_CaseDifferenceInSeqID, + "Sequence identifier differs only by case - %s and %s", curr, prev); + } + + vsp->bssp = NULL; + vsp->bsp = NULL; + vsp->sfp = NULL; + vsp->descr = NULL; + + ValNodeFreeData (vd.headid); +} + static void LookForNC (BioseqPtr bsp, Pointer userdata) { @@ -4143,6 +4565,120 @@ static void LookForHTG (SeqDescrPtr sdp, Pointer userdata) } } +static void LookForSMUPD (SeqDescrPtr sdp, Pointer userdata) + +{ + BoolPtr is_smupdp; + UserObjectPtr uop; + + if (sdp == NULL || userdata == NULL) return; + if (sdp->choice != Seq_descr_user) return; + + uop = (UserObjectPtr) sdp->data.ptrvalue; + if (uop == NULL) return; + + if (StringICmp (uop->_class, "SMART_V1.0") == 0) { + + is_smupdp = (BoolPtr) userdata; + *is_smupdp = TRUE; + } +} + +static void SetPubScratchData (SeqDescrPtr sdp, Pointer userdata) + +{ + AuthListPtr alp; + Char buf [2048]; + CitGenPtr cgp; + CharPtr consortium, str, 
tmp; + ValNodePtr vnp; + ObjValNodePtr ovp; + PubdescPtr pdp; + + if (sdp == NULL || sdp->choice != Seq_descr_pub || sdp->extended == 0) return; + ovp = (ObjValNodePtr) sdp; + pdp = (PubdescPtr) sdp->data.ptrvalue; + if (pdp == NULL) return; + + vnp = pdp->pub; + + /* skip over just serial number */ + + if (vnp != NULL && vnp->choice == PUB_Gen && vnp->next != NULL) { + cgp = (CitGenPtr) vnp->data.ptrvalue; + if (cgp != NULL) { + if (StringNICmp ("BackBone id_pub", cgp->cit, 15) != 0) { + if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) { + vnp = vnp->next; + } + } + } + } + + if (PubLabelUnique (vnp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) { + alp = GetAuthListPtr (pdp, NULL); + if (alp != NULL) { + consortium = NULL; + str = GetAuthorsString (GENBANK_FMT, alp, &consortium, NULL, NULL); + tmp = MemNew (StringLen (buf) + StringLen (str) + StringLen (consortium) + 10); + if (tmp != NULL) { + StringCpy (tmp, buf); + if (StringDoesHaveText (str)) { + StringCat (tmp, "; "); + StringCat (tmp, str); + } + if (StringDoesHaveText (consortium)) { + StringCat (tmp, "; "); + StringCat (tmp, consortium); + } + ovp->idx.scratch = tmp; + } + MemFree (str); + MemFree (consortium); + } + } +} + +static void ClearPubScratchData (SeqDescrPtr sdp, Pointer userdata) + +{ + ObjValNodePtr ovp; + + if (sdp == NULL || sdp->choice != Seq_descr_pub || sdp->extended == 0) return; + ovp = (ObjValNodePtr) sdp; + ovp->idx.scratch = MemFree (ovp->idx.scratch); +} + +typedef struct frd { + ValidStructPtr vsp; + GatherContextPtr gcp; + /* + CharPtr string; + */ +} FindRepData, PNTR FindRepPtr; + +static void FindRepValidate (Uint2 entityID, Uint2 itemID, Uint2 itemtype, Pointer userdata) + +{ + FindRepPtr frp; + GatherContextPtr gcp; + ValidStructPtr vsp; + + frp = (FindRepPtr) userdata; + vsp = frp->vsp; + gcp = frp->gcp; + + gcp->entityID = entityID; + gcp->itemID = itemID; + gcp->thistype = itemtype; + + ValidErr (vsp, SEV_ERROR, 
ERR_GENERIC_EmbeddedScript, "Script tag found in item"); +} + +static CharPtr findrepstrs [] = { + "<script", "<object", "<applet", "<embed", "<form", "javascript:", NULL +}; + NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) { Uint2 entityID = 0; @@ -4164,7 +4700,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) SeqEntryPtr oldsep; ErrSev oldsev; ObjMgrDataPtr omdp; - SeqEntryPtr topsep; + SeqEntryPtr topsep = NULL; SeqEntryPtr tmp; ValNodePtr bsplist; ErrSev sev; @@ -4172,6 +4708,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) Boolean isGPS = FALSE; Boolean isPatent = FALSE; Boolean isPDB = FALSE; + FindRepData frd; if (sep == NULL || vsp == NULL) return FALSE; @@ -4195,6 +4732,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) topsep = GetTopSeqEntryForEntityID (entityID); VisitGraphsInSep (topsep, (Pointer) &featprob, CheckGraphPacking); VisitFeaturesInSep (topsep, (Pointer) &featprob, CountGeneXrefs); + VisitFeaturesInSep (topsep, (Pointer) &featprob, CountFeatLocIdTypes); VisitBioseqsInSep (topsep, (Pointer) &featprob, CheckTpaHist); } else { @@ -4247,12 +4785,20 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) vsp->is_htg_in_sep = FALSE; VisitDescriptorsInSep (sep, (Pointer) &(vsp->is_htg_in_sep), LookForHTG); + vsp->is_smupd_in_sep = FALSE; + VisitDescriptorsInSep (sep, (Pointer) &(vsp->is_smupd_in_sep), LookForSMUPD); vsp->is_refseq_in_sep = FALSE; VisitBioseqsInSep (sep, (Pointer) &(vsp->is_refseq_in_sep), LookForNC); + vsp->feat_loc_has_gi = featprob.loc_has_gi; + vsp->feat_prod_has_gi = featprob.prod_has_gi; + globalvsp = vsp; /* for spell checker */ while (sep != NULL) { + /* calculate strings for LookForMultipleUnpubPubs test only once for genome product set efficiency */ + VisitDescriptorsInSep (sep, NULL, SetPubScratchData); + MemSet (&gs, 0, sizeof (GatherScope)); gs.scope = sep; /* default is to scope to this 
set */ @@ -4384,6 +4930,10 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) ValidateFeatIDs (gc.entityID, vsp); vsp->gcp = NULL; + vsp->gcp = NULL; + ValidateSeqIdCase (sep, vsp); + vsp->gcp = NULL; + if (vsp->validateAlignments) { vsp->gcp = NULL; ValidateSeqAlignWithinValidator (vsp, sep, vsp->alignFindRemoteBsp, vsp->doSeqHistAssembly); @@ -4392,6 +4942,8 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) SeqEntrySetScope (oldsep); + VisitDescriptorsInSep (sep, NULL, ClearPubScratchData); + if (vsp->useSeqMgrIndexes) { /* unlock all pre-locked remote genome components */ @@ -4407,6 +4959,22 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp) sep = NULL; } + MemSet ((Pointer) &gc, 0, sizeof (GatherContext)); + gcp = &gc; + gc.entityID = ObjMgrGetEntityIDForChoice (sep); + vsp->gcp = gcp; + frd.vsp = vsp; + frd.gcp = gcp; + + limit = vsp->validationLimit; + if (limit == VALIDATE_ALL) { + /* + frd.string = "?"; + */ + FindStringsInEntity (entityID, findrepstrs, FALSE, FALSE, FALSE, UPDATE_NEVER, + NULL, NULL, NULL, TRUE, FindRepValidate, (Pointer) &frd); + } + if (do_many) { for (i = 0; i < 6; i++) vsp->errors[i] = errors[i]; @@ -5284,16 +5852,23 @@ static void ValidateIDSetAgainstDb (GatherContextPtr gcp, ValidStructPtr vsp, Bi } typedef struct enrun { - Int4 ncount; - Int4 maxrun; + GatherContextPtr gcp; + ValidStructPtr vsp; + Int4 ncount; + Int4 maxrun; + Int4 seqpos; + Boolean showAll; + Boolean inNrun; } RunOfNs, PNTR RunOfNsPtr; static void LIBCALLBACK CountAdjacentProc (CharPtr sequence, Pointer userdata) { - Char ch; - RunOfNsPtr ronp; - CharPtr str; + Char ch; + GatherContextPtr gcp; + RunOfNsPtr ronp; + CharPtr str; + ValidStructPtr vsp; ronp = (RunOfNsPtr) userdata; if (sequence == NULL || ronp == NULL) return; @@ -5301,20 +5876,29 @@ static void LIBCALLBACK CountAdjacentProc (CharPtr sequence, Pointer userdata) str = sequence; ch = *str; while (ch != '\0') { + 
(ronp->seqpos)++; if (ch == 'N') { (ronp->ncount)++; if (ronp->ncount > ronp->maxrun) { ronp->maxrun = ronp->ncount; } + ronp->inNrun = TRUE; } else { + if (ronp->inNrun && ronp->showAll && ronp->ncount >= 100) { + vsp = ronp->vsp; + gcp = ronp->gcp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld", + (long) ronp->ncount, (long) (ronp->seqpos - ronp->ncount + 1)); + } ronp->ncount = 0; + ronp->inNrun = FALSE; } str++; ch = *str; } } -static Int4 CountAdjacentNsInSeqLit (SeqLitPtr slitp, Boolean is_na) +static Int4 CountAdjacentNsInSeqLit (GatherContextPtr gcp, SeqLitPtr slitp, Boolean is_na) { BioseqPtr bsp; @@ -5340,8 +5924,13 @@ static Int4 CountAdjacentNsInSeqLit (SeqLitPtr slitp, Boolean is_na) bsp->length = slitp->length; bsp->id = SeqIdParse ("lcl|countseqlitns"); + ron.gcp = gcp; + ron.vsp = (ValidStructPtr) (gcp->userdata); ron.ncount = 0; ron.maxrun = 0; + ron.seqpos = 0; + ron.showAll = FALSE; + ron.inNrun = FALSE; SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ron, CountAdjacentProc); @@ -5572,6 +6161,10 @@ static void ValidateBioseqInst (GatherContextPtr gcp) Boolean hasGi = FALSE; SeqHistPtr hist; IntFuzzPtr ifp; + Int4 adjacent_N_gap_position; + Boolean adjacent_N_and_gap; + Boolean in_gap; + Boolean in_N; Boolean isActiveFin = FALSE; Boolean isGB = FALSE; Boolean isPatent = FALSE; @@ -6001,12 +6594,6 @@ static void ValidateBioseqInst (GatherContextPtr gcp) oldItemID = gcp->itemID; oldItemtype = gcp->thistype; - if (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext) != NULL) { - gcp->entityID = dcontext.entityID; - gcp->itemID = dcontext.itemID; - gcp->thistype = OBJ_SEQDESC; - } - if (ISA_aa (bsp->mol)) { if (bsp->topology > 1) { /* not linear */ ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_CircularProtein, "Non-linear topology set on protein"); @@ -6276,6 +6863,47 @@ static void ValidateBioseqInst (GatherContextPtr gcp) } } + if (ISA_na (bsp->mol) && bsp->repr == 
Seq_repr_delta && DeltaLitOnly (bsp)) { + if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) { + ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open StreamCache"); + return; + } + in_gap = FALSE; + in_N = FALSE; + adjacent_N_and_gap = FALSE; + adjacent_N_gap_position = 0; + for (len = 0; len < bsp->length; len++) { + residue = StreamCacheGetResidue (&sc); + if (residue == '-') { + if (in_N) { + adjacent_N_and_gap = TRUE; + if (adjacent_N_gap_position == 0) { + adjacent_N_gap_position = len; + } + } + in_N = FALSE; + in_gap = TRUE; + } else if (residue == 'N') { + if (in_gap) { + adjacent_N_and_gap = TRUE; + if (adjacent_N_gap_position == 0) { + adjacent_N_gap_position = len; + } + } + in_gap = FALSE; + in_N = TRUE; + } else { + in_gap = FALSE; + in_N = FALSE; + } + } + if (adjacent_N_and_gap) { + ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InternalNsAdjacentToGap, + "Ambiguous residue N is adjacent to a gap around position %ld", + (long) adjacent_N_gap_position); + } + } + if ((bsp->repr == Seq_repr_seg) || (bsp->repr == Seq_repr_ref)) { /* check segmented sequence */ head.choice = SEQLOC_MIX; head.data.ptrvalue = bsp->seq_ext; @@ -6499,19 +7127,19 @@ static void ValidateBioseqInst (GatherContextPtr gcp) } if (mip != NULL) { if (mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) { - runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na (bsp->mol)); + runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol)); if (runsofn > 80) { - ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len); + ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1)); } } else if (mip->tech == MI_TECH_wgs || mip->tech == MI_TECH_composite_wgs_htgs) { - runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na 
(bsp->mol)); + runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol)); if (runsofn > 80) { - ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len); + ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1)); } } else { - runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na (bsp->mol)); + runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol)); if (runsofn > 100) { - ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len); + ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1)); } } } @@ -6547,19 +7175,31 @@ static void ValidateBioseqInst (GatherContextPtr gcp) } if ((!isNTorNC) && (! 
is_gps) && mip->tech != MI_TECH_htgs_0 && mip->tech != MI_TECH_htgs_1 && mip->tech != MI_TECH_htgs_2 && mip->tech != MI_TECH_htgs_3 && mip->tech != MI_TECH_wgs && - mip->tech != MI_TECH_unknown && mip->tech != MI_TECH_standard) { + mip->tech != MI_TECH_composite_wgs_htgs && mip->tech != MI_TECH_unknown && mip->tech != MI_TECH_standard) { ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadDeltaSeq, "Delta seq technique should not be [%d]", (int) (mip->tech)); } } } else if (bsp->repr == Seq_repr_raw) { + ron.gcp = gcp; + ron.vsp = vsp; ron.ncount = 0; ron.maxrun = 0; + ron.seqpos = 0; + ron.showAll = TRUE; + ron.inNrun = FALSE; SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ron, CountAdjacentProc); - if (ron.maxrun > 100) { + if (ron.inNrun && ron.showAll && ron.ncount >= 100) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld", + (long) ron.ncount, (long) (ron.seqpos - ron.ncount + 1)); + } + + /* + if (ron.maxrun >= 100) { ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence", (long) ron.maxrun); } + */ } if (bsp->repr == Seq_repr_delta) { @@ -7086,12 +7726,11 @@ static void LookForMultiplePubs (ValidStructPtr vsp, GatherContextPtr gcp, SeqDe static void LookForMultipleUnpubPubs (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp) { - AuthListPtr alp; Char buf [2048]; - CitGenPtr cgp; - CharPtr consortium, last, str, tmp; + CharPtr last, str; SeqMgrDescContext dcontext; ValNodePtr list = NULL, next, vnp; + ObjValNodePtr ovp; PubdescPtr pdp; SeqDescrPtr sdp; @@ -7099,42 +7738,9 @@ static void LookForMultipleUnpubPubs (ValidStructPtr vsp, GatherContextPtr gcp, while (sdp) { pdp = (PubdescPtr) sdp->data.ptrvalue; if (pdp != NULL) { - vnp = pdp->pub; - - /* skip over just serial number */ - - if (vnp != NULL && vnp->choice == PUB_Gen && vnp->next != NULL) { - cgp = (CitGenPtr) vnp->data.ptrvalue; - if (cgp != NULL) { - if (StringNICmp ("BackBone id_pub", cgp->cit, 
15) != 0) { - if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) { - vnp = vnp->next; - } - } - } - } - - if (PubLabelUnique (vnp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) { - alp = GetAuthListPtr (pdp, NULL); - if (alp != NULL) { - consortium = NULL; - str = GetAuthorsString (GENBANK_FMT, alp, &consortium, NULL, NULL); - tmp = MemNew (StringLen (buf) + StringLen (str) + StringLen (consortium) + 10); - if (tmp != NULL) { - StringCpy (tmp, buf); - if (StringDoesHaveText (str)) { - StringCat (tmp, "; "); - StringCat (tmp, str); - } - if (StringDoesHaveText (consortium)) { - StringCat (tmp, "; "); - StringCat (tmp, consortium); - } - ValNodeAddStr (&list, 0, tmp); - } - MemFree (str); - MemFree (consortium); - } + ovp = (ObjValNodePtr) sdp; + if (ovp->idx.scratch != NULL) { + ValNodeCopyStr (&list, 0, ovp->idx.scratch); } } sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_pub, &dcontext); @@ -8231,7 +8837,7 @@ static Boolean CountryIsValid (CharPtr name) return FALSE; } -static CharPtr GetDashOrSpace (CharPtr str) +static CharPtr GetDash (CharPtr str) { Char ch; @@ -8239,7 +8845,7 @@ static CharPtr GetDashOrSpace (CharPtr str) if (str == NULL) return NULL; ch = *str; while (ch != '\0') { - if (ch == ' ' || ch == '-') return str; + if (ch == '-') return str; str++; ch = *str; } @@ -8275,11 +8881,11 @@ static Boolean CollectionDateIsValid (CharPtr name) if (StringHasNoText (name)) return FALSE; StringNCpy_0 (str, name, sizeof (str)); - ptr1 = GetDashOrSpace (str); + ptr1 = GetDash (str); if (ptr1 != NULL) { *ptr1 = '\0'; ptr1++; - ptr2 = GetDashOrSpace (ptr1); + ptr2 = GetDash (ptr1); if (ptr2 != NULL) { *ptr2 = '\0'; ptr2++; @@ -8323,6 +8929,42 @@ static Boolean CollectionDateIsValid (CharPtr name) return FALSE; } +static Boolean PrimerSeqIsValid (CharPtr name) + +{ + Char ch; + size_t len; + CharPtr ptr; + + if (StringHasNoText (name)) return FALSE; + len = StringLen (name); + if (len < 1) return FALSE; + + if 
(StringChr (name, ',') != NULL) { + if (name [0] != '(' || name [len - 1] != ')') return FALSE; + } else { + if (StringChr (name, '(') != NULL) return FALSE; + if (StringChr (name, ')') != NULL) return FALSE; + } + + if (StringChr (name, ';') != NULL) return FALSE; + if (StringChr (name, ' ') != NULL) return FALSE; + + ptr = name; + ch = *ptr; + while (ch != '\0') { + if (ch != '(' && ch != ')' && ch != ',') { + if (! (IS_ALPHA (ch))) return FALSE; + ch = TO_UPPER (ch); + if (StringChr ("ABCDGHKMNRSTVWY", ch) == NULL) return FALSE; + } + ptr++; + ch = *ptr; + } + + return TRUE; +} + static CharPtr source_qual_prefixes [] = { "acronym:", "anamorph:", @@ -8349,6 +8991,10 @@ static CharPtr source_qual_prefixes [] = { "forma:", "forma_specialis:", "frequency:", + "fwd_pcr_primer_name", + "fwd_pcr_primer_seq", + "fwd_primer_name", + "fwd_primer_seq", "genotype:", "germline:", "group:", @@ -8367,6 +9013,10 @@ static CharPtr source_qual_prefixes [] = { "plastid_name:", "pop_variant:", "rearranged:", + "rev_pcr_primer_name", + "rev_pcr_primer_seq", + "rev_primer_name", + "rev_primer_seq", "right_primer:", "segment:", "serogroup:", @@ -8489,7 +9139,7 @@ static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSour if (StringHasNoText (countryname)) { countryname = "?"; } - ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname); + ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname); } } else if (ssp->subtype == SUBSRC_chromosome) { chromcount++; @@ -8523,6 +9173,10 @@ static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSour if (! CollectionDateIsValid (ssp->name)) { ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionDate, "Collection_date format is not in DD-Mmm-YYYY format"); } + } else if (ssp->subtype == SUBSRC_fwd_primer_seq || ssp->subtype == SUBSRC_rev_primer_seq) { + if (! 
PrimerSeqIsValid (ssp->name)) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence, "PCR primer sequence format is incorrect"); + } } ssp = ssp->next; } @@ -8731,6 +9385,9 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V OrgRefPtr this_org = NULL, that_org = NULL; int tmpval; Char buf1[20], buf2[20]; + EMBLBlockPtr ebp; + GBBlockPtr gbp; + ValNodePtr keywords = NULL; PubdescPtr pdp; MolInfoPtr mip; Uint2 olditemtype = 0; @@ -8739,6 +9396,8 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V GatherContextPtr gcp = NULL; CharPtr str; SeqFeatPtr sfp; + Boolean tpa_exp; + Boolean tpa_inf; BioseqPtr bsp; DatePtr dp; SeqMgrFeatContext fcontext; @@ -8859,12 +9518,24 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple GenBank blocks"); else bvsp->last_gb = vnp; + if (vnp != NULL) { + gbp = (GBBlockPtr) vnp->data.ptrvalue; + if (gbp != NULL) { + keywords = gbp->keywords; + } + } break; case Seq_descr_embl: if (bvsp->last_embl != NULL) ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple EMBL blocks"); else bvsp->last_embl = vnp; + if (vnp != NULL) { + ebp = (EMBLBlockPtr) vnp->data.ptrvalue; + if (ebp != NULL) { + keywords = ebp->keywords; + } + } break; case Seq_descr_pir: if (bvsp->last_pir != NULL) @@ -9109,6 +9780,20 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V break; } + if (keywords != NULL) { + tpa_exp = FALSE; + tpa_inf = FALSE; + for (vnp = keywords; vnp != NULL; vnp = vnp->next) { + if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) { + tpa_exp = TRUE; + } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) { + tpa_inf = TRUE; + } + } + if (tpa_exp && tpa_inf) { + ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "TPA:experimental and TPA:inferential should not both be in the same set of 
keywords"); + } + } if (gcp != NULL) { gcp->itemID = olditemid; @@ -9223,6 +9908,20 @@ static Boolean GPSorNTorNC (SeqEntryPtr sep, SeqLocPtr location) return FALSE; } +static Boolean IsGenBankAccn (SeqEntryPtr sep, SeqLocPtr location) +{ + BioseqPtr bsp; + SeqIdPtr sip; + + bsp = BioseqFindFromSeqLoc (location); + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + if (sip->choice == SEQID_GENBANK) return TRUE; + } + } + return FALSE; +} + static Boolean NGorNT (SeqEntryPtr sep, SeqLocPtr location, BoolPtr is_nc) { BioseqPtr bsp; @@ -10110,6 +10809,54 @@ static Boolean HaveUniqueFeatIDXrefs (SeqFeatXrefPtr xref1, SeqFeatXrefPtr xref2 return FALSE; } +#define SMALL_RIBOSOMAL_SUBUNIT 1 +#define INTERNAL_SPACER_1 2 +#define MIDDLE_RIBOSOMAL_SUBUNIT 3 +#define INTERNAL_SPACER_2 4 +#define LARGE_RIBOSOMAL_SUBUNIT 5 +#define INTERNAL_SPACER_X 6 +#define TRANSFER_RNA 7 + +static Int2 WhichRNA (SeqFeatPtr sfp) + +{ + RnaRefPtr rrp; + CharPtr str; + + if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return 0; + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + if (rrp == NULL) return 0; + if (rrp->type == 3) { + return TRANSFER_RNA; + } + if (rrp->ext.choice != 1) return 0; + str = (CharPtr) rrp->ext.value.ptrvalue; + if (StringHasNoText (str)) return 0; + if (rrp->type == 4) { + if (StringNICmp (str, "small ", 6) == 0) return SMALL_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "18S ", 4) == 0) return SMALL_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "5.8S ", 5) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "large ", 6) == 0) return LARGE_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "26S ", 4) == 0) return LARGE_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "28S ", 4) == 0) return LARGE_RIBOSOMAL_SUBUNIT; + /* variant spellings */ + if (StringNICmp (str, "18 ", 3) == 0) return SMALL_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "5.8 ", 4) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "26 ", 3) == 0) return 
LARGE_RIBOSOMAL_SUBUNIT; + if (StringNICmp (str, "28 ", 3) == 0) return LARGE_RIBOSOMAL_SUBUNIT; + } + if (rrp->type == 255) { + if (StringICmp (str, "internal transcribed spacer 1") == 0) return INTERNAL_SPACER_1; + if (StringICmp (str, "internal transcribed spacer 2") == 0) return INTERNAL_SPACER_2; + /* variant spellings */ + if (StringICmp (str, "internal transcribed spacer1") == 0) return INTERNAL_SPACER_1; + if (StringICmp (str, "internal transcribed spacer2") == 0) return INTERNAL_SPACER_2; + if (StringICmp (str, "internal transcribed spacer") == 0) return INTERNAL_SPACER_X; + if (StringICmp (str, "ITS") == 0) return INTERNAL_SPACER_X; + } + return 0; +} + static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bvsp) { @@ -10142,6 +10889,11 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv CharPtr lastLabel; CharPtr message; Int2 i; + Boolean isCuratedFlybase = FALSE; + Boolean isDrosophila = FALSE; + Boolean isGenBankAccn = FALSE; + Boolean isGPSorNTorNC = FALSE; + Boolean isViral = FALSE; Int2 j; CdRegionPtr crp; Uint1 frame; @@ -10150,6 +10902,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv int overlapPepSev; BioSourcePtr biop = NULL, lastbiop; OrgRefPtr orp = NULL; + OrgNamePtr onp = NULL; Int4 fiveUTRright; Int4 cdsRight; Int4 threeUTRright; @@ -10162,12 +10915,13 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv Int2 numBadFullSource; SubSourcePtr sbsp; Int2 numgene, numcds, nummrna, numcdsproducts, nummrnaproducts, - numcdspseudo, nummrnapseudo; + numcdspseudo, nummrnapseudo, lastrnatype, thisrnatype; Boolean cds_products_unique = TRUE, mrna_products_unique = TRUE, suppress_duplicate_messages = FALSE, pseudo; SeqIdPtr sip; Char buf [64]; SeqFeatXrefPtr xref = NULL; + CharPtr except_text = NULL; ValNodePtr vnp, cds_prod_head = NULL, mrna_prod_head = NULL, lastcdsprod = NULL, lastmrnaprod = NULL; @@ -10350,10 +11104,31 @@ 
static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv ValNodeFreeData (cds_prod_head); ValNodeFreeData (mrna_prod_head); + /* + SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop); + */ + BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop); + if (biop != NULL) { + orp = biop->org; + if (orp != NULL) { + /* curated fly source still has duplicate features */ + if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) { + isDrosophila = TRUE; + } + onp = orp->orgname; + if (onp != NULL) { + if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) { + isViral = TRUE; + } + } + } + } + sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); while (sfp != NULL) { leave = TRUE; if (last != NULL) { + ivalssame = FALSE; if (fcontext.left == left && fcontext.right == right && fcontext.featdeftype == featdeftype) { if (fcontext.strand == strand || strand == Seq_strand_unknown || fcontext.strand == Seq_strand_unknown) { ivalssame = TRUE; @@ -10389,21 +11164,30 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv featdeftype == FEATDEF_REGION || featdeftype == FEATDEF_misc_feature || featdeftype == FEATDEF_STS || featdeftype == FEATDEF_variation) { severity = SEV_WARNING; } else { - if (! GPSorNTorNC (vsp->sep, sfp->location)) { - severity = SEV_WARNING; - } else { - if (orp == NULL) { - SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop); - if (biop != NULL) { - orp = biop->org; + if (isGPSorNTorNC || GPSorNTorNC (vsp->sep, sfp->location)) { + isGPSorNTorNC = TRUE; + if (! isCuratedFlybase) { + if (isDrosophila) { + isCuratedFlybase = TRUE; } } - if (orp != NULL) { + if (isCuratedFlybase) { /* curated fly source still has duplicate features */ - if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) { - severity = SEV_WARNING; + severity = SEV_WARNING; + } + } else if (isGenBankAccn || IsGenBankAccn (vsp->sep, sfp->location)) { + isGenBankAccn = TRUE; + if (! 
isCuratedFlybase) { + if (isDrosophila) { + isCuratedFlybase = TRUE; } } + if (isCuratedFlybase) { + /* curated fly source still has duplicate features */ + severity = SEV_WARNING; + } + } else { + severity = SEV_WARNING; } } /* if different CDS frames, lower to warning */ @@ -10447,7 +11231,15 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv } else { if (suppress_duplicate_messages && (featdeftype == FEATDEF_CDS || featdeftype == FEATDEF_mRNA) && HaveUniqueFeatIDXrefs (xref, sfp->xref)) { /* do not report CDS or mRNA if every one has a unique product and unique featID xrefs */ + } else if (featdeftype == FEATDEF_GENE && + StringStr (sfp->except_text, "dicistronic gene") != NULL && + StringStr (except_text, "dicistronic gene") != NULL && + isCuratedFlybase) { + /* do not report genes marked dicistronic */ } else { + if (featdeftype == FEATDEF_GENE && isViral && (sfp->partial || last->partial)) { + severity = SEV_WARNING; + } ValidErr (vsp, severity, ERR_SEQ_FEAT_DuplicateFeat, "Features have identical intervals, but labels differ"); } } @@ -10505,6 +11297,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv ivals = fcontext.ivals; sap = fcontext.sap; xref = sfp->xref; + except_text = sfp->except_text; frame = 0; if (sfp->data.choice == SEQFEAT_CDREGION) { crp = (CdRegionPtr) sfp->data.value.ptrvalue; @@ -10621,22 +11414,18 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); while (sfp != NULL) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; if (sfp->idx.subtype == FEATDEF_3UTR && utr3count < 2) { if (fcontext.strand != Seq_strand_minus) { ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on minus strand"); } else if (threeUTRright > 0) { if (threeUTRright + 1 != fcontext.left) { - if (gcp != NULL) { - 
gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR"); - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } threeUTRright = fcontext.right; @@ -10644,18 +11433,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv cdsRight = fcontext.right; if (threeUTRright > 0 && firstCDS) { if (threeUTRright + 1 != fcontext.left) { - if (gcp != NULL) { - gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR"); - vsp->sfp = NULL; - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } firstCDS = FALSE; @@ -10664,17 +11442,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR is not on minus strand"); } else if (cdsRight > 0) { if (cdsRight + 1 != fcontext.left) { - if (gcp != NULL) { - gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS"); - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } threeUTRright = fcontext.right; @@ -10686,6 +11454,12 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); while (sfp != NULL) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; if (sfp->idx.subtype == FEATDEF_5UTR && utr5count < 2) { if (fcontext.strand == Seq_strand_minus) { if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) { @@ -10699,22 +11473,11 @@ 
static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv cdsRight = fcontext.right; if (fiveUTRright > 0 && firstCDS) { if (fiveUTRright + 1 != fcontext.left) { - if (gcp != NULL) { - gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) { /* ignore */ } else { ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS"); } - vsp->sfp = NULL; - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } firstCDS = FALSE; @@ -10723,31 +11486,11 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on plus strand"); } else if (threeUTRright > 0) { if (threeUTRright + 1 != fcontext.left) { - if (gcp != NULL) { - gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR"); - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } else if (cdsRight > 0) { if (cdsRight + 1 != fcontext.left) { - if (gcp != NULL) { - gcp->itemID = fcontext.itemID; - gcp->thistype = OBJ_SEQFEAT; - } - vsp->descr = NULL; - vsp->sfp = sfp; ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR"); - if (gcp != NULL) { - gcp->itemID = olditemid; - gcp->thistype = olditemtype; - } } } threeUTRright = fcontext.right; @@ -10757,6 +11500,148 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv } } + if (! 
bvsp->is_mrna) { + last = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &fcontext); + if (last != NULL) { + lastrnatype = WhichRNA (last); + left = fcontext.left; + right = fcontext.right; + strand = fcontext.strand; + sfp = SeqMgrGetNextFeature (bsp, last, SEQFEAT_RNA, 0, &fcontext); + while (sfp != NULL) { + thisrnatype = WhichRNA (sfp); + if (fcontext.strand == strand || (strand != Seq_strand_minus && fcontext.strand != Seq_strand_minus)) { + if (lastrnatype != 0 && thisrnatype != 0) { + if (right + 1 < fcontext.left) { + /* gap */ + if (strand == Seq_strand_minus) { + if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) { + /* okay in mitochondria */ + } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component"); + } + } else { + if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) { + /* okay in mitochondria */ + } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + 
vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component"); + } + } + } else if (right + 1 > fcontext.left) { + /* overlaps */ + if (strand == Seq_strand_minus) { + if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "tRNA overlaps adjacent rRNA component"); + } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS overlaps adjacent rRNA component"); + } + } else { + if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) { + /* okay in mitochondria */ + } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS overlaps adjacent rRNA component"); + } + } + } else { + /* abuts */ 
+ if (strand == Seq_strand_minus) { + if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) { + /* okay in mitochondria */ + } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_1) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != SMALL_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "Problem with order of abutting rRNA components"); + } + } else { + if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) || + (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) { + /* okay in mitochondria */ + } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_1) || + (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_2) || + (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) || + (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != LARGE_RIBOSOMAL_SUBUNIT)) { + if (gcp != NULL) { + gcp->itemID = fcontext.itemID; + gcp->thistype = OBJ_SEQFEAT; + } + vsp->descr = NULL; + vsp->sfp = sfp; + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "Problem with order of abutting rRNA components"); + } + } + } + } + } + last = sfp; + left = fcontext.left; + right = fcontext.right; + strand = fcontext.strand; + lastrnatype = thisrnatype; + sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &fcontext); + } + } + } + + vsp->sfp = NULL; + if (gcp != NULL) { + gcp->itemID = olditemid; + gcp->thistype = olditemtype; + } + mrna = SeqMgrGetRNAgivenProduct (bsp, &fcontext); if (mrna != NULL) { genomicgrp 
= SeqMgrGetGeneXref (mrna); @@ -12144,9 +13029,7 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt GeneticCodePtr gncp; Uint2 idx; Int2 j; - SeqEntryPtr sep; ErrSev sev = SEV_ERROR; - Uint1 shift; SeqMapTablePtr smtp; Uint1 taa; ValNodePtr vnp; @@ -12188,8 +13071,11 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt if (trp->codon[j] < 64) { if (codes == NULL) { bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID); + /* sep = GetBestTopParentForData (gcp->entityID, bsp); code = SeqEntryToGeneticCode (sep, NULL, NULL, 0); + */ + BioseqToGeneticCode (bsp, &code, NULL, NULL, NULL, 0, NULL); gncp = GeneticCodeFind (code, NULL); if (gncp == NULL) { gncp = GeneticCodeFind (1, NULL); @@ -12207,12 +13093,14 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt taa = codes[trp->codon[j]]; if (aa > 0 && aa != 255) { if (taa != aa) { - if (aa == 'U') { + if (aa == 'U' || aa == 'O') { sev = SEV_WARNING; } if (aa == 'U' && taa == '*' && trp->codon [j] == 14) { /* selenocysteine normally uses TGA (14), so ignore without requiring exception in record */ - /* TAG (11) is used for pyrrolysine in archaebacteria */ + } else if (aa == 'O' && taa == '*' && trp->codon [j] == 11) { + /* pyrrolysine normally uses TAG (11) in archaebacteria, so ignore without requiring exception in record */ + /* TAA (10) is not yet known to be used for an exceptional amino acid */ } else if (StringISearch (sfp->except_text, "modified codon recognition") == NULL) { ValidErr (vsp, sev, ERR_SEQ_FEAT_TrnaCodonWrong, "tRNA codon does not match genetic code"); @@ -12225,6 +13113,7 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt } if (aa > 0 && aa != 255) { + /* - no gaps now that O and J are added if (aa <= 74) { shift = 0; } else if (aa > 79) { @@ -12232,16 +13121,19 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt } else { shift = 1; } + 
*/ if (aa != '*') { - idx = aa - (64 + shift); + idx = aa - (64 /* + shift */); } else { - idx = 25; + idx = 25; /* termination */ } - if (idx > 0 && idx < 26) { + if (idx > 0 && idx < 28) { /* valid trna amino acid */ } else { ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid"); } + } else { + ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid"); } } @@ -12752,6 +13644,7 @@ static CharPtr legal_exception_strings [] = { "nonconsensus splice site", "modified codon recognition", "alternative start codon", + "dicistronic gene", NULL }; @@ -12760,6 +13653,7 @@ static CharPtr refseq_exception_strings [] = { "unclassified translation discrepancy", "mismatches in transcription", "mismatches in translation", + "adjusted for low-quality genome", NULL }; @@ -12847,7 +13741,9 @@ static void ValidateExceptText (ValidStructPtr vsp, GatherContextPtr gcp, SeqFea typedef struct samecds { Boolean found; SeqMgrFeatContextPtr gcontext; + Uint2 slpTag; Uint1 subtype; + Boolean bypassGeneTest; } SameCds, PNTR SameCdsPtr; static Boolean LIBCALLBACK FindSameCDS (SeqFeatPtr sfp, SeqMgrFeatContextPtr ccontext) @@ -12893,16 +13789,57 @@ static Boolean LIBCALLBACK FindSameCDS (SeqFeatPtr sfp, SeqMgrFeatContextPtr cco return FALSE; } } - } else if (gcontext->left == ccontext->left || gcontext->right == ccontext->right) { - /* if either end of CDS and mRNA is identical, okay to suppress partial warning */ - same->found = TRUE; - return FALSE; + } else if (SeqLocAinB (sfp->location, gcontext->sfp->location) > 0) { + + if (ccontext->strand == Seq_strand_minus || gcontext->strand == Seq_strand_minus) { + if (same->slpTag == SLP_NOSTART && gcontext->partialL) { + if (gcontext->right == ccontext->right) { + same->found = TRUE; + return FALSE; + } + if (gcontext->right > ccontext->right) { + same->bypassGeneTest = TRUE; + return FALSE; + } + } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) { + if (gcontext->left == ccontext->left) { + same->found = 
TRUE; + return FALSE; + } + if (gcontext->left < ccontext->left) { + same->bypassGeneTest = TRUE; + return FALSE; + } + } + + } else { + + if (same->slpTag == SLP_NOSTART && gcontext->partialL) { + if (gcontext->left == ccontext->left) { + same->found = TRUE; + return FALSE; + } + if (gcontext->left < ccontext->left) { + same->bypassGeneTest = TRUE; + return FALSE; + } + } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) { + if (gcontext->right == ccontext->right) { + same->found = TRUE; + return FALSE; + } + if (gcontext->right > ccontext->right) { + same->bypassGeneTest = TRUE; + return FALSE; + } + } + } } } return TRUE; } -static Boolean SameAsCDS (SeqFeatPtr sfp) +static Boolean SameAsCDS (SeqFeatPtr sfp, Uint2 slpTag, BoolPtr bypassGeneTestP) { BioseqPtr bsp; @@ -12915,10 +13852,15 @@ static Boolean SameAsCDS (SeqFeatPtr sfp) if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &gcontext) != sfp) return FALSE; same.found = FALSE; same.gcontext = &gcontext; + same.slpTag = slpTag; same.subtype = sfp->idx.subtype; + same.bypassGeneTest = FALSE; MemSet ((Pointer) &cdsFilt, 0, sizeof (cdsFilt)); cdsFilt [SEQFEAT_CDREGION] = TRUE; SeqMgrExploreFeatures (bsp, (Pointer) &same, FindSameCDS, sfp->location, cdsFilt, NULL); + if (bypassGeneTestP != NULL) { + *bypassGeneTestP = same.bypassGeneTest; + } return same.found; } @@ -13568,6 +14510,49 @@ static void ValidateGoTermsSfp ( } } +static void LookForAccnLocs (SeqIdPtr sip, Pointer userdata) + +{ + BoolPtr bp; + TextSeqIdPtr tsip; + + if (sip == NULL || userdata == NULL) return; + bp = (BoolPtr) userdata; + + switch (sip->choice) { + case SEQID_GENBANK : + case SEQID_EMBL : + case SEQID_DDBJ : + case SEQID_TPG : + case SEQID_TPE : + case SEQID_TPD : + case SEQID_OTHER : + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip != NULL) { + if (StringDoesHaveText (tsip->accession)) { + *bp = TRUE; + } + } + break; + default : + break; + } +} + +static CharPtr infMessage [] = { + "unknown error", + "empty inference 
string", + "bad inference prefix", + "bad inference body", + "single inference field", + "spaces in inference", + "same species misused", + "bad inference accession", + "bad inference accession version", + "accession.version not public", + NULL +}; + NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) { Int2 type, i, j; @@ -13590,7 +14575,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) tRNAPtr trp; GBQualPtr gbq; Boolean pseudo, excpt, conflict, codonqual, - anticodonqual, protidqual, transidqual, ovgenepseudo; + anticodonqual, productqual, protidqual, + transidqual, ovgenepseudo; ImpFeatPtr ifp; GeneRefPtr grp; ProtRefPtr prp; @@ -13641,6 +14627,10 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) Boolean hasxref; CharPtr sfp_old_locus_tag; CharPtr gene_old_locus_tag; + Boolean bypassGeneTest; + Boolean dicistronic = FALSE; + Int2 inferenceCode; + Boolean accn_seqid; vsp = (ValidStructPtr) (gcp->userdata); @@ -13653,6 +14643,26 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) ValidateSeqLoc (vsp, sfp->product, "Product"); + if (vsp->feat_loc_has_gi) { + accn_seqid = FALSE; + VisitSeqIdsInSeqLoc (sfp->location, (Pointer) &accn_seqid, LookForAccnLocs); + if (accn_seqid) { + if (! vsp->is_smupd_in_sep) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature location refers to accession"); + } + } + } + + if (vsp->feat_prod_has_gi) { + accn_seqid = FALSE; + VisitSeqIdsInSeqLoc (sfp->product, (Pointer) &accn_seqid, LookForAccnLocs); + if (accn_seqid) { + if (! 
vsp->is_smupd_in_sep) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature product refers to accession"); + } + } + } + partials[0] = SeqLocPartialCheck (sfp->product); partials[1] = SeqLocPartialCheck (sfp->location); if ((partials[0] != SLP_COMPLETE) || (partials[1] != SLP_COMPLETE) || (sfp->partial)) { /* partialness */ @@ -13729,21 +14739,21 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) for (i = 0; i < 2; i++) { errtype = SLP_NOSTART; for (j = 0; j < 4; j++) { + bypassGeneTest = FALSE; if (partials[i] & errtype) { if (i == 1 && j < 2 && IsCddFeat (sfp)) { /* suppresses warning */ - } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_GENE && SameAsCDS (sfp)) { + } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_GENE && SameAsCDS (sfp, errtype, NULL)) { /* ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem, "%s: %s", parterr[i], parterrs[j]); */ } else if (i == 1 && j < 2 && sfp->idx.subtype == SEQFEAT_GENE && SameAsMRNA (sfp)) { - } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsCDS (sfp)) { - } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsGene (sfp)) { - /* - } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_CDREGION && SameAsMRNA (sfp)) { - */ + } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsCDS (sfp, errtype, &bypassGeneTest)) { + } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && (! bypassGeneTest) && SameAsGene (sfp)) { + } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_CDREGION && SameAsMRNA (sfp) && + PartialAtSpliceSiteOrGap (sfp->location, errtype, &isgap, &badseq)) { } else if (i == 1 && j < 2 && PartialAtSpliceSiteOrGap (sfp->location, errtype, &isgap, &badseq)) { if (! 
isgap) { if (sfp->idx.subtype != FEATDEF_CDS || SplicingNotExpected (sfp)) { @@ -14031,6 +15041,19 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProteinNameEndsInBracket, "Protein name ends with bracket and may contain organism name"); } } + if (StringNICmp (str, "hypothetical protein XP_", 24) == 0) { + bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID); + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + if (sip->choice != SEQID_OTHER) continue; + tsip = (TextSeqIdPtr) sip->data.ptrvalue; + if (tsip == NULL) continue; + if (StringICmp (tsip->accession, str + 21) != 0) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_HpotheticalProteinMismatch, "Hypothetical protein reference does not match accession"); + } + } + } + } } if (str != NULL && sfp->comment != NULL) { if (StringCmp (str, sfp->comment) == 0) { @@ -14126,16 +15149,22 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) } if (rrp->type == 3) { /* tRNA */ anticodonqual = FALSE; + productqual = FALSE; gbq = sfp->qual; while (gbq != NULL) { if (StringICmp (gbq->qual, "anticodon") == 0) { anticodonqual = TRUE; + } else if (StringICmp (gbq->qual, "product") == 0) { + productqual = TRUE; } gbq = gbq->next; } if (anticodonqual) { ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed anticodon qualifier in tRNA"); } + if (productqual) { + ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA"); + } } if (rrp->type == 3 && rrp->ext.choice == 1) { /* tRNA with string extension */ ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA"); @@ -14314,6 +15343,15 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Qualifier other than replace has just quotation marks"); } } + if (StringICmp (gbq->qual, "inference") == 0) { + inferenceCode = 
ValidateInferenceQualifier (gbq->val, TRUE); + if (inferenceCode != VALID_INFERENCE) { + if (inferenceCode < VALID_INFERENCE || inferenceCode > ACC_VERSION_NOT_PUBLIC) { + inferenceCode = VALID_INFERENCE; + } + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidInferenceValue, "Inference qualifier problem - %s", infMessage [(int) inferenceCode]); + } + } } if (sfp->product != NULL) { @@ -14347,7 +15385,13 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) break; } } + bsp = BioseqFindFromSeqLoc (sfp->location); protBsp = BioseqFindFromSeqLoc (sfp->product); + if (bsp != NULL && protBsp != NULL) { + if (bsp == protBsp) { + ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product"); + } + } if (protBsp != NULL && protBsp->id != NULL) { for (sip = protBsp->id; sip != NULL; sip = sip->next) { switch (sip->choice) { @@ -14450,6 +15494,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) if (sfpx == NULL) { ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene, "Feature has gene locus cross-reference but no equivalent gene feature exists"); + } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) { + dicistronic = TRUE; } } } @@ -14460,6 +15506,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) if (sfpx == NULL) { ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene, "Feature has gene locus_tag cross-reference but no equivalent gene feature exists"); + } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) { + dicistronic = TRUE; } } } @@ -14508,8 +15556,12 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp) } ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryGeneXref, "Unnecessary gene cross-reference %s", label); } else { - if (GPSorNTorNC (vsp->sep, sfp->location)) { + if ((! 
dicistronic) && GPSorNTorNC (vsp->sep, sfp->location)) { + /* SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop); + */ + bsp = BioseqFindFromSeqLoc (sfp->location); + BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop); if (biop != NULL) { orp = biop->org; if (orp != NULL) { @@ -14558,6 +15610,7 @@ static CharPtr bypass_mrna_trans_check [] = { "artificial frameshift", "unclassified transcription discrepancy", "mismatches in transcription", + "adjusted for low-quality genome", NULL }; @@ -14873,6 +15926,7 @@ static CharPtr bypass_cds_trans_check [] = { "rearrangement required for product", "unclassified translation discrepancy", "mismatches in translation", + "adjusted for low-quality genome", NULL }; @@ -14909,6 +15963,11 @@ NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp) StreamCache sc; Boolean isgap; Boolean badseq; + BioseqPtr bsp; + SeqIdPtr sip; + Boolean is_ged = FALSE; + Boolean is_refseq = FALSE; + Boolean has_gi = FALSE; if (sfp == NULL) @@ -15155,6 +16214,32 @@ NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp) sev = SEV_WARNING; } if (report_errors || unclassified_except) { + bsp = BioseqFindFromSeqLoc (sfp->location); + if (bsp != NULL) { + for (sip = bsp->id; sip != NULL; sip = sip->next) { + switch (sip->choice) { + case SEQID_GI : + has_gi = TRUE; + break; + case SEQID_GENBANK : + case SEQID_EMBL : + case SEQID_DDBJ : + case SEQID_TPG : + case SEQID_TPE : + case SEQID_TPD : + is_ged = TRUE; + break; + case SEQID_OTHER : + is_refseq = TRUE; + break; + default : + break; + } + } + if (has_gi && is_ged && (! is_refseq)) { + sev = SEV_REJECT; + } + } ValidErr (vsp, sev, ERR_SEQ_FEAT_InternalStop, "%ld internal stops. Genetic code [%d]", (long) stop_count, gccode); } } @@ -15400,6 +16485,13 @@ erret: ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual. 
Skipped"); } } + } else { + if (transl_except) { + has_errors = TRUE; + if (report_errors) { + ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual (but protein is okay). Skipped"); + } + } } if (prot2seq != NULL) @@ -15466,7 +16558,8 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll) if (sfp->excpt) { if (StringISearch (sfp->except_text, "ribosomal slippage") != NULL|| StringISearch (sfp->except_text, "artificial frameshift") != NULL || - StringISearch (sfp->except_text, "nonconsensus splice site") != NULL) { + StringISearch (sfp->except_text, "nonconsensus splice site") != NULL || + StringISearch (sfp->except_text, "adjusted for low-quality genome") != NULL) { report_errors = FALSE; } } @@ -15605,6 +16698,7 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll) } if (((checkAll && (!lastPartial)) || ctr < total) && (stp < (len - 2))) { /* check donor on all but last exon and on sequence */ + tbuf[0] = '\0'; StreamCacheSetPosition (&sc, stp + 1); residue1 = StreamCacheGetResidue (&sc); residue2 = StreamCacheGetResidue (&sc); @@ -15694,6 +16788,7 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll) tbuf[0] = '\0'; if (bsp == NULL) { StringCpy (tbuf, "?"); + SeqIdWrite (sip, tbuf, PRINTID_FASTA_SHORT, 39); } else if (vsp->suppressContext || vsp->convertGiToAccn) { WorstBioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT); } else { diff --git a/api/valid.h b/api/valid.h index 833594ee..33af92a7 100644 --- a/api/valid.h +++ b/api/valid.h @@ -29,7 +29,7 @@ * * Version Creation Date: 1/1/94 * -* $Revision: 6.20 $ +* $Revision: 6.22 $ * * File Description: Sequence editing utilities * @@ -39,6 +39,12 @@ * ------- ---------- ----------------------------------------------------- * * $Log: valid.h,v $ +* Revision 6.22 2006/02/16 19:34:47 kans +* use vsp->is_smupd_in_sep to suppress ERR_SEQ_FEAT_FeatureRefersToAccession +* +* Revision 6.21 2006/01/26 19:54:26 
kans +* added ERR_SEQ_FEAT_FeatureRefersToAccession to look for inconsistent use of gi and accession (with or without version) for sfp->location or sfp->product references in a single blob +* * Revision 6.20 2005/06/08 15:26:06 kans * added is_htg_in_sep and is_refseq_in_sep flags to vsp to avoid repetitive checks * @@ -236,6 +242,9 @@ typedef struct validstruct { TextFsaPtr sourceQualTags; /* for detecting structured qual tags in notes */ Boolean is_htg_in_sep; /* record has technique of htgs 0 through htgs 3 */ Boolean is_refseq_in_sep; /* record has seqid of type other (refseq) */ + Boolean is_smupd_in_sep; /* record in INSD internal processing */ + Boolean feat_loc_has_gi; /* at least one feature has a gi location reference */ + Boolean feat_prod_has_gi; /* at least one feature has a gi product reference */ } ValidStruct, PNTR ValidStructPtr; NLM_EXTERN Boolean ValidateSeqEntry PROTO((SeqEntryPtr sep, ValidStructPtr vsp)); diff --git a/api/valid.msg b/api/valid.msg index 1e20cb9a..97bc4de9 100644 --- a/api/valid.msg +++ b/api/valid.msg @@ -194,7 +194,15 @@ $^ LeadingX, 54 The protein sequence starts with one or more X (unknown) amino acids. $^ InternalNsInSeqRaw, 55 -There are runs of many Ns inside a raw sequence. +There are runs of greater than 100 Ns within sequence. Please describe +what these Ns represent with your sequence submission. + +$^ InternalNsAdjacentToGap, 56 +There are Ns directly adjacent to a SeqLit gap in a delta Bioseq. + +$^ CaseDifferenceInSeqID, 57 +Multiple Bioseqs have the same Seq-id except for capitalization. Sequence +identifiers must be unique in a case-insensitive manner within a record. $$ SEQ_DESCR, 2 @@ -336,6 +344,9 @@ the descriptor should be removed. $^ BadCollectionDate, 35 The collection date is not in the required format. +$^ BadPCRPrimerSequence, 36 +The PCR primer sequence has illegal characters or non-IUPAC nucleotides. 
+ $$ GENERIC, 3 $^ NonAsciiAsn, 1 @@ -367,7 +378,15 @@ $^ BadDate, 8 There are bad values for month, day, or year in a date. $^ StructuredCitGenCit, 9 -The publication has title or journal embedded in the unstructured citgen.cit field. +The publication has title or journal embedded in the unstructured citgen.cit +field. + +$^ CollidingSerialNumbers, 10 +Multiple publications have the same serial number explicitly recorded in the +data. + +$^ EmbeddedScript, 11 +Script or other markup tags should not be used in sequence record fields. $$ SEQ_PKG, 4 @@ -867,6 +886,31 @@ The old_locus_tag qualifier on a feature does not match that on the overlapping $^ DuplicateGeneOntologyTerm, 111 A feature has multiple identical Gene Ontology (GO) term specifications. +$^ InvalidInferenceValue, 112 +The value of the inference qualifier is constrained by agreement of the international +nucleotide sequence database collaboration. This value does not conform to those +constraints. Please see the feature table documentation for more information. + +$^ HpotheticalProteinMismatch, 113 +There is a mismatch between the accession cited by the hypothetical protein claim +and the actual accession of the record. + +$^ FeatureRefersToAccession, 114 +There is a mixture of features referring to sequence by gi numbers and by accession. +This inconsistency is likely due to incomplete processing by software. + +$^ SelfReferentialProduct, 115 +A feature product points to the same sequence that the feature location does. +The product must point to a different sequence that is the biological product +of the first, due to transcription, translation, or peptide processing. + +$^ ITSdoesNotAbutRRNA, 116 +The internal transcribed spacer misc_RNA features should exactly abut the flanking rRNA features. + +$^ FeatureSeqIDCaseDifference, 117 +Feature location and referenced Bioseq have the same Seq-id except for capitalization. +Sequence identifiers must be unique in a case-insensitive manner within a record. 
+ $$ SEQ_ALIGN, 6 $^ SeqIdProblem, 1 diff --git a/api/validerr.h b/api/validerr.h index c0f972b4..cfad0e33 100644 --- a/api/validerr.h +++ b/api/validerr.h @@ -57,6 +57,8 @@ #define ERR_SEQ_INST_OverlappingDeltaRange 1,53 #define ERR_SEQ_INST_LeadingX 1,54 #define ERR_SEQ_INST_InternalNsInSeqRaw 1,55 +#define ERR_SEQ_INST_InternalNsAdjacentToGap 1,56 +#define ERR_SEQ_INST_CaseDifferenceInSeqID 1,57 #define ERR_SEQ_DESCR 2,0 #define ERR_SEQ_DESCR_BioSourceMissing 2,1 #define ERR_SEQ_DESCR_InvalidForType 2,2 @@ -93,6 +95,7 @@ #define ERR_SEQ_DESCR_FastaBracketTitle 2,33 #define ERR_SEQ_DESCR_MissingText 2,34 #define ERR_SEQ_DESCR_BadCollectionDate 2,35 +#define ERR_SEQ_DESCR_BadPCRPrimerSequence 2,36 #define ERR_GENERIC 3,0 #define ERR_GENERIC_NonAsciiAsn 3,1 #define ERR_GENERIC_Spell 3,2 @@ -103,6 +106,8 @@ #define ERR_GENERIC_MedlineEntryPub 3,7 #define ERR_GENERIC_BadDate 3,8 #define ERR_GENERIC_StructuredCitGenCit 3,9 +#define ERR_GENERIC_CollidingSerialNumbers 3,10 +#define ERR_GENERIC_EmbeddedScript 3,11 #define ERR_SEQ_PKG 4,0 #define ERR_SEQ_PKG_NoCdRegionPtr 4,1 #define ERR_SEQ_PKG_NucProtProblem 4,2 @@ -231,6 +236,12 @@ #define ERR_SEQ_FEAT_PolyAsignalNotRange 5,109 #define ERR_SEQ_FEAT_OldLocusTagMismtach 5,110 #define ERR_SEQ_FEAT_DuplicateGeneOntologyTerm 5,111 +#define ERR_SEQ_FEAT_InvalidInferenceValue 5,112 +#define ERR_SEQ_FEAT_HpotheticalProteinMismatch 5,113 +#define ERR_SEQ_FEAT_FeatureRefersToAccession 5,114 +#define ERR_SEQ_FEAT_SelfReferentialProduct 5,115 +#define ERR_SEQ_FEAT_ITSdoesNotAbutRRNA 5,116 +#define ERR_SEQ_FEAT_FeatureSeqIDCaseDifference 5,117 #define ERR_SEQ_ALIGN 6,0 #define ERR_SEQ_ALIGN_SeqIdProblem 6,1 #define ERR_SEQ_ALIGN_StrandRev 6,2 |