summary | refs | log | tree | commit | diff
path: root/api/valid.c
diff options
context:
space:
mode:
author: Aaron M. Ucko <ucko@debian.org> 2006-03-13 15:18:51 +0000
committer: Aaron M. Ucko <ucko@debian.org> 2006-03-13 15:18:51 +0000
commit: 047f9550aeffa40eb05ad53427718889f660e0f4 (patch)
tree: c491506658b86eb5b9b4145a178f7bf64fe4a543 /api/valid.c
parent: 4b1edc60532e16ec7d0255e1c3552c2e6a33737e (diff)
Load /tmp/.../ncbi-tools6-6.1.20060301 into
branches/upstream/current.
Diffstat (limited to 'api/valid.c')
-rw-r--r-- api/valid.c 1423
1 files changed, 1259 insertions, 164 deletions
diff --git a/api/valid.c b/api/valid.c
index 64f4f377..0a8ecae1 100644
--- a/api/valid.c
+++ b/api/valid.c
@@ -29,7 +29,7 @@
*
* Version Creation Date: 1/1/94
*
-* $Revision: 6.682 $
+* $Revision: 6.726 $
*
* File Description: Sequence editing utilities
*
@@ -39,6 +39,138 @@
* ------- ---------- -----------------------------------------------------
*
* $Log: valid.c,v $
+* Revision 6.726 2006/02/27 17:49:34 kans
+* added adjusted for low-quality genome exception for RefSeq models
+*
+* Revision 6.725 2006/02/24 22:49:39 kans
+* call BioseqToGeneticCode instead of much less efficient functions
+*
+* Revision 6.724 2006/02/23 23:05:53 kans
+* added ERR_SEQ_FEAT_FeatureSeqIDCaseDifference
+*
+* Revision 6.723 2006/02/23 22:36:05 kans
+* added ERR_SEQ_INST_CaseDifferenceInSeqID
+*
+* Revision 6.722 2006/02/17 20:12:06 kans
+* fixed text of ITSdoesNotAbutRRNA for one overlap case
+*
+* Revision 6.721 2006/02/16 19:34:28 kans
+* use vsp->is_smupd_in_sep to suppress ERR_SEQ_FEAT_FeatureRefersToAccession
+*
+* Revision 6.720 2006/02/15 17:08:55 kans
+* made ITSdoesNotAbutRRNA more sophisticated, also handles tRNA inside small and large rRNA
+*
+* Revision 6.719 2006/02/10 18:26:50 kans
+* added ERR_SEQ_FEAT_ITSdoesNotAbutRRNA
+*
+* Revision 6.718 2006/02/08 17:49:25 kans
+* added ERR_SEQ_FEAT_SelfReferentialProduct
+*
+* Revision 6.717 2006/02/08 16:27:18 kans
+* report ERR_SEQ_FEAT_TranslExcept even if protein is okay
+*
+* Revision 6.716 2006/02/08 14:34:56 kans
+* [fwd/rev]-primer-[seq/name] changed to [fwd/rev]-pcr-primer-[seq/name]
+*
+* Revision 6.715 2006/02/07 20:36:37 kans
+* ERR_SEQ_INST_InternalNsAdjacentToGap shows first position
+*
+* Revision 6.714 2006/02/07 20:29:59 kans
+* added ERR_SEQ_INST_InternalNsAdjacentToGap
+*
+* Revision 6.713 2006/02/06 16:26:03 kans
+* check for both TPA:experimental and TPA:inferential keywords
+*
+* Revision 6.712 2006/02/03 19:37:12 kans
+* ERR_SEQ_INST_InternalNsInSeq[Lit/Raw] add one to zero-based position
+*
+* Revision 6.711 2006/02/02 22:24:38 kans
+* warn if product gbqual on trna
+*
+* Revision 6.710 2006/01/31 22:31:49 kans
+* added O for pyrrolysine and J for leu or ile ambiguity
+*
+* Revision 6.709 2006/01/26 19:54:26 kans
+* added ERR_SEQ_FEAT_FeatureRefersToAccession to look for inconsistent use of gi and accession (with or without version) for sfp->location or sfp->product references in a single blob
+*
+* Revision 6.708 2006/01/25 20:09:33 kans
+* BadDeltaSeq not done if MI_TECH_composite_wgs_htgs
+*
+* Revision 6.707 2006/01/24 20:17:12 kans
+* ERR_SEQ_FEAT_InternalStop goes to SEV_REJECT if has GI and GenBank/EMBL/DDBJ and not RefSeq
+*
+* Revision 6.706 2006/01/24 19:06:39 kans
+* added ERR_SEQ_DESCR_BadPCRPrimerSequence
+*
+* Revision 6.705 2006/01/24 15:46:08 kans
+* added ERR_SEQ_FEAT_HpotheticalProteinMismatch
+*
+* Revision 6.704 2006/01/18 20:55:08 kans
+* CheckTrnaCodons reports BadTrnaAA if aa is 0 or 255 - usually meaning it was not set
+*
+* Revision 6.703 2006/01/13 20:26:24 kans
+* lower severity of duplicate feature error to warning if partial viral genes
+*
+* Revision 6.702 2006/01/10 18:22:18 kans
+* find embedded html strings only if VALIDATE_ALL
+*
+* Revision 6.701 2006/01/05 20:23:00 kans
+* set isCuratedFlybase flag even if GenBank record for lowering duplicate feature severity, suppressing if dicistronic gene
+*
+* Revision 6.700 2006/01/04 21:29:22 kans
+* use FindStringsInEntity to find embedded script tags by finite state machine
+*
+* Revision 6.699 2006/01/03 19:48:39 kans
+* added javascript: to findrepstrs
+*
+* Revision 6.698 2006/01/03 16:52:54 kans
+* ValidateInferenceQualifier takes fetchAccn argument, added ACC_VERSION_NOT_PUBLIC reply type
+*
+* Revision 6.697 2006/01/03 14:31:39 kans
+* LookForMultipleUnpubPubs relies on SetPubScratchData and ClearPubScratchData to make unique strings only once per pub
+*
+* Revision 6.696 2005/12/30 16:24:37 kans
+* inference qualifier for INSD or RefSeq requires valid accession.version
+*
+* Revision 6.695 2005/12/29 22:24:02 kans
+* added <applet and <form to list of strings to check for script injection attack
+*
+* Revision 6.694 2005/12/29 21:45:57 kans
+* added ERR_GENERIC_EmbeddedScript, use FindReplaceInEntity with callback to find possible javascript injection attacks
+*
+* Revision 6.693 2005/12/29 19:20:28 kans
+* InternalNsInSeqRaw printed for each run of Ns, not just for maximum length
+*
+* Revision 6.692 2005/12/23 20:16:32 kans
+* added ERR_SEQ_FEAT_InvalidInferenceValue
+*
+* Revision 6.691 2005/12/23 18:34:18 kans
+* modified cds/mrna/gene conditions on reporting partials
+*
+* Revision 6.690 2005/12/16 18:42:59 kans
+* dicistronic gene exception turns off Duplicate Feature and SuspiciousGeneXref if curated Drosophila
+*
+* Revision 6.689 2005/12/15 14:22:01 kans
+* ERR_SEQ_INST_InternalNsInSeqRaw triggered if >= 100, not > 100
+*
+* Revision 6.688 2005/12/13 23:17:27 kans
+* In Splice acceptor consensus (AG) not found before exon message, print sip if no bsp
+*
+* Revision 6.687 2005/12/13 23:05:22 kans
+* added ERR_GENERIC_CollidingSerialNumbers
+*
+* Revision 6.686 2005/12/13 22:16:55 kans
+* always initialize tbuf in SpliceCheckEx
+*
+* Revision 6.685 2005/12/08 19:50:30 kans
+* FindSameCDS does not suppress if only one end is identical - also require dashes in collection_date
+*
+* Revision 6.684 2005/12/07 21:15:53 kans
+* ERR_SEQ_FEAT_UTRdoesNotAbutCDS always sets UTR feature context, clears once at end
+*
+* Revision 6.683 2005/12/06 22:20:12 kans
+* raised ERR_SEQ_DESCR_BadCountryCode to SEV_ERROR
+*
* Revision 6.682 2005/12/02 15:11:09 kans
* in ValidateSeqFeat, comment out exception for cdregion same as mrna in partial not at start/stop and not consensus splice site
*
@@ -2258,6 +2390,7 @@ static char *this_file = __FILE__;
#include <explore.h>
#include <subutil.h>
#include <tofasta.h>
+#include <findrepl.h>
/*****************************************************************************
*
@@ -2338,6 +2471,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp)
TextFsaPtr sourceQualTags;
Boolean is_htg_in_sep;
Boolean is_refseq_in_sep;
+ Boolean is_smupd_in_sep;
+ Boolean feat_loc_has_gi;
+ Boolean feat_prod_has_gi;
if (vsp == NULL)
return;
@@ -2369,6 +2505,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp)
sourceQualTags = vsp->sourceQualTags;
is_htg_in_sep = vsp->is_htg_in_sep;
is_refseq_in_sep = vsp->is_refseq_in_sep;
+ is_smupd_in_sep = vsp->is_smupd_in_sep;
+ feat_loc_has_gi = vsp->feat_loc_has_gi;
+ feat_prod_has_gi = vsp->feat_prod_has_gi;
MemSet ((VoidPtr) vsp, 0, sizeof (ValidStruct));
vsp->errbuf = errbuf;
vsp->cutoff = cutoff;
@@ -2397,6 +2536,9 @@ NLM_EXTERN void ValidStructClear (ValidStructPtr vsp)
vsp->sourceQualTags = sourceQualTags;
vsp->is_htg_in_sep = is_htg_in_sep;
vsp->is_refseq_in_sep = is_refseq_in_sep;
+ vsp->is_smupd_in_sep = is_smupd_in_sep;
+ vsp->feat_loc_has_gi = feat_loc_has_gi;
+ vsp->feat_prod_has_gi = feat_prod_has_gi;
return;
}
@@ -2666,7 +2808,9 @@ static CharPtr err1Label [] = {
"TerminalGap",
"OverlappingDeltaRange",
"LeadingX",
- "InternalNsInSeqRaw"
+ "InternalNsInSeqRaw",
+ "InternalNsAdjacentToGap",
+ "CaseDifferenceInSeqID"
};
static CharPtr err2Label [] = {
@@ -2705,7 +2849,8 @@ static CharPtr err2Label [] = {
"BioSourceInconsistency",
"FastaBracketTitle",
"MissingText",
- "BadCollectionDate"
+ "BadCollectionDate",
+ "BadPCRPrimerSequence"
};
static CharPtr err3Label [] = {
@@ -2718,7 +2863,9 @@ static CharPtr err3Label [] = {
"BadPageNumbering",
"MedlineEntryPub",
"BadDate",
- "StructuredCitGenCit"
+ "StructuredCitGenCit",
+ "CollidingSerialNumbers",
+ "EmbeddedScript"
};
static CharPtr err4Label [] = {
@@ -2852,7 +2999,13 @@ static CharPtr err5Label [] = {
"ExceptionProblem",
"PolyAsignalNotRange",
"OldLocusTagMismtach",
- "DuplicateGeneOntologyTerm"
+ "DuplicateGeneOntologyTerm",
+ "InvalidInferenceValue",
+ "HpotheticalProteinMismatch",
+ "FeatureRefersToAccession",
+ "SelfReferentialProduct",
+ "ITSdoesNotAbutRRNA",
+ "FeatureSeqIDCaseDifference"
};
static CharPtr err6Label [] = {
@@ -3441,6 +3594,10 @@ static Boolean Valid1GatherProc (GatherContextPtr gcp)
ValNodePtr sdp;
BioSourcePtr biop;
PubdescPtr pdp;
+ BioseqPtr bsp;
+ SeqIdPtr sip;
+ Char buf [64];
+ Char tmp [64];
SeqMgrFeatContext context;
vsp = (ValidStructPtr) (gcp->userdata);
@@ -3514,6 +3671,22 @@ static Boolean Valid1GatherProc (GatherContextPtr gcp)
if (vsp->useSeqMgrIndexes) {
if (SeqMgrGetDesiredFeature (gcp->entityID, NULL, 0, 0, sfp, &context) == NULL) {
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_UnindexedFeature, "Feature is not indexed");
+ } else {
+ bsp = BioseqFindFromSeqLoc (sfp->location);
+ if (bsp != NULL) {
+ sip = SeqLocId (sfp->location);
+ if (sip != NULL && sip->choice != SEQID_GI && sip->choice != SEQID_GIBBSQ && sip->choice != SEQID_GIBBMT) {
+ SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
+ for (sip = bsp->id; sip != NULL; sip = sip->next) {
+ if (sip->choice == SEQID_GI || sip->choice == SEQID_GIBBSQ || sip->choice == SEQID_GIBBMT) continue;
+ SeqIdWrite (sip, tmp, PRINTID_FASTA_SHORT, sizeof (tmp) - 1);
+ if (StringICmp (buf, tmp) != 0) continue;
+ if (StringCmp (buf, tmp) == 0) continue;
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_FeatureSeqIDCaseDifference,
+ "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
+ }
+ }
+ }
}
}
}
@@ -3624,6 +3797,12 @@ typedef struct ftprob {
Uint4 num_tpa_with_hist;
Uint4 num_tpa_without_hist;
Boolean has_gi;
+ Boolean loc_has_gi;
+ Boolean loc_has_just_accn;
+ Boolean loc_has_accn_ver;
+ Boolean prod_has_gi;
+ Boolean prod_has_just_accn;
+ Boolean prod_has_accn_ver;
} FeatProb, PNTR FeatProbPtr;
static void CheckFeatPacking (BioseqPtr bsp, SeqFeatPtr sfp, Uint4Ptr num_misplaced_features)
@@ -3791,6 +3970,87 @@ static void CountGeneXrefs (SeqFeatPtr sfp, Pointer userdata)
(fpp->num_gene_xrefs)++;
}
+/*
+ * Seq-id callback used when visiting the ids of a feature location.
+ * Records in the FeatProb passed as userdata whether a gi, an accession
+ * without a version (version < 1), or an accession with a version was seen.
+ * Only GenBank/EMBL/DDBJ/TPG/TPE/TPD/other ids are inspected for accessions.
+ */
+static void CountSfpLocIdTypes (SeqIdPtr sip, Pointer userdata)
+
+{
+  FeatProbPtr   probs;
+  TextSeqIdPtr  text_id;
+
+  if (sip == NULL || userdata == NULL) return;
+  probs = (FeatProbPtr) userdata;
+
+  if (sip->choice == SEQID_GI) {
+    probs->loc_has_gi = TRUE;
+    return;
+  }
+
+  /* every other id type is ignored */
+  if (sip->choice != SEQID_GENBANK && sip->choice != SEQID_EMBL &&
+      sip->choice != SEQID_DDBJ && sip->choice != SEQID_TPG &&
+      sip->choice != SEQID_TPE && sip->choice != SEQID_TPD &&
+      sip->choice != SEQID_OTHER) return;
+
+  text_id = (TextSeqIdPtr) sip->data.ptrvalue;
+  if (text_id == NULL) return;
+  if (! StringDoesHaveText (text_id->accession)) return;
+
+  if (text_id->version >= 1) {
+    probs->loc_has_accn_ver = TRUE;
+  } else {
+    probs->loc_has_just_accn = TRUE;
+  }
+}
+
+/*
+ * Seq-id callback used when visiting the ids of a feature product.
+ * Sets the prod_has_gi / prod_has_just_accn / prod_has_accn_ver flags of the
+ * FeatProb passed as userdata according to the id type and, for text ids
+ * with an accession, whether the version is at least 1.
+ */
+static void CountSfpProdIdTypes (SeqIdPtr sip, Pointer userdata)
+
+{
+  Boolean       is_text_id;
+  FeatProbPtr   probs;
+  TextSeqIdPtr  text_id;
+
+  if (sip == NULL || userdata == NULL) return;
+  probs = (FeatProbPtr) userdata;
+
+  if (sip->choice == SEQID_GI) {
+    probs->prod_has_gi = TRUE;
+    return;
+  }
+
+  is_text_id = (Boolean) (sip->choice == SEQID_GENBANK ||
+                          sip->choice == SEQID_EMBL ||
+                          sip->choice == SEQID_DDBJ ||
+                          sip->choice == SEQID_TPG ||
+                          sip->choice == SEQID_TPE ||
+                          sip->choice == SEQID_TPD ||
+                          sip->choice == SEQID_OTHER);
+  if (! is_text_id) return;
+
+  text_id = (TextSeqIdPtr) sip->data.ptrvalue;
+  if (text_id == NULL || (! StringDoesHaveText (text_id->accession))) return;
+
+  if (text_id->version < 1) {
+    probs->prod_has_just_accn = TRUE;
+    return;
+  }
+  probs->prod_has_accn_ver = TRUE;
+}
+
+/*
+ * Feature visitor: runs the location and product Seq-id classifiers over
+ * sfp->location and sfp->product, handing userdata through unchanged.
+ */
+static void CountFeatLocIdTypes (SeqFeatPtr sfp, Pointer userdata)
+
+{
+  if (sfp != NULL && userdata != NULL) {
+    VisitSeqIdsInSeqLoc (sfp->location, userdata, CountSfpLocIdTypes);
+    VisitSeqIdsInSeqLoc (sfp->product, userdata, CountSfpProdIdTypes);
+  }
+}
+
static Boolean HasTpaUserObject (BioseqPtr bsp)
{
@@ -3864,6 +4124,7 @@ typedef struct vfcdata {
ValNodePtr uids;
ValNodePtr unpub;
ValNodePtr publshd;
+ ValNodePtr serial;
ValidStructPtr vsp;
} VfcData, PNTR VfcPtr;
@@ -3886,6 +4147,7 @@ static void MakePubTags (PubdescPtr pdp, Pointer userdata)
{
Char buf [1024];
+ CitGenPtr cgp;
Int4 muid = 0, pmid = 0;
VfcPtr vfp;
ValNodePtr vnp;
@@ -3898,6 +4160,16 @@ static void MakePubTags (PubdescPtr pdp, Pointer userdata)
muid = vnp->data.intvalue;
} else if (vnp->choice == PUB_PMid) {
pmid = vnp->data.intvalue;
+ } else if (vnp->choice == PUB_Gen) {
+ cgp = (CitGenPtr) vnp->data.ptrvalue;
+ if (cgp != NULL && cgp->serial_number > 0) {
+ vnp = ValNodeNew (NULL);
+ if (vnp != NULL) {
+ vnp->data.intvalue = (Int4) cgp->serial_number;
+ vnp->next = vfp->serial;
+ vfp->serial = vnp;
+ }
+ }
}
}
@@ -4018,6 +4290,43 @@ static void CheckFeatCits (SeqFeatPtr sfp, Pointer userdata)
}
}
+/*
+ * Posts ERR_GENERIC_CollidingSerialNumbers once for every serial number that
+ * appears more than once in list.
+ *
+ * vsp  - validator context used to post the warning
+ * gcp  - gather context; itemID and thistype are zeroed while reporting and
+ *        restored before returning
+ * list - serial numbers stored in data.intvalue; equal values must be
+ *        adjacent, so the list has to be sorted by the caller
+ *
+ * A run of equal values is reported when its second member is reached.  This
+ * also covers a run that begins at the head of the list, which the previous
+ * "curr > max" test could never report because max started at that value.
+ */
+static void CheckForCollidingSerials (
+  ValidStructPtr vsp,
+  GatherContextPtr gcp,
+  ValNodePtr list
+)
+
+{
+  Int4        curr, last;
+  Uint2       olditemtype = 0;
+  Uint2       olditemid = 0;
+  Boolean     reported = FALSE;
+  ValNodePtr  vnp;
+
+  if (vsp == NULL || gcp == NULL || list == NULL) return;
+
+  olditemid = gcp->itemID;
+  olditemtype = gcp->thistype;
+  gcp->itemID = 0;
+  gcp->thistype = 0;
+
+  last = (Int4) list->data.intvalue;
+  for (vnp = list->next; vnp != NULL; vnp = vnp->next) {
+    curr = (Int4) vnp->data.intvalue;
+    if (curr != last) {
+      /* a different value starts a new run */
+      last = curr;
+      reported = FALSE;
+    } else if (! reported) {
+      ValidErr (vsp, SEV_WARNING, ERR_GENERIC_CollidingSerialNumbers,
+                "Multiple publications have serial number %ld", (long) curr);
+      reported = TRUE;
+    }
+  }
+
+  gcp->itemID = olditemid;
+  gcp->thistype = olditemtype;
+}
+
static void ValidateFeatCits (SeqEntryPtr sep, ValidStructPtr vsp)
{
@@ -4038,9 +4347,17 @@ static void ValidateFeatCits (SeqEntryPtr sep, ValidStructPtr vsp)
VisitFeaturesInSep (sep, (Pointer) &vfd, CheckFeatCits);
+ vsp->bssp = NULL;
+ vsp->bsp = NULL;
+ vsp->sfp = NULL;
+ vsp->descr = NULL;
+ vfd.serial = ValNodeSort (vfd.serial, SortByIntvalue);
+ CheckForCollidingSerials (vsp, vsp->gcp, vfd.serial);
+
ValNodeFree (vfd.uids);
ValNodeFreeData (vfd.unpub);
ValNodeFreeData (vfd.publshd);
+ ValNodeFree (vfd.serial);
}
static void ValidateFeatIDs (Uint2 entityID, ValidStructPtr vsp)
@@ -4097,6 +4414,111 @@ static void ValidateFeatIDs (Uint2 entityID, ValidStructPtr vsp)
}
}
+typedef struct vsicdata { /* userdata for collecting Seq-id strings (see CaptureTextSeqIDs) */
+ ValidStructPtr vsp; /* validator context */
+ ValNodePtr headid; /* first node of the collected string list */
+ ValNodePtr tailid; /* most recently appended node of that list */
+} VsicData, PNTR VsicDataPtr;
+
+/*
+ * Bioseq visitor: writes every Seq-id on bsp except gi, gibbsq and gibbmt
+ * in PRINTID_FASTA_SHORT form and appends a copy of that string to the list
+ * kept in the VsicData passed as userdata (headid / tailid).
+ */
+static void CaptureTextSeqIDs (BioseqPtr bsp, Pointer userdata)
+
+{
+  VsicDataPtr  ctx;
+  SeqIdPtr     id;
+  Char         idstr [64];
+  ValNodePtr   node;
+
+  if (bsp == NULL || userdata == NULL) return;
+  ctx = (VsicDataPtr) userdata;
+
+  id = bsp->id;
+  while (id != NULL) {
+    switch (id->choice) {
+      case SEQID_GI :
+      case SEQID_GIBBSQ :
+      case SEQID_GIBBMT :
+        /* numeric ids are not collected */
+        break;
+      default :
+        SeqIdWrite (id, idstr, PRINTID_FASTA_SHORT, sizeof (idstr) - 1);
+        node = ValNodeCopyStr (&(ctx->tailid), 0, idstr);
+        if (ctx->headid == NULL) {
+          ctx->headid = node;
+        }
+        ctx->tailid = node;
+        break;
+    }
+    id = id->next;
+  }
+}
+
+/*
+ * Removes from list every node whose string (data.ptrvalue) compares equal,
+ * case-sensitively, to the string of the nearest preceding node that was
+ * kept.  Removed nodes are unlinked and released with ValNodeFreeData.
+ * Returns list; the head node is never removed.
+ */
+static ValNodePtr UniqueValNodeCaseSensitive (ValNodePtr list)
+
+{
+  ValNodePtr  after;
+  ValNodePtr  kept;
+  ValNodePtr  scan;
+
+  if (list == NULL) return NULL;
+
+  kept = list;
+  for (scan = list->next; scan != NULL; scan = after) {
+    after = scan->next;
+    if (StringCmp ((CharPtr) kept->data.ptrvalue, (CharPtr) scan->data.ptrvalue) == 0) {
+      /* duplicate of the last kept node: splice it out and free it */
+      kept->next = after;
+      scan->next = NULL;
+      ValNodeFreeData (scan);
+    } else {
+      kept = scan;
+    }
+  }
+
+  return list;
+}
+
+/*
+ * Posts ERR_SEQ_INST_CaseDifferenceInSeqID (SEV_REJECT) for each pair of
+ * neighbouring Seq-id strings, collected from every Bioseq in sep, that are
+ * equal when compared case-insensitively but differ case-sensitively.
+ *
+ * The strings are gathered with CaptureTextSeqIDs, sorted with
+ * SortVnpByString and stripped of exact duplicates before being compared.
+ *
+ * While running, vsp->gcp points at a zeroed local GatherContext and the
+ * bssp/bsp/sfp/descr fields of vsp are cleared.  The previous vsp->gcp is
+ * put back before returning so that vsp is never left pointing at this
+ * function's stack frame.
+ */
+static void ValidateSeqIdCase (SeqEntryPtr sep, ValidStructPtr vsp)
+
+{
+  CharPtr           curr;
+  GatherContext     gc;
+  GatherContextPtr  oldgcp;
+  CharPtr           prev;
+  VsicData          vd;
+  ValNodePtr        vnp;
+
+  if (vsp == NULL || sep == NULL) return;
+
+  MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
+  MemSet ((Pointer) &vd, 0, sizeof (VsicData));
+
+  oldgcp = vsp->gcp;
+  vsp->gcp = &gc;
+  vsp->bssp = NULL;
+  vsp->bsp = NULL;
+  vsp->sfp = NULL;
+  vsp->descr = NULL;
+  vd.vsp = vsp;
+
+  VisitBioseqsInSep (sep, (Pointer) &vd, CaptureTextSeqIDs);
+  vd.headid = ValNodeSort (vd.headid, SortVnpByString);
+  vd.headid = UniqueValNodeCaseSensitive (vd.headid);
+
+  prev = NULL;
+  for (vnp = vd.headid; vnp != NULL; vnp = vnp->next) {
+    curr = (CharPtr) vnp->data.ptrvalue;
+    if ((! StringHasNoText (curr)) && (! StringHasNoText (prev)) &&
+        StringICmp (curr, prev) == 0 && StringCmp (curr, prev) != 0) {
+      ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_CaseDifferenceInSeqID,
+                "Sequence identifier differs only by case - %s and %s", curr, prev);
+    }
+    prev = curr;
+  }
+
+  vsp->bssp = NULL;
+  vsp->bsp = NULL;
+  vsp->sfp = NULL;
+  vsp->descr = NULL;
+
+  ValNodeFreeData (vd.headid);
+
+  /* gc goes out of scope here; do not leave vsp->gcp pointing at it */
+  vsp->gcp = oldgcp;
+}
+
static void LookForNC (BioseqPtr bsp, Pointer userdata)
{
@@ -4143,6 +4565,120 @@ static void LookForHTG (SeqDescrPtr sdp, Pointer userdata)
}
}
+/*
+ * Descriptor visitor: sets the Boolean that userdata points to to TRUE when
+ * sdp is a user-object descriptor whose class equals "SMART_V1.0"
+ * (compared case-insensitively).  The Boolean is never reset to FALSE here.
+ */
+static void LookForSMUPD (SeqDescrPtr sdp, Pointer userdata)
+
+{
+  UserObjectPtr  obj;
+
+  if (sdp == NULL || userdata == NULL) return;
+  if (sdp->choice != Seq_descr_user) return;
+
+  obj = (UserObjectPtr) sdp->data.ptrvalue;
+  if (obj != NULL && StringICmp (obj->_class, "SMART_V1.0") == 0) {
+    *((BoolPtr) userdata) = TRUE;
+  }
+}
+
+/*
+ * Descriptor visitor: for an extended pub descriptor, builds the string
+ * "<unique pub label>[; <authors>][; <consortium>]" in newly allocated
+ * memory and stores it in the descriptor's idx.scratch field.
+ *
+ * Nothing is stored when the label cannot be produced, when the pub has no
+ * author list, or when the allocation fails.  userdata is not used.
+ */
+static void SetPubScratchData (SeqDescrPtr sdp, Pointer userdata)
+
+{
+  AuthListPtr    authors;
+  CharPtr        consort;
+  CitGenPtr      gen;
+  CharPtr        joined;
+  Char           label [2048];
+  CharPtr        names;
+  ObjValNodePtr  ovp;
+  PubdescPtr     pdp;
+  ValNodePtr     pub;
+
+  if (sdp == NULL || sdp->choice != Seq_descr_pub || sdp->extended == 0) return;
+  pdp = (PubdescPtr) sdp->data.ptrvalue;
+  if (pdp == NULL) return;
+  ovp = (ObjValNodePtr) sdp;
+
+  pub = pdp->pub;
+
+  /* skip over just serial number */
+
+  if (pub != NULL && pub->choice == PUB_Gen && pub->next != NULL) {
+    gen = (CitGenPtr) pub->data.ptrvalue;
+    if (gen != NULL &&
+        StringNICmp ("BackBone id_pub", gen->cit, 15) != 0 &&
+        gen->cit == NULL && gen->journal == NULL &&
+        gen->date == NULL && gen->serial_number) {
+      pub = pub->next;
+    }
+  }
+
+  if (PubLabelUnique (pub, label, sizeof (label) - 1, OM_LABEL_CONTENT, TRUE) <= 0) return;
+
+  authors = GetAuthListPtr (pdp, NULL);
+  if (authors == NULL) return;
+
+  consort = NULL;
+  names = GetAuthorsString (GENBANK_FMT, authors, &consort, NULL, NULL);
+
+  /* 10 spare bytes cover the two "; " separators and the terminator */
+  joined = MemNew (StringLen (label) + StringLen (names) + StringLen (consort) + 10);
+  if (joined != NULL) {
+    StringCpy (joined, label);
+    if (StringDoesHaveText (names)) {
+      StringCat (joined, "; ");
+      StringCat (joined, names);
+    }
+    if (StringDoesHaveText (consort)) {
+      StringCat (joined, "; ");
+      StringCat (joined, consort);
+    }
+    ovp->idx.scratch = joined;
+  }
+
+  MemFree (names);
+  MemFree (consort);
+}
+
+/*
+ * Descriptor visitor: for an extended pub descriptor, frees idx.scratch and
+ * stores the value returned by MemFree back into it.  userdata is not used.
+ */
+static void ClearPubScratchData (SeqDescrPtr sdp, Pointer userdata)
+
+{
+  ObjValNodePtr  ovp;
+
+  if (sdp == NULL) return;
+  if (sdp->choice != Seq_descr_pub) return;
+  if (sdp->extended == 0) return;
+
+  ovp = (ObjValNodePtr) sdp;
+  ovp->idx.scratch = MemFree (ovp->idx.scratch);
+}
+
+typedef struct frd { /* userdata handed to the FindRepValidate callback */
+ ValidStructPtr vsp; /* validator context used to post the error */
+ GatherContextPtr gcp; /* gather context that receives the ids of the matching item */
+ /*
+ CharPtr string;
+ */
+} FindRepData, PNTR FindRepPtr;
+
+/*
+ * Callback invoked for an item in which one of the searched strings was
+ * found.  Copies the item's entityID / itemID / itemtype into the gather
+ * context held in the FindRepData passed as userdata, then posts
+ * ERR_GENERIC_EmbeddedScript at SEV_ERROR through its validator.
+ *
+ * Does nothing when userdata, its validator or its gather context is NULL,
+ * matching the NULL guards used by the other callbacks in this file.
+ */
+static void FindRepValidate (Uint2 entityID, Uint2 itemID, Uint2 itemtype, Pointer userdata)
+
+{
+  FindRepPtr        frp;
+  GatherContextPtr  gcp;
+  ValidStructPtr    vsp;
+
+  frp = (FindRepPtr) userdata;
+  if (frp == NULL) return;
+  vsp = frp->vsp;
+  gcp = frp->gcp;
+  if (vsp == NULL || gcp == NULL) return;
+
+  gcp->entityID = entityID;
+  gcp->itemID = itemID;
+  gcp->thistype = itemtype;
+
+  ValidErr (vsp, SEV_ERROR, ERR_GENERIC_EmbeddedScript, "Script tag found in item");
+}
+
+/* NULL-terminated list of substrings passed to FindStringsInEntity when looking for embedded scripts */
+static CharPtr findrepstrs [] = {
+  "<script",
+  "<object",
+  "<applet",
+  "<embed",
+  "<form",
+  "javascript:",
+  NULL
+};
+
NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
{
Uint2 entityID = 0;
@@ -4164,7 +4700,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
SeqEntryPtr oldsep;
ErrSev oldsev;
ObjMgrDataPtr omdp;
- SeqEntryPtr topsep;
+ SeqEntryPtr topsep = NULL;
SeqEntryPtr tmp;
ValNodePtr bsplist;
ErrSev sev;
@@ -4172,6 +4708,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
Boolean isGPS = FALSE;
Boolean isPatent = FALSE;
Boolean isPDB = FALSE;
+ FindRepData frd;
if (sep == NULL || vsp == NULL) return FALSE;
@@ -4195,6 +4732,7 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
topsep = GetTopSeqEntryForEntityID (entityID);
VisitGraphsInSep (topsep, (Pointer) &featprob, CheckGraphPacking);
VisitFeaturesInSep (topsep, (Pointer) &featprob, CountGeneXrefs);
+ VisitFeaturesInSep (topsep, (Pointer) &featprob, CountFeatLocIdTypes);
VisitBioseqsInSep (topsep, (Pointer) &featprob, CheckTpaHist);
} else {
@@ -4247,12 +4785,20 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
vsp->is_htg_in_sep = FALSE;
VisitDescriptorsInSep (sep, (Pointer) &(vsp->is_htg_in_sep), LookForHTG);
+ vsp->is_smupd_in_sep = FALSE;
+ VisitDescriptorsInSep (sep, (Pointer) &(vsp->is_smupd_in_sep), LookForSMUPD);
vsp->is_refseq_in_sep = FALSE;
VisitBioseqsInSep (sep, (Pointer) &(vsp->is_refseq_in_sep), LookForNC);
+ vsp->feat_loc_has_gi = featprob.loc_has_gi;
+ vsp->feat_prod_has_gi = featprob.prod_has_gi;
+
globalvsp = vsp; /* for spell checker */
while (sep != NULL) {
+ /* calculate strings for LookForMultipleUnpubPubs test only once for genome product set efficiency */
+ VisitDescriptorsInSep (sep, NULL, SetPubScratchData);
+
MemSet (&gs, 0, sizeof (GatherScope));
gs.scope = sep; /* default is to scope to this set */
@@ -4384,6 +4930,10 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
ValidateFeatIDs (gc.entityID, vsp);
vsp->gcp = NULL;
+ vsp->gcp = NULL;
+ ValidateSeqIdCase (sep, vsp);
+ vsp->gcp = NULL;
+
if (vsp->validateAlignments) {
vsp->gcp = NULL;
ValidateSeqAlignWithinValidator (vsp, sep, vsp->alignFindRemoteBsp, vsp->doSeqHistAssembly);
@@ -4392,6 +4942,8 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
SeqEntrySetScope (oldsep);
+ VisitDescriptorsInSep (sep, NULL, ClearPubScratchData);
+
if (vsp->useSeqMgrIndexes) {
/* unlock all pre-locked remote genome components */
@@ -4407,6 +4959,22 @@ NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
sep = NULL;
}
+ MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
+ gcp = &gc;
+ gc.entityID = ObjMgrGetEntityIDForChoice (sep);
+ vsp->gcp = gcp;
+ frd.vsp = vsp;
+ frd.gcp = gcp;
+
+ limit = vsp->validationLimit;
+ if (limit == VALIDATE_ALL) {
+ /*
+ frd.string = "?";
+ */
+ FindStringsInEntity (entityID, findrepstrs, FALSE, FALSE, FALSE, UPDATE_NEVER,
+ NULL, NULL, NULL, TRUE, FindRepValidate, (Pointer) &frd);
+ }
+
if (do_many) {
for (i = 0; i < 6; i++)
vsp->errors[i] = errors[i];
@@ -5284,16 +5852,23 @@ static void ValidateIDSetAgainstDb (GatherContextPtr gcp, ValidStructPtr vsp, Bi
}
typedef struct enrun {
- Int4 ncount;
- Int4 maxrun;
+ GatherContextPtr gcp;
+ ValidStructPtr vsp;
+ Int4 ncount;
+ Int4 maxrun;
+ Int4 seqpos;
+ Boolean showAll;
+ Boolean inNrun;
} RunOfNs, PNTR RunOfNsPtr;
static void LIBCALLBACK CountAdjacentProc (CharPtr sequence, Pointer userdata)
{
- Char ch;
- RunOfNsPtr ronp;
- CharPtr str;
+ Char ch;
+ GatherContextPtr gcp;
+ RunOfNsPtr ronp;
+ CharPtr str;
+ ValidStructPtr vsp;
ronp = (RunOfNsPtr) userdata;
if (sequence == NULL || ronp == NULL) return;
@@ -5301,20 +5876,29 @@ static void LIBCALLBACK CountAdjacentProc (CharPtr sequence, Pointer userdata)
str = sequence;
ch = *str;
while (ch != '\0') {
+ (ronp->seqpos)++;
if (ch == 'N') {
(ronp->ncount)++;
if (ronp->ncount > ronp->maxrun) {
ronp->maxrun = ronp->ncount;
}
+ ronp->inNrun = TRUE;
} else {
+ if (ronp->inNrun && ronp->showAll && ronp->ncount >= 100) {
+ vsp = ronp->vsp;
+ gcp = ronp->gcp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld",
+ (long) ronp->ncount, (long) (ronp->seqpos - ronp->ncount + 1));
+ }
ronp->ncount = 0;
+ ronp->inNrun = FALSE;
}
str++;
ch = *str;
}
}
-static Int4 CountAdjacentNsInSeqLit (SeqLitPtr slitp, Boolean is_na)
+static Int4 CountAdjacentNsInSeqLit (GatherContextPtr gcp, SeqLitPtr slitp, Boolean is_na)
{
BioseqPtr bsp;
@@ -5340,8 +5924,13 @@ static Int4 CountAdjacentNsInSeqLit (SeqLitPtr slitp, Boolean is_na)
bsp->length = slitp->length;
bsp->id = SeqIdParse ("lcl|countseqlitns");
+ ron.gcp = gcp;
+ ron.vsp = (ValidStructPtr) (gcp->userdata);
ron.ncount = 0;
ron.maxrun = 0;
+ ron.seqpos = 0;
+ ron.showAll = FALSE;
+ ron.inNrun = FALSE;
SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ron, CountAdjacentProc);
@@ -5572,6 +6161,10 @@ static void ValidateBioseqInst (GatherContextPtr gcp)
Boolean hasGi = FALSE;
SeqHistPtr hist;
IntFuzzPtr ifp;
+ Int4 adjacent_N_gap_position;
+ Boolean adjacent_N_and_gap;
+ Boolean in_gap;
+ Boolean in_N;
Boolean isActiveFin = FALSE;
Boolean isGB = FALSE;
Boolean isPatent = FALSE;
@@ -6001,12 +6594,6 @@ static void ValidateBioseqInst (GatherContextPtr gcp)
oldItemID = gcp->itemID;
oldItemtype = gcp->thistype;
- if (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext) != NULL) {
- gcp->entityID = dcontext.entityID;
- gcp->itemID = dcontext.itemID;
- gcp->thistype = OBJ_SEQDESC;
- }
-
if (ISA_aa (bsp->mol)) {
if (bsp->topology > 1) { /* not linear */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_CircularProtein, "Non-linear topology set on protein");
@@ -6276,6 +6863,47 @@ static void ValidateBioseqInst (GatherContextPtr gcp)
}
}
+ if (ISA_na (bsp->mol) && bsp->repr == Seq_repr_delta && DeltaLitOnly (bsp)) {
+ if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) {
+ ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open StreamCache");
+ return;
+ }
+ in_gap = FALSE;
+ in_N = FALSE;
+ adjacent_N_and_gap = FALSE;
+ adjacent_N_gap_position = 0;
+ for (len = 0; len < bsp->length; len++) {
+ residue = StreamCacheGetResidue (&sc);
+ if (residue == '-') {
+ if (in_N) {
+ adjacent_N_and_gap = TRUE;
+ if (adjacent_N_gap_position == 0) {
+ adjacent_N_gap_position = len;
+ }
+ }
+ in_N = FALSE;
+ in_gap = TRUE;
+ } else if (residue == 'N') {
+ if (in_gap) {
+ adjacent_N_and_gap = TRUE;
+ if (adjacent_N_gap_position == 0) {
+ adjacent_N_gap_position = len;
+ }
+ }
+ in_gap = FALSE;
+ in_N = TRUE;
+ } else {
+ in_gap = FALSE;
+ in_N = FALSE;
+ }
+ }
+ if (adjacent_N_and_gap) {
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InternalNsAdjacentToGap,
+ "Ambiguous residue N is adjacent to a gap around position %ld",
+ (long) adjacent_N_gap_position);
+ }
+ }
+
if ((bsp->repr == Seq_repr_seg) || (bsp->repr == Seq_repr_ref)) { /* check segmented sequence */
head.choice = SEQLOC_MIX;
head.data.ptrvalue = bsp->seq_ext;
@@ -6499,19 +7127,19 @@ static void ValidateBioseqInst (GatherContextPtr gcp)
}
if (mip != NULL) {
if (mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
- runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na (bsp->mol));
+ runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol));
if (runsofn > 80) {
- ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len);
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1));
}
} else if (mip->tech == MI_TECH_wgs || mip->tech == MI_TECH_composite_wgs_htgs) {
- runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na (bsp->mol));
+ runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol));
if (runsofn > 80) {
- ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len);
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1));
}
} else {
- runsofn = CountAdjacentNsInSeqLit (slitp, (Boolean) ISA_na (bsp->mol));
+ runsofn = CountAdjacentNsInSeqLit (gcp, slitp, (Boolean) ISA_na (bsp->mol));
if (runsofn > 100) {
- ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) len);
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %ld that starts at base %ld", (long) runsofn, (int) segnum, (long) (len + 1));
}
}
}
@@ -6547,19 +7175,31 @@ static void ValidateBioseqInst (GatherContextPtr gcp)
}
if ((!isNTorNC) && (! is_gps) && mip->tech != MI_TECH_htgs_0 && mip->tech != MI_TECH_htgs_1 &&
mip->tech != MI_TECH_htgs_2 && mip->tech != MI_TECH_htgs_3 && mip->tech != MI_TECH_wgs &&
- mip->tech != MI_TECH_unknown && mip->tech != MI_TECH_standard) {
+ mip->tech != MI_TECH_composite_wgs_htgs && mip->tech != MI_TECH_unknown && mip->tech != MI_TECH_standard) {
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadDeltaSeq, "Delta seq technique should not be [%d]", (int) (mip->tech));
}
}
} else if (bsp->repr == Seq_repr_raw) {
+ ron.gcp = gcp;
+ ron.vsp = vsp;
ron.ncount = 0;
ron.maxrun = 0;
+ ron.seqpos = 0;
+ ron.showAll = TRUE;
+ ron.inNrun = FALSE;
SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ron, CountAdjacentProc);
- if (ron.maxrun > 100) {
+ if (ron.inNrun && ron.showAll && ron.ncount >= 100) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld",
+ (long) ron.ncount, (long) (ron.seqpos - ron.ncount + 1));
+ }
+
+ /*
+ if (ron.maxrun >= 100) {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence", (long) ron.maxrun);
}
+ */
}
if (bsp->repr == Seq_repr_delta) {
@@ -7086,12 +7726,11 @@ static void LookForMultiplePubs (ValidStructPtr vsp, GatherContextPtr gcp, SeqDe
static void LookForMultipleUnpubPubs (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp)
{
- AuthListPtr alp;
Char buf [2048];
- CitGenPtr cgp;
- CharPtr consortium, last, str, tmp;
+ CharPtr last, str;
SeqMgrDescContext dcontext;
ValNodePtr list = NULL, next, vnp;
+ ObjValNodePtr ovp;
PubdescPtr pdp;
SeqDescrPtr sdp;
@@ -7099,42 +7738,9 @@ static void LookForMultipleUnpubPubs (ValidStructPtr vsp, GatherContextPtr gcp,
while (sdp) {
pdp = (PubdescPtr) sdp->data.ptrvalue;
if (pdp != NULL) {
- vnp = pdp->pub;
-
- /* skip over just serial number */
-
- if (vnp != NULL && vnp->choice == PUB_Gen && vnp->next != NULL) {
- cgp = (CitGenPtr) vnp->data.ptrvalue;
- if (cgp != NULL) {
- if (StringNICmp ("BackBone id_pub", cgp->cit, 15) != 0) {
- if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) {
- vnp = vnp->next;
- }
- }
- }
- }
-
- if (PubLabelUnique (vnp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) {
- alp = GetAuthListPtr (pdp, NULL);
- if (alp != NULL) {
- consortium = NULL;
- str = GetAuthorsString (GENBANK_FMT, alp, &consortium, NULL, NULL);
- tmp = MemNew (StringLen (buf) + StringLen (str) + StringLen (consortium) + 10);
- if (tmp != NULL) {
- StringCpy (tmp, buf);
- if (StringDoesHaveText (str)) {
- StringCat (tmp, "; ");
- StringCat (tmp, str);
- }
- if (StringDoesHaveText (consortium)) {
- StringCat (tmp, "; ");
- StringCat (tmp, consortium);
- }
- ValNodeAddStr (&list, 0, tmp);
- }
- MemFree (str);
- MemFree (consortium);
- }
+ ovp = (ObjValNodePtr) sdp;
+ if (ovp->idx.scratch != NULL) {
+ ValNodeCopyStr (&list, 0, ovp->idx.scratch);
}
}
sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_pub, &dcontext);
@@ -8231,7 +8837,7 @@ static Boolean CountryIsValid (CharPtr name)
return FALSE;
}
-static CharPtr GetDashOrSpace (CharPtr str)
+static CharPtr GetDash (CharPtr str)
{
Char ch;
@@ -8239,7 +8845,7 @@ static CharPtr GetDashOrSpace (CharPtr str)
if (str == NULL) return NULL;
ch = *str;
while (ch != '\0') {
- if (ch == ' ' || ch == '-') return str;
+ if (ch == '-') return str;
str++;
ch = *str;
}
@@ -8275,11 +8881,11 @@ static Boolean CollectionDateIsValid (CharPtr name)
if (StringHasNoText (name)) return FALSE;
StringNCpy_0 (str, name, sizeof (str));
- ptr1 = GetDashOrSpace (str);
+ ptr1 = GetDash (str);
if (ptr1 != NULL) {
*ptr1 = '\0';
ptr1++;
- ptr2 = GetDashOrSpace (ptr1);
+ ptr2 = GetDash (ptr1);
if (ptr2 != NULL) {
*ptr2 = '\0';
ptr2++;
@@ -8323,6 +8929,42 @@ static Boolean CollectionDateIsValid (CharPtr name)
return FALSE;
}
+static Boolean PrimerSeqIsValid (CharPtr name)
+/* Returns TRUE when name passes all of the PCR primer sequence format checks below, FALSE otherwise. */
+{
+ Char ch;
+ size_t len;
+ CharPtr ptr;
+ /* reject input that StringHasNoText reports as having no text, and zero-length input */
+ if (StringHasNoText (name)) return FALSE;
+ len = StringLen (name);
+ if (len < 1) return FALSE;
+ /* a value with commas must begin with an open parenthesis and end with a close parenthesis; a value without commas may not contain parentheses */
+ if (StringChr (name, ',') != NULL) {
+ if (name [0] != '(' || name [len - 1] != ')') return FALSE;
+ } else {
+ if (StringChr (name, '(') != NULL) return FALSE;
+ if (StringChr (name, ')') != NULL) return FALSE;
+ }
+ /* semicolons and spaces are rejected anywhere in the value */
+ if (StringChr (name, ';') != NULL) return FALSE;
+ if (StringChr (name, ' ') != NULL) return FALSE;
+ /* every character other than parentheses and commas must be a letter that, after TO_UPPER, is in the set ABCDGHKMNRSTVWY */
+ ptr = name;
+ ch = *ptr;
+ while (ch != '\0') {
+ if (ch != '(' && ch != ')' && ch != ',') {
+ if (! (IS_ALPHA (ch))) return FALSE;
+ ch = TO_UPPER (ch);
+ if (StringChr ("ABCDGHKMNRSTVWY", ch) == NULL) return FALSE;
+ }
+ ptr++;
+ ch = *ptr;
+ }
+ /* all characters were acceptable */
+ return TRUE;
+}
+
static CharPtr source_qual_prefixes [] = {
"acronym:",
"anamorph:",
@@ -8349,6 +8991,10 @@ static CharPtr source_qual_prefixes [] = {
"forma:",
"forma_specialis:",
"frequency:",
+ "fwd_pcr_primer_name",
+ "fwd_pcr_primer_seq",
+ "fwd_primer_name",
+ "fwd_primer_seq",
"genotype:",
"germline:",
"group:",
@@ -8367,6 +9013,10 @@ static CharPtr source_qual_prefixes [] = {
"plastid_name:",
"pop_variant:",
"rearranged:",
+ "rev_pcr_primer_name",
+ "rev_pcr_primer_seq",
+ "rev_primer_name",
+ "rev_primer_seq",
"right_primer:",
"segment:",
"serogroup:",
@@ -8489,7 +9139,7 @@ static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSour
if (StringHasNoText (countryname)) {
countryname = "?";
}
- ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname);
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname);
}
} else if (ssp->subtype == SUBSRC_chromosome) {
chromcount++;
@@ -8523,6 +9173,10 @@ static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSour
if (! CollectionDateIsValid (ssp->name)) {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionDate, "Collection_date format is not in DD-Mmm-YYYY format");
}
+ } else if (ssp->subtype == SUBSRC_fwd_primer_seq || ssp->subtype == SUBSRC_rev_primer_seq) {
+ if (! PrimerSeqIsValid (ssp->name)) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence, "PCR primer sequence format is incorrect");
+ }
}
ssp = ssp->next;
}
@@ -8731,6 +9385,9 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V
OrgRefPtr this_org = NULL, that_org = NULL;
int tmpval;
Char buf1[20], buf2[20];
+ EMBLBlockPtr ebp;
+ GBBlockPtr gbp;
+ ValNodePtr keywords = NULL;
PubdescPtr pdp;
MolInfoPtr mip;
Uint2 olditemtype = 0;
@@ -8739,6 +9396,8 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V
GatherContextPtr gcp = NULL;
CharPtr str;
SeqFeatPtr sfp;
+ Boolean tpa_exp;
+ Boolean tpa_inf;
BioseqPtr bsp;
DatePtr dp;
SeqMgrFeatContext fcontext;
@@ -8859,12 +9518,24 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple GenBank blocks");
else
bvsp->last_gb = vnp;
+ if (vnp != NULL) {
+ gbp = (GBBlockPtr) vnp->data.ptrvalue;
+ if (gbp != NULL) {
+ keywords = gbp->keywords;
+ }
+ }
break;
case Seq_descr_embl:
if (bvsp->last_embl != NULL)
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple EMBL blocks");
else
bvsp->last_embl = vnp;
+ if (vnp != NULL) {
+ ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
+ if (ebp != NULL) {
+ keywords = ebp->keywords;
+ }
+ }
break;
case Seq_descr_pir:
if (bvsp->last_pir != NULL)
@@ -9109,6 +9780,20 @@ static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, V
break;
}
+ if (keywords != NULL) {
+ tpa_exp = FALSE;
+ tpa_inf = FALSE;
+ for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
+ if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) {
+ tpa_exp = TRUE;
+ } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) {
+ tpa_inf = TRUE;
+ }
+ }
+ if (tpa_exp && tpa_inf) {
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "TPA:experimental and TPA:inferential should not both be in the same set of keywords");
+ }
+ }
if (gcp != NULL) {
gcp->itemID = olditemid;
@@ -9223,6 +9908,20 @@ static Boolean GPSorNTorNC (SeqEntryPtr sep, SeqLocPtr location)
return FALSE;
}
+static Boolean IsGenBankAccn (SeqEntryPtr sep, SeqLocPtr location) /* TRUE if the Bioseq found for location has a SEQID_GENBANK id; sep is never read in this body */
+{
+ BioseqPtr bsp;
+ SeqIdPtr sip;
+
+ bsp = BioseqFindFromSeqLoc (location); /* a NULL result falls through to return FALSE */
+ if (bsp != NULL) {
+ for (sip = bsp->id; sip != NULL; sip = sip->next) { /* walk every id on the Bioseq */
+ if (sip->choice == SEQID_GENBANK) return TRUE;
+ }
+ }
+ return FALSE;
+}
+
static Boolean NGorNT (SeqEntryPtr sep, SeqLocPtr location, BoolPtr is_nc)
{
BioseqPtr bsp;
@@ -10110,6 +10809,54 @@ static Boolean HaveUniqueFeatIDXrefs (SeqFeatXrefPtr xref1, SeqFeatXrefPtr xref2
return FALSE;
}
+#define SMALL_RIBOSOMAL_SUBUNIT 1 /* WhichRNA result: name starts with "small ", "18S " or "18 " */
+#define INTERNAL_SPACER_1 2 /* WhichRNA result: internal transcribed spacer 1 */
+#define MIDDLE_RIBOSOMAL_SUBUNIT 3 /* WhichRNA result: name starts with "5.8S " or "5.8 " */
+#define INTERNAL_SPACER_2 4 /* WhichRNA result: internal transcribed spacer 2 */
+#define LARGE_RIBOSOMAL_SUBUNIT 5 /* WhichRNA result: name starts with "large ", "26S ", "28S ", "26 " or "28 " */
+#define INTERNAL_SPACER_X 6 /* WhichRNA result: spacer named without a number ("internal transcribed spacer" or "ITS") */
+#define TRANSFER_RNA 7 /* WhichRNA result: RnaRef type 3 */
+
+static Int2 WhichRNA (SeqFeatPtr sfp)
+/* Classifies an RNA feature as one of the constants SMALL_RIBOSOMAL_SUBUNIT .. TRANSFER_RNA; returns 0 when sfp is NULL, is not an RNA feature, or is not recognized. */
+{
+ RnaRefPtr rrp;
+ CharPtr str;
+
+ if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return 0;
+ rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
+ if (rrp == NULL) return 0;
+ if (rrp->type == 3) { /* type 3 is reported as tRNA regardless of its name */
+ return TRANSFER_RNA;
+ }
+ if (rrp->ext.choice != 1) return 0; /* the remaining tests need ext to hold a name string (choice 1) */
+ str = (CharPtr) rrp->ext.value.ptrvalue;
+ if (StringHasNoText (str)) return 0;
+ if (rrp->type == 4) { /* type 4 (presumably rRNA - confirm): classify by name prefix via StringNICmp */
+ if (StringNICmp (str, "small ", 6) == 0) return SMALL_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "18S ", 4) == 0) return SMALL_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "5.8S ", 5) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "large ", 6) == 0) return LARGE_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "26S ", 4) == 0) return LARGE_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "28S ", 4) == 0) return LARGE_RIBOSOMAL_SUBUNIT;
+ /* variant spellings */
+ if (StringNICmp (str, "18 ", 3) == 0) return SMALL_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "5.8 ", 4) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "26 ", 3) == 0) return LARGE_RIBOSOMAL_SUBUNIT;
+ if (StringNICmp (str, "28 ", 3) == 0) return LARGE_RIBOSOMAL_SUBUNIT;
+ }
+ if (rrp->type == 255) { /* type 255 (presumably "other" RNA - confirm): spacers are matched on the whole name via StringICmp */
+ if (StringICmp (str, "internal transcribed spacer 1") == 0) return INTERNAL_SPACER_1;
+ if (StringICmp (str, "internal transcribed spacer 2") == 0) return INTERNAL_SPACER_2;
+ /* variant spellings */
+ if (StringICmp (str, "internal transcribed spacer1") == 0) return INTERNAL_SPACER_1;
+ if (StringICmp (str, "internal transcribed spacer2") == 0) return INTERNAL_SPACER_2;
+ if (StringICmp (str, "internal transcribed spacer") == 0) return INTERNAL_SPACER_X;
+ if (StringICmp (str, "ITS") == 0) return INTERNAL_SPACER_X;
+ }
+ return 0; /* unrecognized RNA type or name */
+}
+
static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bvsp)
{
@@ -10142,6 +10889,11 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
CharPtr lastLabel;
CharPtr message;
Int2 i;
+ Boolean isCuratedFlybase = FALSE;
+ Boolean isDrosophila = FALSE;
+ Boolean isGenBankAccn = FALSE;
+ Boolean isGPSorNTorNC = FALSE;
+ Boolean isViral = FALSE;
Int2 j;
CdRegionPtr crp;
Uint1 frame;
@@ -10150,6 +10902,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
int overlapPepSev;
BioSourcePtr biop = NULL, lastbiop;
OrgRefPtr orp = NULL;
+ OrgNamePtr onp = NULL;
Int4 fiveUTRright;
Int4 cdsRight;
Int4 threeUTRright;
@@ -10162,12 +10915,13 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
Int2 numBadFullSource;
SubSourcePtr sbsp;
Int2 numgene, numcds, nummrna, numcdsproducts, nummrnaproducts,
- numcdspseudo, nummrnapseudo;
+ numcdspseudo, nummrnapseudo, lastrnatype, thisrnatype;
Boolean cds_products_unique = TRUE, mrna_products_unique = TRUE,
suppress_duplicate_messages = FALSE, pseudo;
SeqIdPtr sip;
Char buf [64];
SeqFeatXrefPtr xref = NULL;
+ CharPtr except_text = NULL;
ValNodePtr vnp, cds_prod_head = NULL, mrna_prod_head = NULL,
lastcdsprod = NULL, lastmrnaprod = NULL;
@@ -10350,10 +11104,31 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
ValNodeFreeData (cds_prod_head);
ValNodeFreeData (mrna_prod_head);
+ /*
+ SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop);
+ */
+ BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop);
+ if (biop != NULL) {
+ orp = biop->org;
+ if (orp != NULL) {
+ /* curated fly source still has duplicate features */
+ if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) {
+ isDrosophila = TRUE;
+ }
+ onp = orp->orgname;
+ if (onp != NULL) {
+ if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) {
+ isViral = TRUE;
+ }
+ }
+ }
+ }
+
sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
while (sfp != NULL) {
leave = TRUE;
if (last != NULL) {
+ ivalssame = FALSE;
if (fcontext.left == left && fcontext.right == right && fcontext.featdeftype == featdeftype) {
if (fcontext.strand == strand || strand == Seq_strand_unknown || fcontext.strand == Seq_strand_unknown) {
ivalssame = TRUE;
@@ -10389,21 +11164,30 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
featdeftype == FEATDEF_REGION || featdeftype == FEATDEF_misc_feature || featdeftype == FEATDEF_STS || featdeftype == FEATDEF_variation) {
severity = SEV_WARNING;
} else {
- if (! GPSorNTorNC (vsp->sep, sfp->location)) {
- severity = SEV_WARNING;
- } else {
- if (orp == NULL) {
- SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop);
- if (biop != NULL) {
- orp = biop->org;
+ if (isGPSorNTorNC || GPSorNTorNC (vsp->sep, sfp->location)) {
+ isGPSorNTorNC = TRUE;
+ if (! isCuratedFlybase) {
+ if (isDrosophila) {
+ isCuratedFlybase = TRUE;
}
}
- if (orp != NULL) {
+ if (isCuratedFlybase) {
/* curated fly source still has duplicate features */
- if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) {
- severity = SEV_WARNING;
+ severity = SEV_WARNING;
+ }
+ } else if (isGenBankAccn || IsGenBankAccn (vsp->sep, sfp->location)) {
+ isGenBankAccn = TRUE;
+ if (! isCuratedFlybase) {
+ if (isDrosophila) {
+ isCuratedFlybase = TRUE;
}
}
+ if (isCuratedFlybase) {
+ /* curated fly source still has duplicate features */
+ severity = SEV_WARNING;
+ }
+ } else {
+ severity = SEV_WARNING;
}
}
/* if different CDS frames, lower to warning */
@@ -10447,7 +11231,15 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
} else {
if (suppress_duplicate_messages && (featdeftype == FEATDEF_CDS || featdeftype == FEATDEF_mRNA) && HaveUniqueFeatIDXrefs (xref, sfp->xref)) {
/* do not report CDS or mRNA if every one has a unique product and unique featID xrefs */
+ } else if (featdeftype == FEATDEF_GENE &&
+ StringStr (sfp->except_text, "dicistronic gene") != NULL &&
+ StringStr (except_text, "dicistronic gene") != NULL &&
+ isCuratedFlybase) {
+ /* do not report genes marked dicistronic */
} else {
+ if (featdeftype == FEATDEF_GENE && isViral && (sfp->partial || last->partial)) {
+ severity = SEV_WARNING;
+ }
ValidErr (vsp, severity, ERR_SEQ_FEAT_DuplicateFeat, "Features have identical intervals, but labels differ");
}
}
@@ -10505,6 +11297,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
ivals = fcontext.ivals;
sap = fcontext.sap;
xref = sfp->xref;
+ except_text = sfp->except_text;
frame = 0;
if (sfp->data.choice == SEQFEAT_CDREGION) {
crp = (CdRegionPtr) sfp->data.value.ptrvalue;
@@ -10621,22 +11414,18 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
while (sfp != NULL) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
if (sfp->idx.subtype == FEATDEF_3UTR && utr3count < 2) {
if (fcontext.strand != Seq_strand_minus) {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on minus strand");
} else if (threeUTRright > 0) {
if (threeUTRright + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR");
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
}
threeUTRright = fcontext.right;
@@ -10644,18 +11433,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
cdsRight = fcontext.right;
if (threeUTRright > 0 && firstCDS) {
if (threeUTRright + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR");
- vsp->sfp = NULL;
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
}
firstCDS = FALSE;
@@ -10664,17 +11442,7 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR is not on minus strand");
} else if (cdsRight > 0) {
if (cdsRight + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS");
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
}
threeUTRright = fcontext.right;
@@ -10686,6 +11454,12 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
while (sfp != NULL) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
if (sfp->idx.subtype == FEATDEF_5UTR && utr5count < 2) {
if (fcontext.strand == Seq_strand_minus) {
if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) {
@@ -10699,22 +11473,11 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
cdsRight = fcontext.right;
if (fiveUTRright > 0 && firstCDS) {
if (fiveUTRright + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) {
/* ignore */
} else {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS");
}
- vsp->sfp = NULL;
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
}
firstCDS = FALSE;
@@ -10723,31 +11486,11 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on plus strand");
} else if (threeUTRright > 0) {
if (threeUTRright + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR");
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
} else if (cdsRight > 0) {
if (cdsRight + 1 != fcontext.left) {
- if (gcp != NULL) {
- gcp->itemID = fcontext.itemID;
- gcp->thistype = OBJ_SEQFEAT;
- }
- vsp->descr = NULL;
- vsp->sfp = sfp;
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR");
- if (gcp != NULL) {
- gcp->itemID = olditemid;
- gcp->thistype = olditemtype;
- }
}
}
threeUTRright = fcontext.right;
@@ -10757,6 +11500,148 @@ static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bv
}
}
+ if (! bvsp->is_mrna) {
+ last = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &fcontext);
+ if (last != NULL) {
+ lastrnatype = WhichRNA (last);
+ left = fcontext.left;
+ right = fcontext.right;
+ strand = fcontext.strand;
+ sfp = SeqMgrGetNextFeature (bsp, last, SEQFEAT_RNA, 0, &fcontext);
+ while (sfp != NULL) {
+ thisrnatype = WhichRNA (sfp);
+ if (fcontext.strand == strand || (strand != Seq_strand_minus && fcontext.strand != Seq_strand_minus)) {
+ if (lastrnatype != 0 && thisrnatype != 0) {
+ if (right + 1 < fcontext.left) {
+ /* gap */
+ if (strand == Seq_strand_minus) {
+ if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) {
+ /* okay in mitochondria */
+ } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component");
+ }
+ } else {
+ if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) {
+ /* okay in mitochondria */
+ } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component");
+ }
+ }
+ } else if (right + 1 > fcontext.left) {
+ /* overlaps */
+ if (strand == Seq_strand_minus) {
+ if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "tRNA overlaps adjacent rRNA component");
+ } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS overlaps adjacent rRNA component");
+ }
+ } else {
+ if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) {
+ /* okay in mitochondria */
+ } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS overlaps adjacent rRNA component");
+ }
+ }
+ } else {
+ /* abuts */
+ if (strand == Seq_strand_minus) {
+ if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == SMALL_RIBOSOMAL_SUBUNIT)) {
+ /* okay in mitochondria */
+ } else if ((lastrnatype == LARGE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_1) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != SMALL_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "Problem with order of abutting rRNA components");
+ }
+ } else {
+ if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype == TRANSFER_RNA) ||
+ (lastrnatype == TRANSFER_RNA && thisrnatype == LARGE_RIBOSOMAL_SUBUNIT)) {
+ /* okay in mitochondria */
+ } else if ((lastrnatype == SMALL_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_1) ||
+ (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_2) ||
+ (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) ||
+ (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != LARGE_RIBOSOMAL_SUBUNIT)) {
+ if (gcp != NULL) {
+ gcp->itemID = fcontext.itemID;
+ gcp->thistype = OBJ_SEQFEAT;
+ }
+ vsp->descr = NULL;
+ vsp->sfp = sfp;
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "Problem with order of abutting rRNA components");
+ }
+ }
+ }
+ }
+ }
+ last = sfp;
+ left = fcontext.left;
+ right = fcontext.right;
+ strand = fcontext.strand;
+ lastrnatype = thisrnatype;
+ sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &fcontext);
+ }
+ }
+ }
+
+ vsp->sfp = NULL;
+ if (gcp != NULL) {
+ gcp->itemID = olditemid;
+ gcp->thistype = olditemtype;
+ }
+
mrna = SeqMgrGetRNAgivenProduct (bsp, &fcontext);
if (mrna != NULL) {
genomicgrp = SeqMgrGetGeneXref (mrna);
@@ -12144,9 +13029,7 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt
GeneticCodePtr gncp;
Uint2 idx;
Int2 j;
- SeqEntryPtr sep;
ErrSev sev = SEV_ERROR;
- Uint1 shift;
SeqMapTablePtr smtp;
Uint1 taa;
ValNodePtr vnp;
@@ -12188,8 +13071,11 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt
if (trp->codon[j] < 64) {
if (codes == NULL) {
bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
+ /*
sep = GetBestTopParentForData (gcp->entityID, bsp);
code = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
+ */
+ BioseqToGeneticCode (bsp, &code, NULL, NULL, NULL, 0, NULL);
gncp = GeneticCodeFind (code, NULL);
if (gncp == NULL) {
gncp = GeneticCodeFind (1, NULL);
@@ -12207,12 +13093,14 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt
taa = codes[trp->codon[j]];
if (aa > 0 && aa != 255) {
if (taa != aa) {
- if (aa == 'U') {
+ if (aa == 'U' || aa == 'O') {
sev = SEV_WARNING;
}
if (aa == 'U' && taa == '*' && trp->codon [j] == 14) {
/* selenocysteine normally uses TGA (14), so ignore without requiring exception in record */
- /* TAG (11) is used for pyrrolysine in archaebacteria */
+ } else if (aa == 'O' && taa == '*' && trp->codon [j] == 11) {
+ /* pyrrolysine normally uses TAG (11) in archaebacteria, so ignore without requiring exception in record */
+
/* TAA (10) is not yet known to be used for an exceptional amino acid */
} else if (StringISearch (sfp->except_text, "modified codon recognition") == NULL) {
ValidErr (vsp, sev, ERR_SEQ_FEAT_TrnaCodonWrong, "tRNA codon does not match genetic code");
@@ -12225,6 +13113,7 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt
}
if (aa > 0 && aa != 255) {
+ /* - no gaps now that O and J are added
if (aa <= 74) {
shift = 0;
} else if (aa > 79) {
@@ -12232,16 +13121,19 @@ static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPt
} else {
shift = 1;
}
+ */
if (aa != '*') {
- idx = aa - (64 + shift);
+ idx = aa - (64 /* + shift */);
} else {
- idx = 25;
+ idx = 25; /* termination */
}
- if (idx > 0 && idx < 26) {
+ if (idx > 0 && idx < 28) {
/* valid trna amino acid */
} else {
ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
}
+ } else {
+ ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
}
}
@@ -12752,6 +13644,7 @@ static CharPtr legal_exception_strings [] = {
"nonconsensus splice site",
"modified codon recognition",
"alternative start codon",
+ "dicistronic gene",
NULL
};
@@ -12760,6 +13653,7 @@ static CharPtr refseq_exception_strings [] = {
"unclassified translation discrepancy",
"mismatches in transcription",
"mismatches in translation",
+ "adjusted for low-quality genome",
NULL
};
@@ -12847,7 +13741,9 @@ static void ValidateExceptText (ValidStructPtr vsp, GatherContextPtr gcp, SeqFea
typedef struct samecds { /* state shared between SameAsCDS and its FindSameCDS callback */
Boolean found; /* set TRUE by FindSameCDS on a match; returned by SameAsCDS */
SeqMgrFeatContextPtr gcontext; /* context of the feature passed to SameAsCDS, compared against each CDS context */
+ Uint2 slpTag; /* partial flag under test; FindSameCDS compares it with SLP_NOSTART and SLP_NOSTOP */
Uint1 subtype; /* idx.subtype of the feature passed to SameAsCDS */
+ Boolean bypassGeneTest; /* set TRUE when the tested feature extends past the CDS at the partial end; copied out through bypassGeneTestP */
} SameCds, PNTR SameCdsPtr;
static Boolean LIBCALLBACK FindSameCDS (SeqFeatPtr sfp, SeqMgrFeatContextPtr ccontext)
@@ -12893,16 +13789,57 @@ static Boolean LIBCALLBACK FindSameCDS (SeqFeatPtr sfp, SeqMgrFeatContextPtr cco
return FALSE;
}
}
- } else if (gcontext->left == ccontext->left || gcontext->right == ccontext->right) {
- /* if either end of CDS and mRNA is identical, okay to suppress partial warning */
- same->found = TRUE;
- return FALSE;
+ } else if (SeqLocAinB (sfp->location, gcontext->sfp->location) > 0) {
+
+ if (ccontext->strand == Seq_strand_minus || gcontext->strand == Seq_strand_minus) {
+ if (same->slpTag == SLP_NOSTART && gcontext->partialL) {
+ if (gcontext->right == ccontext->right) {
+ same->found = TRUE;
+ return FALSE;
+ }
+ if (gcontext->right > ccontext->right) {
+ same->bypassGeneTest = TRUE;
+ return FALSE;
+ }
+ } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) {
+ if (gcontext->left == ccontext->left) {
+ same->found = TRUE;
+ return FALSE;
+ }
+ if (gcontext->left < ccontext->left) {
+ same->bypassGeneTest = TRUE;
+ return FALSE;
+ }
+ }
+
+ } else {
+
+ if (same->slpTag == SLP_NOSTART && gcontext->partialL) {
+ if (gcontext->left == ccontext->left) {
+ same->found = TRUE;
+ return FALSE;
+ }
+ if (gcontext->left < ccontext->left) {
+ same->bypassGeneTest = TRUE;
+ return FALSE;
+ }
+ } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) {
+ if (gcontext->right == ccontext->right) {
+ same->found = TRUE;
+ return FALSE;
+ }
+ if (gcontext->right > ccontext->right) {
+ same->bypassGeneTest = TRUE;
+ return FALSE;
+ }
+ }
+ }
}
}
return TRUE;
}
-static Boolean SameAsCDS (SeqFeatPtr sfp)
+static Boolean SameAsCDS (SeqFeatPtr sfp, Uint2 slpTag, BoolPtr bypassGeneTestP)
{
BioseqPtr bsp;
@@ -12915,10 +13852,15 @@ static Boolean SameAsCDS (SeqFeatPtr sfp)
if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &gcontext) != sfp) return FALSE;
same.found = FALSE;
same.gcontext = &gcontext;
+ same.slpTag = slpTag;
same.subtype = sfp->idx.subtype;
+ same.bypassGeneTest = FALSE;
MemSet ((Pointer) &cdsFilt, 0, sizeof (cdsFilt));
cdsFilt [SEQFEAT_CDREGION] = TRUE;
SeqMgrExploreFeatures (bsp, (Pointer) &same, FindSameCDS, sfp->location, cdsFilt, NULL);
+ if (bypassGeneTestP != NULL) {
+ *bypassGeneTestP = same.bypassGeneTest;
+ }
return same.found;
}
@@ -13568,6 +14510,49 @@ static void ValidateGoTermsSfp (
}
}
+static void LookForAccnLocs (SeqIdPtr sip, Pointer userdata)
+/* Seq-id callback (handed to VisitSeqIdsInSeqLoc): sets the Boolean that userdata points to to TRUE when sip is one of the listed id types and has a non-empty accession; never sets it to FALSE. */
+{
+ BoolPtr bp;
+ TextSeqIdPtr tsip;
+ /* nothing to inspect, or nowhere to record the result */
+ if (sip == NULL || userdata == NULL) return;
+ bp = (BoolPtr) userdata;
+ /* only the id choices listed here are examined; every other choice takes the default branch and is ignored */
+ switch (sip->choice) {
+ case SEQID_GENBANK :
+ case SEQID_EMBL :
+ case SEQID_DDBJ :
+ case SEQID_TPG :
+ case SEQID_TPE :
+ case SEQID_TPD :
+ case SEQID_OTHER :
+ tsip = (TextSeqIdPtr) sip->data.ptrvalue; /* for these choices the payload is read as a TextSeqId */
+ if (tsip != NULL) {
+ if (StringDoesHaveText (tsip->accession)) {
+ *bp = TRUE;
+ }
+ }
+ break;
+ default :
+ break;
+ }
+}
+
+static CharPtr infMessage [] = { /* NULL-terminated message table; presumably indexed by the code from ValidateInferenceQualifier - the indexing code is outside this view, confirm */
+ "unknown error",
+ "empty inference string",
+ "bad inference prefix",
+ "bad inference body",
+ "single inference field",
+ "spaces in inference",
+ "same species misused",
+ "bad inference accession",
+ "bad inference accession version",
+ "accession.version not public",
+ NULL
+};
+
NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
{
Int2 type, i, j;
@@ -13590,7 +14575,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
tRNAPtr trp;
GBQualPtr gbq;
Boolean pseudo, excpt, conflict, codonqual,
- anticodonqual, protidqual, transidqual, ovgenepseudo;
+ anticodonqual, productqual, protidqual,
+ transidqual, ovgenepseudo;
ImpFeatPtr ifp;
GeneRefPtr grp;
ProtRefPtr prp;
@@ -13641,6 +14627,10 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
Boolean hasxref;
CharPtr sfp_old_locus_tag;
CharPtr gene_old_locus_tag;
+ Boolean bypassGeneTest;
+ Boolean dicistronic = FALSE;
+ Int2 inferenceCode;
+ Boolean accn_seqid;
vsp = (ValidStructPtr) (gcp->userdata);
@@ -13653,6 +14643,26 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
ValidateSeqLoc (vsp, sfp->product, "Product");
+ if (vsp->feat_loc_has_gi) {
+ accn_seqid = FALSE;
+ VisitSeqIdsInSeqLoc (sfp->location, (Pointer) &accn_seqid, LookForAccnLocs);
+ if (accn_seqid) {
+ if (! vsp->is_smupd_in_sep) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature location refers to accession");
+ }
+ }
+ }
+
+ if (vsp->feat_prod_has_gi) {
+ accn_seqid = FALSE;
+ VisitSeqIdsInSeqLoc (sfp->product, (Pointer) &accn_seqid, LookForAccnLocs);
+ if (accn_seqid) {
+ if (! vsp->is_smupd_in_sep) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature product refers to accession");
+ }
+ }
+ }
+
partials[0] = SeqLocPartialCheck (sfp->product);
partials[1] = SeqLocPartialCheck (sfp->location);
if ((partials[0] != SLP_COMPLETE) || (partials[1] != SLP_COMPLETE) || (sfp->partial)) { /* partialness */
@@ -13729,21 +14739,21 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
for (i = 0; i < 2; i++) {
errtype = SLP_NOSTART;
for (j = 0; j < 4; j++) {
+ bypassGeneTest = FALSE;
if (partials[i] & errtype) {
if (i == 1 && j < 2 && IsCddFeat (sfp)) {
/* suppresses warning */
- } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_GENE && SameAsCDS (sfp)) {
+ } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_GENE && SameAsCDS (sfp, errtype, NULL)) {
/*
ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem,
"%s: %s",
parterr[i], parterrs[j]);
*/
} else if (i == 1 && j < 2 && sfp->idx.subtype == SEQFEAT_GENE && SameAsMRNA (sfp)) {
- } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsCDS (sfp)) {
- } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsGene (sfp)) {
- /*
- } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_CDREGION && SameAsMRNA (sfp)) {
- */
+ } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && SameAsCDS (sfp, errtype, &bypassGeneTest)) {
+ } else if (i == 1 && j < 2 && sfp->idx.subtype == FEATDEF_mRNA && (! bypassGeneTest) && SameAsGene (sfp)) {
+ } else if (i == 1 && j < 2 && sfp->data.choice == SEQFEAT_CDREGION && SameAsMRNA (sfp) &&
+ PartialAtSpliceSiteOrGap (sfp->location, errtype, &isgap, &badseq)) {
} else if (i == 1 && j < 2 && PartialAtSpliceSiteOrGap (sfp->location, errtype, &isgap, &badseq)) {
if (! isgap) {
if (sfp->idx.subtype != FEATDEF_CDS || SplicingNotExpected (sfp)) {
@@ -14031,6 +15041,19 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProteinNameEndsInBracket, "Protein name ends with bracket and may contain organism name");
}
}
+ if (StringNICmp (str, "hypothetical protein XP_", 24) == 0) {
+ bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
+ if (bsp != NULL) {
+ for (sip = bsp->id; sip != NULL; sip = sip->next) {
+ if (sip->choice != SEQID_OTHER) continue;
+ tsip = (TextSeqIdPtr) sip->data.ptrvalue;
+ if (tsip == NULL) continue;
+ if (StringICmp (tsip->accession, str + 21) != 0) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_HpotheticalProteinMismatch, "Hypothetical protein reference does not match accession");
+ }
+ }
+ }
+ }
}
if (str != NULL && sfp->comment != NULL) {
if (StringCmp (str, sfp->comment) == 0) {
@@ -14126,16 +15149,22 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
}
if (rrp->type == 3) { /* tRNA */
anticodonqual = FALSE;
+ productqual = FALSE;
gbq = sfp->qual;
while (gbq != NULL) {
if (StringICmp (gbq->qual, "anticodon") == 0) {
anticodonqual = TRUE;
+ } else if (StringICmp (gbq->qual, "product") == 0) {
+ productqual = TRUE;
}
gbq = gbq->next;
}
if (anticodonqual) {
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed anticodon qualifier in tRNA");
}
+ if (productqual) {
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA");
+ }
}
if (rrp->type == 3 && rrp->ext.choice == 1) { /* tRNA with string extension */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA");
@@ -14314,6 +15343,15 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Qualifier other than replace has just quotation marks");
}
}
+ if (StringICmp (gbq->qual, "inference") == 0) {
+ inferenceCode = ValidateInferenceQualifier (gbq->val, TRUE);
+ if (inferenceCode != VALID_INFERENCE) {
+ if (inferenceCode < VALID_INFERENCE || inferenceCode > ACC_VERSION_NOT_PUBLIC) {
+ inferenceCode = VALID_INFERENCE;
+ }
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidInferenceValue, "Inference qualifier problem - %s", infMessage [(int) inferenceCode]);
+ }
+ }
}
if (sfp->product != NULL) {
@@ -14347,7 +15385,13 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
break;
}
}
+ bsp = BioseqFindFromSeqLoc (sfp->location);
protBsp = BioseqFindFromSeqLoc (sfp->product);
+ if (bsp != NULL && protBsp != NULL) {
+ if (bsp == protBsp) {
+ ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
+ }
+ }
if (protBsp != NULL && protBsp->id != NULL) {
for (sip = protBsp->id; sip != NULL; sip = sip->next) {
switch (sip->choice) {
@@ -14450,6 +15494,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
if (sfpx == NULL) {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene,
"Feature has gene locus cross-reference but no equivalent gene feature exists");
+ } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) {
+ dicistronic = TRUE;
}
}
}
@@ -14460,6 +15506,8 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
if (sfpx == NULL) {
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene,
"Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
+ } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) {
+ dicistronic = TRUE;
}
}
}
@@ -14508,8 +15556,12 @@ NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
}
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryGeneXref, "Unnecessary gene cross-reference %s", label);
} else {
- if (GPSorNTorNC (vsp->sep, sfp->location)) {
+ if ((! dicistronic) && GPSorNTorNC (vsp->sep, sfp->location)) {
+ /*
SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop);
+ */
+ bsp = BioseqFindFromSeqLoc (sfp->location);
+ BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop);
if (biop != NULL) {
orp = biop->org;
if (orp != NULL) {
@@ -14558,6 +15610,7 @@ static CharPtr bypass_mrna_trans_check [] = {
"artificial frameshift",
"unclassified transcription discrepancy",
"mismatches in transcription",
+ "adjusted for low-quality genome",
NULL
};
@@ -14873,6 +15926,7 @@ static CharPtr bypass_cds_trans_check [] = {
"rearrangement required for product",
"unclassified translation discrepancy",
"mismatches in translation",
+ "adjusted for low-quality genome",
NULL
};
@@ -14909,6 +15963,11 @@ NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
StreamCache sc;
Boolean isgap;
Boolean badseq;
+ BioseqPtr bsp;
+ SeqIdPtr sip;
+ Boolean is_ged = FALSE;
+ Boolean is_refseq = FALSE;
+ Boolean has_gi = FALSE;
if (sfp == NULL)
@@ -15155,6 +16214,32 @@ NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
sev = SEV_WARNING;
}
if (report_errors || unclassified_except) {
+ bsp = BioseqFindFromSeqLoc (sfp->location);
+ if (bsp != NULL) {
+ for (sip = bsp->id; sip != NULL; sip = sip->next) {
+ switch (sip->choice) {
+ case SEQID_GI :
+ has_gi = TRUE;
+ break;
+ case SEQID_GENBANK :
+ case SEQID_EMBL :
+ case SEQID_DDBJ :
+ case SEQID_TPG :
+ case SEQID_TPE :
+ case SEQID_TPD :
+ is_ged = TRUE;
+ break;
+ case SEQID_OTHER :
+ is_refseq = TRUE;
+ break;
+ default :
+ break;
+ }
+ }
+ if (has_gi && is_ged && (! is_refseq)) {
+ sev = SEV_REJECT;
+ }
+ }
ValidErr (vsp, sev, ERR_SEQ_FEAT_InternalStop, "%ld internal stops. Genetic code [%d]", (long) stop_count, gccode);
}
}
@@ -15400,6 +16485,13 @@ erret:
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual. Skipped");
}
}
+ } else {
+ if (transl_except) {
+ has_errors = TRUE;
+ if (report_errors) {
+ ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual (but protein is okay). Skipped");
+ }
+ }
}
if (prot2seq != NULL)
@@ -15466,7 +16558,8 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll)
if (sfp->excpt) {
if (StringISearch (sfp->except_text, "ribosomal slippage") != NULL||
StringISearch (sfp->except_text, "artificial frameshift") != NULL ||
- StringISearch (sfp->except_text, "nonconsensus splice site") != NULL) {
+ StringISearch (sfp->except_text, "nonconsensus splice site") != NULL ||
+ StringISearch (sfp->except_text, "adjusted for low-quality genome") != NULL) {
report_errors = FALSE;
}
}
@@ -15605,6 +16698,7 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll)
}
if (((checkAll && (!lastPartial)) || ctr < total) && (stp < (len - 2))) { /* check donor on all but last exon and on sequence */
+ tbuf[0] = '\0';
StreamCacheSetPosition (&sc, stp + 1);
residue1 = StreamCacheGetResidue (&sc);
residue2 = StreamCacheGetResidue (&sc);
@@ -15694,6 +16788,7 @@ static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll)
tbuf[0] = '\0';
if (bsp == NULL) {
StringCpy (tbuf, "?");
+ SeqIdWrite (sip, tbuf, PRINTID_FASTA_SHORT, 39);
} else if (vsp->suppressContext || vsp->convertGiToAccn) {
WorstBioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
} else {