diff options
Diffstat (limited to 'api/sqnutil1.c')
-rw-r--r-- | api/sqnutil1.c | 387 |
1 files changed, 373 insertions, 14 deletions
diff --git a/api/sqnutil1.c b/api/sqnutil1.c index 623c5750..cf854efb 100644 --- a/api/sqnutil1.c +++ b/api/sqnutil1.c @@ -29,7 +29,7 @@ * * Version Creation Date: 9/2/97 * -* $Revision: 6.415 $ +* $Revision: 6.433 $ * * File Description: * @@ -1976,7 +1976,7 @@ NLM_EXTERN void PromoteXrefsExEx (SeqFeatPtr sfp, BioseqPtr bsp, Uint2 entityID, sfp = first; while (sfp != NULL) { if (sfp->data.choice == SEQFEAT_RNA && - sfp->idx.subtype != FEATDEF_tRNA && + /* sfp->idx.subtype != FEATDEF_tRNA && */ sfp->product == NULL && (! sfp->pseudo)) { gbq = sfp->qual; prevqual = (GBQualPtr PNTR) &(sfp->qual); @@ -2090,6 +2090,12 @@ NLM_EXTERN void PromoteXrefsExEx (SeqFeatPtr sfp, BioseqPtr bsp, Uint2 entityID, case FEATDEF_snoRNA : mip->biomol = MOLECULE_TYPE_SNORNA; break; + case FEATDEF_ncRNA : + mip->biomol = MOLECULE_TYPE_NCRNA; + break; + case FEATDEF_tmRNA : + mip->biomol = MOLECULE_TYPE_TMRNA; + break; default : mip->biomol = 0; break; @@ -3516,6 +3522,7 @@ static Boolean HandledGBQualOnCDS (SeqFeatPtr sfp, GBQualPtr gbq, ValNodePtr PNT return FALSE; } + static Boolean HandledGBQualOnRNA (SeqFeatPtr sfp, GBQualPtr gbq, Boolean isEmblOrDdbj) { @@ -3612,10 +3619,7 @@ static Boolean HandledGBQualOnRNA (SeqFeatPtr sfp, GBQualPtr gbq, Boolean isEmbl return TRUE; } } - if (rrp->type == 255 && - (StringICmp (name, "misc_RNA") == 0 || - StringICmp (name, "ncRNA") == 0 || - StringICmp (name, "tmRNA") == 0)) { + if (rrp->type == 255) { /* new convention follows ASN.1 spec comments, allows new RNA types */ return FALSE; } @@ -4314,12 +4318,12 @@ static void CleanupFeatureGBQuals (SeqFeatPtr sfp, Boolean isEmblOrDdbj) sfp->comment = str; } } else if (StringICmp (gbq->qual, "db_xref") == 0) { - vnp = ValNodeNew (NULL); - db = DbtagNew (); - vnp->data.ptrvalue = db; tag = gbq->val; ptr = StringChr (tag, ':'); if (ptr != NULL) { + vnp = ValNodeNew (NULL); + db = DbtagNew (); + vnp->data.ptrvalue = db; *ptr = '\0'; ptr++; db->db = StringSave (tag); @@ -4760,13 +4764,54 @@ static OrgModPtr SortOrgModList (OrgModPtr list) return list; } -static void CleanOrgModList (OrgModPtr PNTR ompp) +static void RemoveSpaceBeforeAndAfterColon (CharPtr str) { + CharPtr pColon, cp, src, dst; + + if (StringHasNoText (str)) { + return; + } + + pColon = StringChr (str, ':'); + while (pColon != NULL) { + cp = pColon - 1; + while (cp > str && isspace (*cp)) { + cp--; + } + if (cp < str || !isspace (*cp)) { + cp++; + } + *cp = ':'; + dst = cp + 1; + cp = pColon + 1; + while (isspace (*cp)) { + cp++; + } + src = cp; + pColon = dst - 1; + if (src != dst) { + while (*src != 0) { + *dst = *src; + dst++; src++; + } + *dst = 0; + } + pColon = StringChr (pColon + 1, ':'); + } +} + + +NLM_EXTERN void CleanOrgModList (OrgModPtr PNTR ompp) + +{ + Char ch; OrgModPtr last = NULL; OrgModPtr next; OrgModPtr omp; OrgModPtr PNTR prev; + CharPtr ptr; + CharPtr tmp; Boolean unlink; if (ompp == NULL) return; @@ -4793,6 +4838,8 @@ static void CleanOrgModList (OrgModPtr PNTR ompp) omp->subname = NULL; unlink = TRUE; } + } else if (HasNoText (omp->subname)) { + unlink = TRUE; } else { last = omp; } @@ -4806,6 +4853,28 @@ static void CleanOrgModList (OrgModPtr PNTR ompp) } omp = next; } + + for (omp = *ompp; omp != NULL; omp = omp->next) { + if (omp->subtype != ORGMOD_specimen_voucher && + omp->subtype != ORGMOD_culture_collection && + omp->subtype != ORGMOD_bio_material) continue; + if (StringHasNoText (omp->subname)) continue; + RemoveSpaceBeforeAndAfterColon (omp->subname); + ptr = StringStr (omp->subname, "::"); + if (ptr == NULL) continue; + ptr++; + tmp = ptr; + tmp++; + ch = *tmp; + while (ch != '\0') { + *ptr = ch; + ptr++; + tmp++; + ch = *tmp; + } + *ptr = '\0'; + + } } static int LIBCALLBACK SortBySubSourceSubtype (VoidPtr ptr1, VoidPtr ptr2) @@ -4936,7 +5005,29 @@ static CharPtr CombineSplitQual (CharPtr origval, CharPtr newval) return str; } -static void CleanSubSourceList (SubSourcePtr PNTR sspp) +static Uint1 LocationForPlastidText (CharPtr plastid_name) +{ + if (StringICmp (plastid_name, "chloroplast") == 0) { + return GENOME_chloroplast; + } else if (StringICmp (plastid_name, "chromoplast") == 0) { + return GENOME_chromoplast; + } else if (StringICmp (plastid_name, "kinetoplast") == 0) { + return GENOME_kinetoplast; + } else if (StringICmp (plastid_name, "plastid") == 0) { + return GENOME_plastid; + } else if (StringICmp (plastid_name, "apicoplast") == 0) { + return GENOME_apicoplast; + } else if (StringICmp (plastid_name, "leucoplast") == 0) { + return GENOME_leucoplast; + } else if (StringICmp (plastid_name, "proplastid") == 0) { + return GENOME_proplastid; + } else { + return 0; + } +} + + +extern void CleanSubSourceList (SubSourcePtr PNTR sspp, Uint1 location) { Char ch; @@ -5059,6 +5150,17 @@ static void CleanSubSourceList (SubSourcePtr PNTR sspp) ssp->name = NULL; unlink = TRUE; } + } else if (HasNoText (ssp->name) && + ssp->subtype != SUBSRC_germline && + ssp->subtype != SUBSRC_rearranged && + ssp->subtype != SUBSRC_transgenic && + ssp->subtype != SUBSRC_environmental_sample && + ssp->subtype != SUBSRC_metagenomic) { + unlink = TRUE; + } else if (ssp->subtype == SUBSRC_plastid_name && + location != 0 + && location == LocationForPlastidText (ssp->name)) { + unlink = TRUE; } else { last = ssp; } @@ -5412,7 +5514,6 @@ Nlm_QualNameAssoc current_subsource_subtype_alist[] = { {"Map", SUBSRC_map}, {"Metagenomic", SUBSRC_metagenomic}, {"Plasmid-name", SUBSRC_plasmid_name}, - {"Plastid-name", SUBSRC_plastid_name}, {"Pop-variant", SUBSRC_pop_variant}, {"Rearranged", SUBSRC_rearranged}, {"Rev-PCR-primer-name", SUBSRC_rev_primer_name}, @@ -5426,6 +5527,7 @@ Nlm_QualNameAssoc current_subsource_subtype_alist[] = { { NULL, 0 } }; Nlm_QualNameAssoc discouraged_subsource_subtype_alist[] = { + {"Plastid-name", SUBSRC_plastid_name}, { NULL, 0 } }; Nlm_QualNameAssoc discontinued_subsource_subtype_alist[] = { @@ -6673,10 +6775,13 @@ static void NormalizePubdesc (PubdescPtr pdp, Boolean stripSerial, ValNodePtr PN { Char buf1 [121]; Char buf2 [121]; + CitArtPtr cap; CitGenPtr cgp; + CitJourPtr cjp; Boolean fixInitials = TRUE; Boolean hasArt = FALSE; Boolean hasUid = FALSE; + ImprintPtr imp; ValNodePtr next; ValNodePtr PNTR prev; ValNodePtr vnp; @@ -6735,6 +6840,23 @@ static void NormalizePubdesc (PubdescPtr pdp, Boolean stripSerial, ValNodePtr PN buf1 [0] = '\0'; PubLabelUnique (vnp, buf1, sizeof (buf1) - 1, OM_LABEL_CONTENT, TRUE); NormalizeAPub (vnp, stripSerial, fixInitials); + if (vnp->choice == PUB_Article) { + cap = (CitArtPtr) vnp->data.ptrvalue; + if (cap != NULL && cap->from == 1) { + cjp = (CitJourPtr) cap->fromptr; + if (cjp != NULL) { + imp = cjp->imp; + if (imp != NULL) { + if (imp->pubstatus == PUBSTATUS_aheadofprint && imp->prepub != 2) { + imp->prepub = 2; + } + if (imp->pubstatus == PUBSTATUS_epublish && imp->prepub == 2) { + imp->prepub = 0; + } + } + } + } + } if (vnp->choice == PUB_Gen && empty_citgen ((CitGenPtr) vnp->data.ptrvalue)) { *prev = vnp->next; vnp->next = NULL; @@ -7534,11 +7656,13 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP BioSourcePtr biop; Char ch; Uint1 codon [6]; + GBQualPtr gbq; GeneRefPtr grp; ImpFeatPtr ifp; Boolean is_fMet = FALSE; Int2 j; Boolean justTrnaText; + GBQualPtr last; size_t len; CharPtr name; OrgNamePtr onp; @@ -7923,9 +8047,14 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP name++; ch = *name; } + /* if (ch == 's' && StringCmp (name, "s ribosomal RNA") == 0) { *name = 'S'; } + */ + if (ch == 's' && name [1] == ' ') { + *name = 'S'; + } } StrStripSpaces ((CharPtr) rrp->ext.value.ptrvalue); name = (CharPtr) rrp->ext.value.ptrvalue; @@ -8049,6 +8178,41 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); rrp->ext.value.ptrvalue = StringSave ("internal transcribed spacer 3"); } + + name = (CharPtr) rrp->ext.value.ptrvalue; + if (StringNICmp (name, "internal transcribed spacer", 27) == 0) { + gbq = GBQualNew (); + if (gbq != NULL) { + rrp->ext.value.ptrvalue = StringSave ("misc_RNA"); + gbq->qual = StringSave ("product"); + gbq->val = name; + if (sfp->qual == NULL) { + sfp->qual = gbq; + } else { + last = sfp->qual; + while (last->next != NULL) { + last = last->next; + } + last->next = gbq; + } + } + } + + for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { + if (StringCmp (gbq->qual, "product") != 0) continue; + name = gbq->val; + if (StringHasNoText (name)) continue; + if (StringICmp (name, "its1") == 0 || StringICmp (name, "its 1") == 0) { + gbq->val = MemFree (gbq->val); + gbq->val = StringSave ("internal transcribed spacer 1"); + } else if (StringICmp (name, "its2") == 0 || StringICmp (name, "its 2") == 0) { + gbq->val = MemFree (gbq->val); + gbq->val = StringSave ("internal transcribed spacer 2"); + } else if (StringICmp (name, "its3") == 0 || StringICmp (name, "its 3") == 0) { + gbq->val = MemFree (gbq->val); + gbq->val = StringSave ("internal transcribed spacer 3"); + } + } } break; case SEQFEAT_PUB : @@ -8099,8 +8263,8 @@ static void CleanupFeatureStrings (SeqFeatPtr sfp, Boolean stripSerial, ValNodeP CleanVisStringList (&(orp->mod)); OrpModToSubSource (&(orp->mod), &(biop->subtype)); } - CleanSubSourceList (&(biop->subtype)); biop->subtype = SortSubSourceList (biop->subtype); + CleanSubSourceList (&(biop->subtype), biop->genome); break; default : break; @@ -8245,8 +8409,8 @@ static void CleanupDescriptorStrings (ValNodePtr sdp, Boolean stripSerial, ValNo CleanVisStringList (&(orp->mod)); OrpModToSubSource (&(orp->mod), &(biop->subtype)); } - CleanSubSourceList (&(biop->subtype)); biop->subtype = SortSubSourceList (biop->subtype); + CleanSubSourceList (&(biop->subtype), biop->genome); break; case Seq_descr_molinfo : break; @@ -8603,6 +8767,7 @@ static CodeBreakPtr SortCodeBreaks (SeqFeatPtr sfp, CodeBreakPtr list) if (head [i].pos < pos) { out_of_order = TRUE; } + pos = head [i].pos; } if (out_of_order) { @@ -8655,6 +8820,195 @@ static void CleanupDuplicatedCodeBreaks (CodeBreakPtr PNTR prevcbp) } } + +CharPtr ncrnaClassList[] = { +"antisense_RNA", +"autocatalytically_spliced_intron", +"hammerhead_ribozyme", +"RNase_P_RNA", +"RNase_MRP_RNA", +"telomerase_RNA", +"guide_RNA", +"rasiRNA", +"scRNA", +"siRNA", +"miRNA", +"piRNA", +"snoRNA", +"snRNA", +"SRP_RNA", +"vault_RNA", +"Y_RNA", +"other", +NULL}; + +Int4 NcrnaOTHER = sizeof (ncrnaClassList) / sizeof (CharPtr) - 1; + + +extern Boolean IsStringInNcRNAClassList (CharPtr str) +{ + CharPtr PNTR p; + + if (StringHasNoText (str)) return FALSE; + for (p = ncrnaClassList; *p != NULL; p++) + { + if (StringICmp (str, *p) == 0) + { + return TRUE; + } + } + return FALSE; +} + + +static void AddNonCopiedQual (SeqFeatPtr sfp, CharPtr qual, CharPtr class_val) +{ + GBQualPtr gbq; + + if (sfp == NULL || StringHasNoText (qual) || StringHasNoText (class_val)) + { + return; + } + gbq = sfp->qual; + while (gbq != NULL + && (StringCmp (gbq->qual, qual) != 0 + || StringCmp (gbq->val, class_val) != 0)) + { + gbq = gbq->next; + } + if (gbq == NULL) + { + gbq = GBQualNew (); + gbq->qual = StringSave (qual); + gbq->val = StringSave (class_val); + gbq->next = sfp->qual; + sfp->qual = gbq; + } + +} + + +static CharPtr GetMiRNAProduct (CharPtr str) +{ + Int4 len; + CharPtr product = NULL; + + if (StringHasNoText (str)) return NULL; + if (StringNCmp (str, "miRNA ", 6) == 0) + { + product = StringSave (str + 6); + } + else if (StringNCmp (str, "microRNA ", 9) == 0) + { + product = StringSave (str + 9); + } + else + { + len = StringLen (str); + if (len > 6 && StringCmp (str + len - 6, " miRNA") == 0 + && (len < 15 || StringCmp (str - 15, "precursor miRNA") != 0)) + { + product = (CharPtr) MemNew (sizeof (Char) * (len - 5)); + StringNCpy (product, str, len - 6); + product[len - 6] = 0; + } + else if (len > 9 && StringCmp (str + len - 9, " microRNA") == 0 + && (len < 21 || StringCmp (str - 21, "precursor microRNA") != 0)) + { + product = (CharPtr) MemNew (sizeof (Char) * (len - 8)); + StringNCpy (product, str, len - 9); + product[len - 9] = 0; + } + } + return product; +} + + +static Boolean ConvertToNcRNA (SeqFeatPtr sfp) +{ + GBQualPtr gbq; + RnaRefPtr rrp; + Boolean was_converted = FALSE; + CharPtr miRNAproduct = NULL; + + if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA || sfp->data.value.ptrvalue == NULL) + { + return FALSE; + } + rrp = (RnaRefPtr) sfp->data.value.ptrvalue; + if (rrp->type == 5 || rrp->type == 6 || rrp->type == 7) + { + if (rrp->type == 5) + { + AddNonCopiedQual (sfp, "ncRNA_class", "snRNA"); + } + else if (rrp->type == 6) + { + AddNonCopiedQual (sfp, "ncRNA_class", "scRNA"); + } + else if (rrp->type == 7) + { + AddNonCopiedQual (sfp, "ncRNA_class", "snoRNA"); + } + if (rrp->ext.choice == 1) + { + AddNonCopiedQual (sfp, "product", rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + } + rrp->ext.choice = 1; + rrp->ext.value.ptrvalue = StringSave ("ncRNA"); + rrp->type = 255; + was_converted = TRUE; + } + else if (rrp->type == 255 && rrp->ext.choice == 1) + { + if (IsStringInNcRNAClassList (rrp->ext.value.ptrvalue)) + { + AddNonCopiedQual (sfp, "ncRNA_class", rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = StringSave ("ncRNA"); + was_converted = TRUE; + } + else if ((miRNAproduct = GetMiRNAProduct (rrp->ext.value.ptrvalue)) != NULL) + { + AddNonCopiedQual (sfp, "ncRNA_class", "miRNA"); + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = StringSave ("ncRNA"); + AddNonCopiedQual (sfp, "product", miRNAproduct); + miRNAproduct = MemFree (miRNAproduct); + was_converted = TRUE; + } + else if (StringCmp (rrp->ext.value.ptrvalue, "ncRNA") != 0 + && StringCmp (rrp->ext.value.ptrvalue, "tmRNA") != 0 + && StringCmp (rrp->ext.value.ptrvalue, "misc_RNA") != 0) + { + AddNonCopiedQual (sfp, "product", rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = StringSave ("misc_RNA"); + } + } + if (rrp->type == 255 && rrp->ext.choice == 0) { + rrp->ext.choice = 1; + rrp->ext.value.ptrvalue = StringSave ("misc_RNA"); + } + if (rrp->type == 255 && rrp->ext.choice == 1 && + StringCmp (rrp->ext.value.ptrvalue, "misc_RNA") == 0) { + for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { + if (StringCmp (gbq->qual, "ncRNA_class") == 0) { + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = StringSave ("ncRNA"); + was_converted = TRUE; + } else if (StringCmp (gbq->qual, "tag_peptide") == 0) { + rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue); + rrp->ext.value.ptrvalue = StringSave ("tmRNA"); + was_converted = TRUE; + } + } + } + return was_converted; +} + + NLM_EXTERN void CleanUpSeqFeat ( SeqFeatPtr sfp, Boolean isEmblOrDdbj, @@ -8843,6 +9197,9 @@ NLM_EXTERN void CleanUpSeqFeat ( } } } + if (ConvertToNcRNA (sfp)) { + sfp->idx.subtype = FindFeatDefType (sfp); + } } else if (sfp->data.choice == SEQFEAT_REGION || sfp->data.choice == SEQFEAT_SITE || sfp->data.choice == SEQFEAT_BOND || @@ -8926,6 +9283,8 @@ static void BasicSeqEntryCleanupInternal (SeqEntryPtr sep, ValNodePtr PNTR publi case MOLECULE_TYPE_CRNA : case MOLECULE_TYPE_SNORNA : case MOLECULE_TYPE_TRANSCRIBED_RNA : + case MOLECULE_TYPE_NCRNA : + case MOLECULE_TYPE_TMRNA : bsp->mol = Seq_mol_rna; break; case MOLECULE_TYPE_PEPTIDE : |