/* bulk2htgs.c * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information (NCBI) * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government do not place any restriction on its use or reproduction. * We would, however, appreciate having the NCBI and the author cited in * any work or product based on this material * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * =========================================================================== * * File Name: bulk2htgs.c * * Author: Jonathan Kans * * Version Creation Date: 11/2/99 * * $Revision: 6.6 $ * * File Description: * * Modifications: * -------------------------------------------------------------------------- * Date Name Description of modification * ------- ---------- ----------------------------------------------------- * * * ========================================================================== */ #include #include #include #include #include #include #include static void AddBioSourceToBioseq (BioseqPtr bsp, CharPtr organism, BioSourcePtr bio) { BioSourcePtr biop; OrgRefPtr orp; SeqDescrPtr sdp; if (bsp == NULL) return; if (bio != NULL) { biop = AsnIoMemCopy ((Pointer) bio, (AsnReadFunc) BioSourceAsnRead, (AsnWriteFunc) BioSourceAsnWrite); } else { if (StringHasNoText (organism)) return; biop = BioSourceNew (); if (biop == NULL) return; orp = OrgRefNew (); if (orp == NULL) return; biop->org = orp; orp->taxname = StringSave (organism); } sdp = SeqDescrAdd (&(bsp->descr)); if (sdp == NULL) return; sdp->choice = Seq_descr_source; sdp->data.ptrvalue = (Pointer) biop; } static void AddMolInfoToBioseq (BioseqPtr bsp, Boolean is_mrna, Int2 htgs_phase) { MolInfoPtr mip; SeqDescrPtr sdp; if (bsp == NULL) return; mip = MolInfoNew (); if (mip == NULL) return; if (is_mrna) { mip->biomol = MOLECULE_TYPE_MRNA; } else { mip->biomol = MOLECULE_TYPE_GENOMIC; } switch (htgs_phase) { case 0 : mip->tech = MI_TECH_htgs_0; break; case 1 : mip->tech = MI_TECH_htgs_1; break; case 2 : mip->tech = MI_TECH_htgs_2; break; case 3 : mip->tech = MI_TECH_htgs_3; break; default : break; } sdp = SeqDescrAdd (&(bsp->descr)); if (sdp == NULL) return; sdp->choice = Seq_descr_molinfo; sdp->data.ptrvalue = (Pointer) mip; } static void ConvertSeqID (BioseqPtr bsp, CharPtr general, Boolean parse_colon, Boolean id_comment) { Char ch; CharPtr db, id, ptr; DbtagPtr dbt; Char idcom [128], tmp [128]; Boolean justdigits; ObjectIdPtr oip; SeqDescrPtr sdp; SeqIdPtr sip = NULL; long int val; if (bsp == NULL) return; for (sip = bsp->id; sip != NULL && sip->choice != SEQID_LOCAL; sip = sip->next) continue; if (sip == NULL) return; oip = (ObjectIdPtr) sip->data.ptrvalue; if (oip == NULL) return; if (oip->str != NULL) { StringNCpy_0 (tmp, oip->str, sizeof (tmp)); } else { sprintf (tmp, "%ld", (long) oip->id); } /* if colon in localid, parse db and id separately */ ptr = StringChr (tmp, ':'); if (parse_colon && ptr != NULL) { db = tmp; *ptr = '\0'; ptr++; id = ptr; } else { db = NULL; id = tmp; } /* ignore db in localid if general tag passed in */ if (! StringHasNoText (general)) { db = general; } if (StringHasNoText (db) || StringHasNoText (id)) return; dbt = DbtagNew (); if (dbt == NULL) return; /* insert dbtag between seqid and objectid, change choice and objectid */ sip->choice = SEQID_GENERAL; sip->data.ptrvalue = (Pointer) dbt; dbt->db = StringSave (db); dbt->tag = oip; oip->str = MemFree (oip->str); for (justdigits = TRUE, ptr = id, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) { if (ch == ' ' || ch == '+' || ch == '-') { } else if (! IS_DIGIT (ch)) { justdigits = FALSE; } } if (justdigits && sscanf (id, "%ld", &val) == 1) { oip->id = (Int4) val; } else { oip->str = StringSave (id); } if (id_comment) { if (oip->str != NULL) { StringNCpy_0 (tmp, oip->str, sizeof (tmp)); } else { sprintf (tmp, "%ld", oip->id); } sprintf (idcom, "This sequence was identified as %s by the submitter", tmp); sdp = SeqDescrAdd (&(bsp->descr)); if (sdp != NULL) { sdp->choice = Seq_descr_comment; sdp->data.ptrvalue = (Pointer) StringSave (idcom); } } SeqMgrReplaceInBioseqIndex (bsp); } static void ProcessOneRecord (SeqSubmitPtr ssp, CharPtr organism, BioSourcePtr biop, CharPtr general, FILE* ofp, Boolean is_mrna, Int2 htgs_phase, Boolean parse_colon, Boolean id_comment, CharPtr comment, Uint2 datatype, Pointer dataptr) { AsnIoPtr aip; BioseqPtr bsp; Int4 pos; SeqDescrPtr sdp; SeqEntryPtr sep; if (ssp == NULL || ofp == NULL) return; if (organism == NULL && biop == NULL) return; if (datatype != OBJ_BIOSEQ) return; bsp = (BioseqPtr) dataptr; if (bsp == NULL) return; sep = SeqMgrGetSeqEntryForData (bsp); if (sep == NULL) return; AddBioSourceToBioseq (bsp, organism, biop); AddMolInfoToBioseq (bsp, is_mrna, htgs_phase); if (is_mrna) { bsp->mol = Seq_mol_rna; } else { bsp->mol = Seq_mol_dna; } ConvertSeqID (bsp, general, parse_colon, id_comment); if (! StringHasNoText (comment)) { sdp = SeqDescrAdd (&(bsp->descr)); if (sdp != NULL) { sdp->choice = Seq_descr_comment; sdp->data.ptrvalue = (Pointer) StringSave (comment); } } sdp = SeqDescrAdd (&(bsp->descr)); if (sdp != NULL) { sdp->choice = Seq_descr_create_date; sdp->data.ptrvalue = (Pointer) DateCurr (); } ssp->data = sep; ssp->datatype = 1; aip = AsnIoNew (ASNIO_TEXT_OUT, ofp, NULL, NULL, NULL); SeqSubmitAsnWrite (ssp, aip, NULL); pos = AsnIoTell (aip); AsnIoFree (aip, FALSE); fseek (ofp, pos, SEEK_SET); fprintf (ofp, "\n"); ssp->data = NULL; SeqEntryFree (sep); } static BioSourcePtr ReadBioSource (CharPtr path) { AsnIoPtr aip; BioSourcePtr biop = NULL; aip = AsnIoOpen (path, "r"); if (aip == NULL) return NULL; biop = BioSourceAsnRead (aip, NULL); AsnIoClose (aip); return biop; } /* template file can contain either Seq-submit or Submit-block */ static SeqSubmitPtr ReadSubmitBlock (CharPtr path) { CitSubPtr csp; Pointer dataptr; Uint2 datatype; FILE *fp; SubmitBlockPtr sbp = NULL; SeqSubmitPtr ssp = NULL; fp = FileOpen (path, "r"); if (fp == NULL) return NULL; dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE); FileClose (fp); switch (datatype) { case OBJ_SUBMIT_BLOCK : sbp = (SubmitBlockPtr) dataptr; ssp = SeqSubmitNew (); if (ssp != NULL) { ssp->sub = sbp; } break; case OBJ_SEQSUB : ssp = (SeqSubmitPtr) dataptr; if (ssp != NULL) { sbp = ssp->sub; } break; default : break; } if (sbp != NULL) { csp = sbp->cit; if (csp != NULL) { csp->date = DateFree (csp->date); csp->date = DateCurr (); } } return ssp; } Args myargs [] = { {"Filename for FASTA input", "stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, {"Filename for Seq-submit template", NULL, NULL, NULL, FALSE, 't', ARG_FILE_IN, 0.0, 0, NULL}, {"Filename for ASN.1 output", "stdout", NULL, NULL, FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, {"Organism name", NULL, NULL, NULL, TRUE, 'n', ARG_STRING, 0.0, 0, NULL}, {"General ID tag", NULL, NULL, NULL, TRUE, 'g', ARG_STRING, 0.0, 0, NULL}, {"Sequences are mRNA?", "F", NULL, NULL, TRUE, 'm', ARG_BOOLEAN, 0.0, 0, NULL}, {"HTGS phase?", "1", "0" ,"3", FALSE, 'p', ARG_INT, 0.0, 0, NULL}, {"Parse colon in tag", "F", NULL, NULL, TRUE, 'd', ARG_BOOLEAN, 0.0, 0, NULL}, {"Comment", NULL, NULL, NULL, TRUE, 'c', ARG_STRING, 0.0, 0, NULL}, {"Filename for BioSource", NULL, NULL, NULL, TRUE, 'b', ARG_FILE_IN, 0.0, 0, NULL}, {"Make identifier comment", "F", NULL, NULL, TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL}, }; Int2 Main (void) { BioSourcePtr biop = NULL; Pointer dataptr; Uint2 datatype; CharPtr fasta_fname, template_fname, output_fname, organism, generalid, comment, biosource_fname; Int2 htgs_phase; FILE *ifp, *ofp; Boolean id_comment, is_mrna, parse_colon; SeqSubmitPtr ssp; ErrSetFatalLevel (SEV_FATAL); ErrClearOptFlags (EO_SHOW_USERSTR); UseLocalAsnloadDataAndErrMsg (); ErrPathReset (); if (! AllObjLoad ()) { Message (MSG_FATAL, "AllObjLoad failed"); return 1; } if (! SubmitAsnLoad ()) { Message (MSG_FATAL, "SubmitAsnLoad failed"); return 1; } if (! SeqCodeSetLoad ()) { Message (MSG_FATAL, "SeqCodeSetLoad failed"); return 1; } if (! GeneticCodeTableLoad ()) { Message (MSG_FATAL, "GeneticCodeTableLoad failed"); return 1; } if (! GetArgs ("bulk2htgs", sizeof (myargs) / sizeof (Args), myargs)) { return 0; } fasta_fname = myargs [0].strvalue; template_fname = myargs [1].strvalue; output_fname = myargs [2].strvalue; organism = myargs [3].strvalue; generalid = myargs [4].strvalue; is_mrna = (Boolean) myargs [5].intvalue; htgs_phase = (Int2) myargs [6].intvalue; parse_colon = (Boolean) myargs [7].intvalue; comment = myargs [8].strvalue; biosource_fname = myargs [9].strvalue; id_comment = (Boolean) myargs [10].intvalue; if (StringHasNoText (output_fname)) { Message (MSG_FATAL, "Unable to open output file"); return 1; } ssp = ReadSubmitBlock (template_fname); if (ssp == NULL) { Message (MSG_FATAL, "Unable to read template file"); return 1; } ssp->datatype = 1; if (! StringHasNoText (biosource_fname)) { biop = ReadBioSource (biosource_fname); if (biop == NULL) { Message (MSG_FATAL, "Unable to read BioSource file"); return 1; } } if (biop == NULL && StringHasNoText (organism)) { Message (MSG_FATAL, "Organism name or BioSource file is required for processing"); return 1; } ifp = FileOpen (fasta_fname, "r"); if (ifp == NULL) { Message (MSG_FATAL, "Unable to open input file"); return 1; } ofp = FileOpen (output_fname, "w"); if (ofp == NULL) { Message (MSG_FATAL, "Unable to create output file"); return 1; } while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) { ProcessOneRecord (ssp, organism, biop, generalid, ofp, is_mrna, htgs_phase, parse_colon, id_comment, comment, datatype, dataptr); } FileClose (ofp); FileClose (ifp); BioSourceFree (biop); SeqSubmitFree (ssp); return 0; }