Load ncbi (6.1.20020828) into ncbi-tools6/branches/upstream/current.

author: Aaron M. Ucko <ucko@debian.org> 2005-03-23 20:49:08 +0000
committer: Aaron M. Ucko <ucko@debian.org> 2005-03-23 20:49:08 +0000
commit: ee1ab2cbbf85d439732174f321efc1114f19f749 (patch)
tree: 4c803451e8507be875a478b39bdd31702f0ea281 /asn
parent: c36b9906c3ef791147b3643f9e485cc02568819f (diff)
9 files changed, 245 insertions, 14 deletions
diff --git a/asn/asn.all b/asn/asn.all
index 1b3b2e5d..f1c2d87b 100644
--- a/asn/asn.all
+++ b/asn/asn.all
@@ -8,7 +8,7 @@
 --
 --**********************************************************************
 
---$Revision: 6.3 $
+--$Revision: 6.4 $
 --**********************************************************************
 --
 --  NCBI General Data elements
@@ -97,7 +97,8 @@ Person-id ::= CHOICE {
     name Name-std ,             -- structured name
     ml VisibleString ,          -- MEDLINE name (semi-structured)
                                 --    eg. "Jones RM"
-    str VisibleString }         -- unstructured name
+    str VisibleString,          -- unstructured name
+    consortium VisibleString }  -- consortium name
 
 Name-std ::= SEQUENCE { -- Structured names
     last VisibleString ,
@@ -737,7 +738,7 @@ Seq-loc-equiv ::= SET OF Seq-loc      -- for a set of equivalent locations
 END
     
 
---$Revision: 6.5 $
+--$Revision: 6.6 $
 --**********************************************************************
 --
 --  NCBI Sequence elements
@@ -866,13 +867,21 @@ MolInfo ::= SEQUENCE {
         other (255) }           -- use Source.techexp
                DEFAULT unknown ,
     techexp VisibleString OPTIONAL ,   -- explanation if tech not enough
+    --
+    -- Completeness is not indicated in most records.  For genomes, assume
+    -- the sequences are incomplete unless specifically marked as complete.
+    -- For mRNAs, assume the ends are not known exactly unless marked as
+    -- having the left or right end.
+    --
     completeness INTEGER {
       unknown (0) ,
       complete (1) ,                   -- complete biological entity
       partial (2) ,                    -- partial but no details given
-      no-left (3),                     -- missing 5' or NH3 end
+      no-left (3) ,                    -- missing 5' or NH3 end
       no-right (4) ,                   -- missing 3' or COOH end
       no-ends (5) ,                    -- missing both ends
+      has-left (6) ,                   -- 5' or NH3 end present
+      has-right (7) ,                  -- 3' or COOH end present
       other (255) } DEFAULT unknown }
 
 
@@ -1501,7 +1510,7 @@ PDB-replace ::= SEQUENCE {
 
 END
 
---$Revision: 6.7 $
+--$Revision: 6.9 $
 --**********************************************************************
 --
 --  NCBI Sequence Feature elements
@@ -1781,7 +1790,8 @@ Gene-ref ::= SEQUENCE {
     maploc VisibleString OPTIONAL ,       -- descriptive map location
     pseudo BOOLEAN DEFAULT FALSE ,        -- pseudogene
     db SET OF Dbtag OPTIONAL ,            -- ids in other dbases
-    syn SET OF VisibleString OPTIONAL }   -- synonyms for locus
+    syn SET OF VisibleString OPTIONAL ,   -- synonyms for locus
+    locus-tag VisibleString OPTIONAL }    -- systematic gene name (e.g., MI0001, ORF0069)
 
 END
 
@@ -1864,6 +1874,9 @@ OrgMod ::= SEQUENCE {
         anamorph (29) ,
         teleomorph (30) ,
         breed (31) ,
+        gb-acronym (32) ,       -- used by taxonomy database
+        gb-anamorph (33) ,      -- used by taxonomy database
+        gb-synonym (34) ,       -- used by taxonomy database
         old-lineage (253) ,
         old-name (254) ,
         other (255) } ,         -- ASN5: old-name (254) will be added to next spec
diff --git a/asn/asnpub.all b/asn/asnpub.all
index 94f27044..7cd8735f 100644
--- a/asn/asnpub.all
+++ b/asn/asnpub.all
@@ -1,4 +1,4 @@
---$Revision: 6.3 $
+--$Revision: 6.4 $
 --**********************************************************************
 --
 --  NCBI General Data elements
@@ -87,7 +87,8 @@ Person-id ::= CHOICE {
     name Name-std ,             -- structured name
     ml VisibleString ,          -- MEDLINE name (semi-structured)
                                 --    eg. "Jones RM"
-    str VisibleString }         -- unstructured name
+    str VisibleString,          -- unstructured name
+    consortium VisibleString }  -- consortium name
 
 Name-std ::= SEQUENCE { -- Structured names
     last VisibleString ,
diff --git a/asn/gbseq.asn b/asn/gbseq.asn
new file mode 100644
index 00000000..be39c295
--- /dev/null
+++ b/asn/gbseq.asn
@@ -0,0 +1,166 @@
+--$Revision: 6.3 $
+--*********************************************************
+--
+-- ASN.1 and XML for the components of a GenBank format sequence
+-- J.Ostell 2002
+--
+--*********************************************************
+
+NCBI-GBSeq DEFINITIONS ::=
+BEGIN
+
+--********
+--  GBSeq represents the elements in a GenBank style report
+--    of a sequence with some small additions to structure and support
+--    for protein (GenPept) versions of GenBank format as seen in
+--    Entrez. While this represents the simplification, reduction of
+--    detail, and flattening to a single sequence perspective of GenBank
+--    format (compared with the full ASN.1 or XML from which GenBank and
+--    this format are derived at NCBI), it is presented in ASN.1 or XML for
+--    automated parsing and processing. It is hoped that this compromise
+--    will be useful for those bulk processing at the GenBank format level
+--    of detail today. Since it is a compromise, a number of pragmatic
+--    decisions have been made.
+--
+--  In pursuit of simplicity and familiarity a number of
+--    fields do not have full substructure defined here where there is
+--    already a standard GenBank format string. For example:
+--
+--    Date  DD-Mon-YYYY
+--    Authors   LastName, Initials (with periods)
+--   Journal   JournalName Volume (issue), page-range (year)
+--   FeatureLocations as per GenBank feature table, but FeatureIntervals
+--    may also be provided as a convenience
+--   FeatureQualifiers  as per GenBank feature table
+--   Primary has a string that represents a table to construct
+--    a third party (TPA) sequence.
+--   other-seqids can have strings with the "vertical bar format" sequence
+--    identifiers used in BLAST for example, when they are non-genbank types.
+--    Currently in GenBank format you only see GI, but there are others, like
+--    patents, submitter clone names, etc which will appear here, as they
+--    always have in the ASN.1 format, and full XML format.
+--   source-db is a formatted text block for peptides in GenPept format that
+--    carries information from the source protein database.
+--
+--  There are also a number of elements that could have been
+--   more exactly specified, but in the interest of simplicity
+--   have been simply left as options. For example:
+--
+--  accession and accession.version will always appear in a GenBank record
+--   they are optional because this format can also be used for non-GenBank
+--   sequences, and in that case will have only "other-seqids".
+--
+--  sequences will normally all have "sequence" filled in. But contig records
+--    will have a "join" statement in the "contig" slot, and no "sequence".
+--    We also may consider a retrieval option with no sequence of any kind
+--     and no feature table to quickly check minimal values.
+--
+--  a reference may have an author list, or be from a consortium, or both.
+--
+--  some fields, such as taxonomy, do appear as separate elements in GenBank
+--    format but without a specific linetype (in GenBank format this comes
+--    under ORGANISM). Another example is the separation of primary accession
+--    from the list of secondary accessions. In GenBank format primary
+--    accession is just the first one on the list that includes all secondaries
+--    after it.
+--
+--  create-date deserves special comment. The date you see on the right hand
+--    side of the LOCUS line in GenBank format is actually the last date the
+--    record was modified (or the update-date). The date the record was
+--    first submitted to GenBank appears in the first submission citation in
+--    the reference section. Internally in the databases and ASN.1 NCBI keeps
+--    the first date the record was released into the sequence database at
+--    NCBI as create-date. For records from EMBL, which supports create-date,
+--    it is the date provided by EMBL. For DDBJ records, which do not supply
+--    a create-date (same as GenBank format) the create-date is the first date
+--    NCBI saw the record from DDBJ. For older GenBank records, before NCBI
+--    took responsibility for GenBank, it is just the first date NCBI saw the
+--    record. Create-date can be very useful, so we expose it here, but users
+--    must understand it is only an approximation and comes from many sources,
+--    and with many exceptions and caveats. It does NOT tell you the first
+--    date the public might have seen this record and thus is NOT an accurate
+--    measure for legal issues of precedence.
+--
+--********
+
+GBSeq ::= SEQUENCE {
+	locus VisibleString ,
+	length INTEGER ,
+	strandedness INTEGER {
+		not-set (0) ,
+		single-stranded (1) ,
+		double-stranded (2) ,
+		mixed-stranded (3) } DEFAULT not-set ,
+	moltype INTEGER {
+		nucleic-acid (0) ,
+		dna (1) ,
+		rna (2) ,
+		trna (3) ,
+		rrna (4) ,
+		mrna (5) ,
+		urna (6) ,
+		snrna (7) ,
+		snorna (8) ,
+		peptide (9) } DEFAULT nucleic-acid ,
+	topology INTEGER {
+		linear (1) ,
+		circular (2) } DEFAULT linear ,
+	division VisibleString ,
+	update-date VisibleString ,
+	create-date VisibleString ,
+	definition VisibleString ,
+	primary-accession VisibleString OPTIONAL ,
+	accession-version VisibleString OPTIONAL ,
+	other-seqids SEQUENCE OF Seqid OPTIONAL ,
+	secondary-accessions SEQUENCE OF Secondary-accession OPTIONAL,
+	keywords SEQUENCE OF Keyword OPTIONAL ,
+	segment VisibleString OPTIONAL ,
+	source VisibleString ,
+	organism VisibleString ,
+	taxonomy VisibleString ,
+	references SEQUENCE OF GBReference ,
+	comment VisibleString OPTIONAL ,
+	primary VisibleString OPTIONAL ,
+	source-db VisibleString OPTIONAL ,
+	feature-table SEQUENCE OF GBFeature OPTIONAL ,
+	sequence VisibleString OPTIONAL ,  -- Optional for other dump forms
+	contig VisibleString OPTIONAL }
+
+	Secondary-accession ::= VisibleString
+
+	Seqid ::= VisibleString
+
+	Keyword ::= VisibleString
+
+	GBReference ::= SEQUENCE {
+		reference VisibleString ,
+		authors SEQUENCE OF Author OPTIONAL ,
+		consortium VisibleString OPTIONAL ,
+		title VisibleString OPTIONAL ,
+		journal VisibleString ,
+		medline INTEGER OPTIONAL ,
+		pubmed INTEGER OPTIONAL ,
+		remark VisibleString OPTIONAL }
+
+	Author ::= VisibleString
+
+	GBFeature ::= SEQUENCE {
+		key VisibleString ,
+		location VisibleString ,
+		intervals SEQUENCE OF GBInterval OPTIONAL ,
+		quals SEQUENCE OF GBQualifier OPTIONAL }
+
+	GBInterval ::= SEQUENCE {
+		from INTEGER OPTIONAL ,
+		to INTEGER OPTIONAL ,
+		point INTEGER OPTIONAL ,
+		accession VisibleString }
+
+	GBQualifier ::= SEQUENCE {
+		name VisibleString ,
+		value VisibleString OPTIONAL }
+
+	GBSet ::= SEQUENCE OF GBSeq
+		
+END
+
diff --git a/asn/general.asn b/asn/general.asn
index 4fbd99b5..654fff8d 100644
--- a/asn/general.asn
+++ b/asn/general.asn
@@ -1,4 +1,4 @@
---$Revision: 6.3 $
+--$Revision: 6.4 $
 --**********************************************************************
 --
 --  NCBI General Data elements
@@ -87,7 +87,8 @@ Person-id ::= CHOICE {
     name Name-std ,             -- structured name
     ml VisibleString ,          -- MEDLINE name (semi-structured)
                                 --    eg. "Jones RM"
-    str VisibleString }         -- unstructured name
+    str VisibleString,          -- unstructured name
+    consortium VisibleString }  -- consortium name
 
 Name-std ::= SEQUENCE { -- Structured names
     last VisibleString ,
diff --git a/asn/makestat.bat b/asn/makestat.bat
index b8f88ffc..4fbe8342 100644
--- a/asn/makestat.bat
+++ b/asn/makestat.bat
@@ -38,3 +38,5 @@ asntool -m ..\asn\medlars.asn -o asnmdrs.h
 asntool -m ..\asn\proj.asn -o asnproj.h
 asntool -m ..\access\entrez2.asn -o asnent2.h
 asntool -m ..\access\mim.asn -o asnmim.h
+asntool -m ..\asn\gbseq.asn -o asngbseq.h
+asntool -m ..\asn\tinyseq.asn -o asntseq.h
diff --git a/asn/makestat.unx b/asn/makestat.unx
index f8fdcd5c..728499d7 100755
--- a/asn/makestat.unx
+++ b/asn/makestat.unx
@@ -38,3 +38,5 @@ asntool -m ../asn/medlars.asn -o asnmdrs.h
 asntool -m ../asn/proj.asn -o asnproj.h
 asntool -m ../access/entrez2.asn -o asnent2.h
 asntool -m ../access/mim.asn -o asnmim.h
+asntool -m ../asn/gbseq.asn -o asngbseq.h
+asntool -m ../asn/tinyseq.asn -o asntseq.h
diff --git a/asn/seq.asn b/asn/seq.asn
index 49c55d0d..0c397967 100644
--- a/asn/seq.asn
+++ b/asn/seq.asn
@@ -1,4 +1,4 @@
---$Revision: 6.5 $
+--$Revision: 6.6 $
 --**********************************************************************
 --
 --  NCBI Sequence elements
@@ -127,13 +127,21 @@ MolInfo ::= SEQUENCE {
         other (255) }           -- use Source.techexp
                DEFAULT unknown ,
     techexp VisibleString OPTIONAL ,   -- explanation if tech not enough
+    --
+    -- Completeness is not indicated in most records.  For genomes, assume
+    -- the sequences are incomplete unless specifically marked as complete.
+    -- For mRNAs, assume the ends are not known exactly unless marked as
+    -- having the left or right end.
+    --
     completeness INTEGER {
       unknown (0) ,
       complete (1) ,                   -- complete biological entity
       partial (2) ,                    -- partial but no details given
-      no-left (3),                     -- missing 5' or NH3 end
+      no-left (3) ,                    -- missing 5' or NH3 end
       no-right (4) ,                   -- missing 3' or COOH end
       no-ends (5) ,                    -- missing both ends
+      has-left (6) ,                   -- 5' or NH3 end present
+      has-right (7) ,                  -- 3' or COOH end present
       other (255) } DEFAULT unknown }
 
 
diff --git a/asn/seqfeat.asn b/asn/seqfeat.asn
index 8d796562..7fff56e9 100644
--- a/asn/seqfeat.asn
+++ b/asn/seqfeat.asn
@@ -1,4 +1,4 @@
---$Revision: 6.7 $
+--$Revision: 6.9 $
 --**********************************************************************
 --
 --  NCBI Sequence Feature elements
@@ -278,7 +278,8 @@ Gene-ref ::= SEQUENCE {
     maploc VisibleString OPTIONAL ,       -- descriptive map location
     pseudo BOOLEAN DEFAULT FALSE ,        -- pseudogene
     db SET OF Dbtag OPTIONAL ,            -- ids in other dbases
-    syn SET OF VisibleString OPTIONAL }   -- synonyms for locus
+    syn SET OF VisibleString OPTIONAL ,   -- synonyms for locus
+    locus-tag VisibleString OPTIONAL }    -- systematic gene name (e.g., MI0001, ORF0069)
 
 END
 
@@ -361,6 +362,9 @@ OrgMod ::= SEQUENCE {
         anamorph (29) ,
         teleomorph (30) ,
         breed (31) ,
+        gb-acronym (32) ,       -- used by taxonomy database
+        gb-anamorph (33) ,      -- used by taxonomy database
+        gb-synonym (34) ,       -- used by taxonomy database
         old-lineage (253) ,
         old-name (254) ,
         other (255) } ,         -- ASN5: old-name (254) will be added to next spec
diff --git a/asn/tinyseq.asn b/asn/tinyseq.asn
new file mode 100644
index 00000000..d16a2949
--- /dev/null
+++ b/asn/tinyseq.asn
@@ -0,0 +1,34 @@
+--$Revision: 6.1 $
+--**********************************************************************
+--
+--  ASN.1 for a tiny Bioseq in XML
+--    basically a structured FASTA file with a few extras
+--    in this case we drop all modularity of components
+--      All ids are Optional - simpler structure, less checking
+--      Components of organism are hard coded - can't easily add or change
+--      sequence is just string whether DNA or protein
+--  by James Ostell, 2000
+--
+--**********************************************************************
+
+NCBI-TSeq DEFINITIONS ::=
+BEGIN
+
+TSeq ::= SEQUENCE {
+	seqtype ENUMERATED {
+		nucleotide (1),
+		protein (2) },
+	gi INTEGER OPTIONAL,
+	accver VisibleString OPTIONAL,
+	sid VisibleString OPTIONAL,
+	local VisibleString OPTIONAL,
+	taxid INTEGER OPTIONAL,
+	orgname VisibleString OPTIONAL,
+	defline VisibleString,
+	length INTEGER,
+	sequence VisibleString }
+
+TSeqSet ::= SEQUENCE OF TSeq    -- a bunch of them
+
+END
+
author	Aaron M. Ucko <ucko@debian.org>	2005-03-23 20:49:08 +0000
committer	Aaron M. Ucko <ucko@debian.org>	2005-03-23 20:49:08 +0000
commit	ee1ab2cbbf85d439732174f321efc1114f19f749 (patch)
tree	4c803451e8507be875a478b39bdd31702f0ea281 /asn
parent	c36b9906c3ef791147b3643f9e485cc02568819f (diff)