diff options
author | Aaron M. Ucko <ucko@debian.org> | 2005-03-23 20:49:08 +0000 |
---|---|---|
committer | Aaron M. Ucko <ucko@debian.org> | 2005-03-23 20:49:08 +0000 |
commit | ee1ab2cbbf85d439732174f321efc1114f19f749 (patch) | |
tree | 4c803451e8507be875a478b39bdd31702f0ea281 /asn | |
parent | c36b9906c3ef791147b3643f9e485cc02568819f (diff) |
Load ncbi (6.1.20020828) into ncbi-tools6/branches/upstream/current.
Diffstat (limited to 'asn')
-rw-r--r-- | asn/asn.all | 25 | ||||
-rw-r--r-- | asn/asnpub.all | 5 | ||||
-rw-r--r-- | asn/gbseq.asn | 166 | ||||
-rw-r--r-- | asn/general.asn | 5 | ||||
-rw-r--r-- | asn/makestat.bat | 2 | ||||
-rwxr-xr-x | asn/makestat.unx | 2 | ||||
-rw-r--r-- | asn/seq.asn | 12 | ||||
-rw-r--r-- | asn/seqfeat.asn | 8 | ||||
-rw-r--r-- | asn/tinyseq.asn | 34 |
9 files changed, 245 insertions, 14 deletions
diff --git a/asn/asn.all b/asn/asn.all index 1b3b2e5d..f1c2d87b 100644 --- a/asn/asn.all +++ b/asn/asn.all @@ -8,7 +8,7 @@ -- --********************************************************************** ---$Revision: 6.3 $ +--$Revision: 6.4 $ --********************************************************************** -- -- NCBI General Data elements @@ -97,7 +97,8 @@ Person-id ::= CHOICE { name Name-std , -- structured name ml VisibleString , -- MEDLINE name (semi-structured) -- eg. "Jones RM" - str VisibleString } -- unstructured name + str VisibleString, -- unstructured name + consortium VisibleString } -- consortium name Name-std ::= SEQUENCE { -- Structured names last VisibleString , @@ -737,7 +738,7 @@ Seq-loc-equiv ::= SET OF Seq-loc -- for a set of equivalent locations END ---$Revision: 6.5 $ +--$Revision: 6.6 $ --********************************************************************** -- -- NCBI Sequence elements @@ -866,13 +867,21 @@ MolInfo ::= SEQUENCE { other (255) } -- use Source.techexp DEFAULT unknown , techexp VisibleString OPTIONAL , -- explanation if tech not enough + -- + -- Completeness is not indicated in most records. For genomes, assume + -- the sequences are incomplete unless specifically marked as complete. + -- For mRNAs, assume the ends are not known exactly unless marked as + -- having the left or right end. 
+ -- completeness INTEGER { unknown (0) , complete (1) , -- complete biological entity partial (2) , -- partial but no details given - no-left (3), -- missing 5' or NH3 end + no-left (3) , -- missing 5' or NH3 end no-right (4) , -- missing 3' or COOH end no-ends (5) , -- missing both ends + has-left (6) , -- 5' or NH3 end present + has-right (7) , -- 3' or COOH end present other (255) } DEFAULT unknown } @@ -1501,7 +1510,7 @@ PDB-replace ::= SEQUENCE { END ---$Revision: 6.7 $ +--$Revision: 6.9 $ --********************************************************************** -- -- NCBI Sequence Feature elements @@ -1781,7 +1790,8 @@ Gene-ref ::= SEQUENCE { maploc VisibleString OPTIONAL , -- descriptive map location pseudo BOOLEAN DEFAULT FALSE , -- pseudogene db SET OF Dbtag OPTIONAL , -- ids in other dbases - syn SET OF VisibleString OPTIONAL } -- synonyms for locus + syn SET OF VisibleString OPTIONAL , -- synonyms for locus + locus-tag VisibleString OPTIONAL } -- systematic gene name (e.g., MI0001, ORF0069) END @@ -1864,6 +1874,9 @@ OrgMod ::= SEQUENCE { anamorph (29) , teleomorph (30) , breed (31) , + gb-acronym (32) , -- used by taxonomy database + gb-anamorph (33) , -- used by taxonomy database + gb-synonym (34) , -- used by taxonomy database old-lineage (253) , old-name (254) , other (255) } , -- ASN5: old-name (254) will be added to next spec diff --git a/asn/asnpub.all b/asn/asnpub.all index 94f27044..7cd8735f 100644 --- a/asn/asnpub.all +++ b/asn/asnpub.all @@ -1,4 +1,4 @@ ---$Revision: 6.3 $ +--$Revision: 6.4 $ --********************************************************************** -- -- NCBI General Data elements @@ -87,7 +87,8 @@ Person-id ::= CHOICE { name Name-std , -- structured name ml VisibleString , -- MEDLINE name (semi-structured) -- eg. 
"Jones RM" - str VisibleString } -- unstructured name + str VisibleString, -- unstructured name + consortium VisibleString } -- consortium name Name-std ::= SEQUENCE { -- Structured names last VisibleString , diff --git a/asn/gbseq.asn b/asn/gbseq.asn new file mode 100644 index 00000000..be39c295 --- /dev/null +++ b/asn/gbseq.asn @@ -0,0 +1,166 @@ +--$Revision: 6.3 $ +--********************************************************* +-- +-- ASN.1 and XML for the components of a GenBank format sequence +-- J.Ostell 2002 +-- +--********************************************************* + +NCBI-GBSeq DEFINITIONS ::= +BEGIN + +--******** +-- GBSeq represents the elements in a GenBank style report +-- of a sequence with some small additions to structure and support +-- for protein (GenPept) versions of GenBank format as seen in +-- Entrez. While this represents the simplification, reduction of +-- detail, and flattening to a single sequence perspective of GenBank +-- format (compared with the full ASN.1 or XML from which GenBank and +-- this format is derived at NCBI), it is presented in ASN.1 or XML for +-- automated parsing and processing. It is hoped that this compromise +-- will be useful for those bulk processing at the GenBank format level +-- of detail today. Since it is a compromise, a number of pragmatic +-- decisions have been made. +-- +-- In pursuit of simplicity and familiarity a number of +-- fields do not have full substructure defined here where there is +-- already a standard GenBank format string. For example: +-- +-- Date DD-Mon-YYYY +-- Authors LastName, Initials (with periods) +-- Journal JournalName Volume (issue), page-range (year) +-- FeatureLocations as per GenBank feature table, but FeatureIntervals +-- may also be provided as a convenience +-- FeatureQualifiers as per GenBank feature table +-- Primary has a string that represents a table to construct +-- a third party (TPA) sequence. 
+-- other-seqids can have strings with the "vertical bar format" sequence +-- identifiers used in BLAST for example, when they are non-genbank types. +-- Currently in GenBank format you only see GI, but there are others, like +-- patents, submitter clone names, etc which will appear here, as they +-- always have in the ASN.1 format, and full XML format. +-- source-db is a formatted text block for peptides in GenPept format that +-- carries information from the source protein database. +-- +-- There are also a number of elements that could have been +-- more exactly specified, but in the interest of simplicity +-- have been simply left as options. For example: +-- +-- accession and accession.version will always appear in a GenBank record +-- they are optional because this format can also be used for non-GenBank +-- sequences, and in that case will have only "other-seqids". +-- +-- sequences will normally all have "sequence" filled in. But contig records +-- will have a "join" statement in the "contig" slot, and no "sequence". +-- We also may consider a retrieval option with no sequence of any kind +-- and no feature table to quickly check minimal values. +-- +-- a reference may have an author list, or be from a consortium, or both. +-- +-- some fields, such as taxonomy, do appear as separate elements in GenBank +-- format but without a specific linetype (in GenBank format this comes +-- under ORGANISM). Another example is the separation of primary accession +-- from the list of secondary accessions. In GenBank format primary +-- accession is just the first one on the list that includes all secondaries +-- after it. +-- +-- create-date deserves special comment. The date you see on the right hand +-- side of the LOCUS line in GenBank format is actually the last date +-- the record was modified (or the update-date). The date the record was +-- first submitted to GenBank appears in the first submission citation in +-- the reference section. 
Internally in the databases and ASN.1 NCBI keeps +-- the first date the record was released into the sequence database at +-- NCBI as create-date. For records from EMBL, which supports create-date, +-- it is the date provided by EMBL. For DDBJ records, which do not supply +-- a create-date (same as GenBank format) the create-date is the first date +-- NCBI saw the record from DDBJ. For older GenBank records, before NCBI +-- took responsibility for GenBank, it is just the first date NCBI saw the +-- record. Create-date can be very useful, so we expose it here, but users +-- must understand it is only an approximation and comes from many sources, +-- and with many exceptions and caveats. It does NOT tell you the first +-- date the public might have seen this record and thus is NOT an accurate +-- measure for legal issues of precedence. +-- +--******** + +GBSeq ::= SEQUENCE { + locus VisibleString , + length INTEGER , + strandedness INTEGER { + not-set (0) , + single-stranded (1) , + double-stranded (2) , + mixed-stranded (3) } DEFAULT not-set , + moltype INTEGER { + nucleic-acid (0) , + dna (1) , + rna (2) , + trna (3) , + rrna (4) , + mrna (5) , + urna (6) , + snrna (7) , + snorna (8) , + peptide (9) } DEFAULT nucleic-acid , + topology INTEGER { + linear (1) , + circular (2) } DEFAULT linear , + division VisibleString , + update-date VisibleString , + create-date VisibleString , + definition VisibleString , + primary-accession VisibleString OPTIONAL , + accession-version VisibleString OPTIONAL , + other-seqids SEQUENCE OF Seqid OPTIONAL , + secondary-accessions SEQUENCE OF Secondary-accession OPTIONAL, + keywords SEQUENCE OF Keyword OPTIONAL , + segment VisibleString OPTIONAL , + source VisibleString , + organism VisibleString , + taxonomy VisibleString , + references SEQUENCE OF GBReference , + comment VisibleString OPTIONAL , + primary VisibleString OPTIONAL , + source-db VisibleString OPTIONAL , + feature-table SEQUENCE OF GBFeature OPTIONAL , + sequence 
VisibleString OPTIONAL , -- Optional for other dump forms + contig VisibleString OPTIONAL } + + Secondary-accession ::= VisibleString + + Seqid ::= VisibleString + + Keyword ::= VisibleString + + GBReference ::= SEQUENCE { + reference VisibleString , + authors SEQUENCE OF Author OPTIONAL , + consortium VisibleString OPTIONAL , + title VisibleString OPTIONAL , + journal VisibleString , + medline INTEGER OPTIONAL , + pubmed INTEGER OPTIONAL , + remark VisibleString OPTIONAL } + + Author ::= VisibleString + + GBFeature ::= SEQUENCE { + key VisibleString , + location VisibleString , + intervals SEQUENCE OF GBInterval OPTIONAL , + quals SEQUENCE OF GBQualifier OPTIONAL } + + GBInterval ::= SEQUENCE { + from INTEGER OPTIONAL , + to INTEGER OPTIONAL , + point INTEGER OPTIONAL , + accession VisibleString } + + GBQualifier ::= SEQUENCE { + name VisibleString , + value VisibleString OPTIONAL } + + GBSet ::= SEQUENCE OF GBSeq + +END + diff --git a/asn/general.asn b/asn/general.asn index 4fbd99b5..654fff8d 100644 --- a/asn/general.asn +++ b/asn/general.asn @@ -1,4 +1,4 @@ ---$Revision: 6.3 $ +--$Revision: 6.4 $ --********************************************************************** -- -- NCBI General Data elements @@ -87,7 +87,8 @@ Person-id ::= CHOICE { name Name-std , -- structured name ml VisibleString , -- MEDLINE name (semi-structured) -- eg. 
"Jones RM" - str VisibleString } -- unstructured name + str VisibleString, -- unstructured name + consortium VisibleString } -- consortium name Name-std ::= SEQUENCE { -- Structured names last VisibleString , diff --git a/asn/makestat.bat b/asn/makestat.bat index b8f88ffc..4fbe8342 100644 --- a/asn/makestat.bat +++ b/asn/makestat.bat @@ -38,3 +38,5 @@ asntool -m ..\asn\medlars.asn -o asnmdrs.h asntool -m ..\asn\proj.asn -o asnproj.h asntool -m ..\access\entrez2.asn -o asnent2.h asntool -m ..\access\mim.asn -o asnmim.h +asntool -m ..\asn\gbseq.asn -o asngbseq.h +asntool -m ..\asn\tinyseq.asn -o asntseq.h diff --git a/asn/makestat.unx b/asn/makestat.unx index f8fdcd5c..728499d7 100755 --- a/asn/makestat.unx +++ b/asn/makestat.unx @@ -38,3 +38,5 @@ asntool -m ../asn/medlars.asn -o asnmdrs.h asntool -m ../asn/proj.asn -o asnproj.h asntool -m ../access/entrez2.asn -o asnent2.h asntool -m ../access/mim.asn -o asnmim.h +asntool -m ../asn/gbseq.asn -o asngbseq.h +asntool -m ../asn/tinyseq.asn -o asntseq.h diff --git a/asn/seq.asn b/asn/seq.asn index 49c55d0d..0c397967 100644 --- a/asn/seq.asn +++ b/asn/seq.asn @@ -1,4 +1,4 @@ ---$Revision: 6.5 $ +--$Revision: 6.6 $ --********************************************************************** -- -- NCBI Sequence elements @@ -127,13 +127,21 @@ MolInfo ::= SEQUENCE { other (255) } -- use Source.techexp DEFAULT unknown , techexp VisibleString OPTIONAL , -- explanation if tech not enough + -- + -- Completeness is not indicated in most records. For genomes, assume + -- the sequences are incomplete unless specifically marked as complete. + -- For mRNAs, assume the ends are not known exactly unless marked as + -- having the left or right end. 
+ -- completeness INTEGER { unknown (0) , complete (1) , -- complete biological entity partial (2) , -- partial but no details given - no-left (3), -- missing 5' or NH3 end + no-left (3) , -- missing 5' or NH3 end no-right (4) , -- missing 3' or COOH end no-ends (5) , -- missing both ends + has-left (6) , -- 5' or NH3 end present + has-right (7) , -- 3' or COOH end present other (255) } DEFAULT unknown } diff --git a/asn/seqfeat.asn b/asn/seqfeat.asn index 8d796562..7fff56e9 100644 --- a/asn/seqfeat.asn +++ b/asn/seqfeat.asn @@ -1,4 +1,4 @@ ---$Revision: 6.7 $ +--$Revision: 6.9 $ --********************************************************************** -- -- NCBI Sequence Feature elements @@ -278,7 +278,8 @@ Gene-ref ::= SEQUENCE { maploc VisibleString OPTIONAL , -- descriptive map location pseudo BOOLEAN DEFAULT FALSE , -- pseudogene db SET OF Dbtag OPTIONAL , -- ids in other dbases - syn SET OF VisibleString OPTIONAL } -- synonyms for locus + syn SET OF VisibleString OPTIONAL , -- synonyms for locus + locus-tag VisibleString OPTIONAL } -- systematic gene name (e.g., MI0001, ORF0069) END @@ -361,6 +362,9 @@ OrgMod ::= SEQUENCE { anamorph (29) , teleomorph (30) , breed (31) , + gb-acronym (32) , -- used by taxonomy database + gb-anamorph (33) , -- used by taxonomy database + gb-synonym (34) , -- used by taxonomy database old-lineage (253) , old-name (254) , other (255) } , -- ASN5: old-name (254) will be added to next spec diff --git a/asn/tinyseq.asn b/asn/tinyseq.asn new file mode 100644 index 00000000..d16a2949 --- /dev/null +++ b/asn/tinyseq.asn @@ -0,0 +1,34 @@ +--$Revision: 6.1 $ +--********************************************************************** +-- +-- ASN.1 for a tiny Bioseq in XML +-- basically a structured FASTA file with a few extras +-- in this case we drop all modularity of components +-- All ids are Optional - simpler structure, less checking +-- Components of organism are hard coded - can't easily add or change +-- sequence is just string 
whether DNA or protein +-- by James Ostell, 2000 +-- +--********************************************************************** + +NCBI-TSeq DEFINITIONS ::= +BEGIN + +TSeq ::= SEQUENCE { + seqtype ENUMERATED { + nucleotide (1), + protein (2) }, + gi INTEGER OPTIONAL, + accver VisibleString OPTIONAL, + sid VisibleString OPTIONAL, + local VisibleString OPTIONAL, + taxid INTEGER OPTIONAL, + orgname VisibleString OPTIONAL, + defline VisibleString, + length INTEGER, + sequence VisibleString } + +TSeqSet ::= SEQUENCE OF TSeq -- a bunch of them + +END + |